From 6112c14044e65dd82c3835eeef8c07edf3336815 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bruno=20F=C3=A9lix?= <felix19350@gmail.com>
Date: Sun, 14 Jan 2024 18:29:48 +0100
Subject: [PATCH 001/268] Added license header to create_measurements.py (#403)

* Update create_measurements.py

Added license header to the python script to avoid breaking the build.

* Update src/main/python/create_measurements.py

---------

Co-authored-by: Gunnar Morling <gunnar.morling@googlemail.com>
---
 src/main/python/create_measurements.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/src/main/python/create_measurements.py b/src/main/python/create_measurements.py
index f48972aba..4125828e8 100755
--- a/src/main/python/create_measurements.py
+++ b/src/main/python/create_measurements.py
@@ -1,4 +1,19 @@
 #!/usr/bin/env python
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
 
 # Based on https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CreateMeasurements.java
 

From 1fd4712ed35791097fc8481bf23dd8482137c9d8 Mon Sep 17 00:00:00 2001
From: tkosachev <tkosachev@yandex.ru>
Date: Sun, 14 Jan 2024 00:05:50 +0000
Subject: [PATCH 002/268] CalculateAverage_tkosachev Runs 13.5 sec using 8
 cores of i7-1265U laptop with 16 GB RAM.

---
 calculate_average_tkosachev.sh                |  20 ++
 .../onebrc/CalculateAverage_tkosachev.java    | 172 ++++++++++++++++++
 2 files changed, 192 insertions(+)
 create mode 100755 calculate_average_tkosachev.sh
 create mode 100644 src/main/java/dev/morling/onebrc/CalculateAverage_tkosachev.java

diff --git a/calculate_average_tkosachev.sh b/calculate_average_tkosachev.sh
new file mode 100755
index 000000000..6b4ec6023
--- /dev/null
+++ b/calculate_average_tkosachev.sh
@@ -0,0 +1,20 @@
+#!/bin/sh
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+
+JAVA_OPTS="--enable-preview" # presumably needed because the Java class uses preview syntax (STR templates, `_` lambda parameters) - confirm against the JDK in use
+java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_tkosachev # $JAVA_OPTS is unquoted, so several options would split into separate words
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_tkosachev.java b/src/main/java/dev/morling/onebrc/CalculateAverage_tkosachev.java
new file mode 100644
index 000000000..cfacfe1f5
--- /dev/null
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_tkosachev.java
@@ -0,0 +1,172 @@
+/*
+ *  Copyright 2023 The original authors
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package dev.morling.onebrc;
+
+import java.io.IOException;
+import java.io.RandomAccessFile;
+import java.nio.ByteBuffer;
+import java.nio.channels.FileChannel;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.*;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+
+public class CalculateAverage_tkosachev {
+
+    private static final String FILE = "./measurements.txt";
+    public static int numThreads = Math.min(Runtime.getRuntime().availableProcessors(), 8);
+
+    private record ResultRow(int min, double mean, int max) { // printable min/mean/max of one station; values appear to be in tenths, since round() divides by 10
+        public String toString() { // renders the three values as "min/mean/max"
+            return STR."\{round(min)}/\{round(mean)}/\{round(max)}";
+        }
+
+        private double round(double value) { // rounds to the nearest whole number first, then scales down by 10
+            return Math.round(value) / 10.0;
+        }
+    }
+
+    private static class MeasurementAggregator { // running min / max / sum / count of int measurements
+        private int min = Integer.MAX_VALUE; // starts at the largest int so any smaller value replaces it
+        private int max = Integer.MIN_VALUE; // starts at the smallest int so any larger value replaces it
+        private long sum = 0;
+        private long count = 0;
+
+        public void newValue(int m) { // folds one measurement into the aggregate
+            if (m < min) {
+                min = m;
+            }
+            if (m > max) {
+                max = m;
+            }
+            sum += m;
+            count++;
+        }
+
+        public void mergeIn(MeasurementAggregator add) { // folds another aggregate into this one; add itself is only read
+            if (add.min < min) {
+                min = add.min;
+            }
+            if (add.max > max) {
+                max = add.max;
+            }
+            sum += add.sum;
+            count += add.count;
+        }
+    }
+
+    public static void main(String[] args) { // args[0]: optional input path (default FILE); args[1]: optional chunk count (default 100)
+        Path path = Paths.get(args.length == 0 ? FILE : args[0]);
+
+        Map<String, MeasurementAggregator> total;
+        try (RandomAccessFile aFile = new RandomAccessFile(path.toFile(), "r");
+                ExecutorService executorService = Executors.newFixedThreadPool(numThreads)) {
+            FileChannel inChannel = aFile.getChannel();
+            int numChunks = args.length > 1 ? Integer.parseInt(args[1]) : 100;
+
+            if (inChannel.size() < 1024 * 1024 * 1024) { // inputs below 1 GiB are processed as a single chunk
+                numThreads = 1; // NOTE(review): the pool above has already been created, so this does not change its size
+                numChunks = 1;
+            }
+
+            List<Future<Map<String, MeasurementAggregator>>> futures = new ArrayList<>(numThreads); // NOTE(review): numChunks futures are added, not numThreads
+            int bufferSize = (int) (inChannel.size() / numChunks) + 100; // per-chunk byte count: an even share of the file plus 100 bytes
+            for (int i = 0; i < numChunks; i++) {
+                final int finalI = i; // final copy of the loop index for use inside the lambda
+                futures.add(executorService.submit(() -> processBuffer(inChannel, bufferSize, finalI)));
+            }
+            executorService.shutdown();
+            total = new HashMap<>();
+            for (Future<Map<String, MeasurementAggregator>> future : futures) { // waits for each chunk in submission order and merges it
+                mergeIn(total, future.get());
+            }
+        }
+        catch (IOException | InterruptedException | ExecutionException e) {
+            throw new RuntimeException(e); // any failure aborts the run; the original exception is kept as the cause
+        }
+        printResults(total);
+    }
+
+    private static void mergeIn(Map<String, MeasurementAggregator> total, Map<String, MeasurementAggregator> result) { // folds every aggregate of result into total; only total is modified
+        for (String name : result.keySet()) {
+            MeasurementAggregator totalAggregator = total.computeIfAbsent(name, _ -> new MeasurementAggregator()); // creates an empty aggregate the first time a name is seen
+            totalAggregator.mergeIn(result.get(name));
+        }
+    }
+
+    private static Map<String, MeasurementAggregator> processBuffer(FileChannel channel, int bufferSize, int nr) throws IOException {
+        // Aggregates the lines owned by chunk nr, i.e. bytes [nr * bufferSize, (nr + 1) * bufferSize); returns name -> aggregate.
+        HashMap<String, MeasurementAggregator> aggregatorMap = new HashMap<>();
+        long start = ((long) nr) * bufferSize;
+        long length = Math.min(bufferSize, channel.size() - start);
+        if (length <= 0) {
+            return aggregatorMap; // the chunk begins at or past the end of the file: nothing to map
+        }
+        long mapped = Math.min(length + 1024, channel.size() - start); // chunk plus overlap, so a line straddling the chunk end is finished here
+        ByteBuffer byteBuffer = channel.map(FileChannel.MapMode.READ_ONLY, start, mapped);
+        byte[] buf = new byte[1024]; // bytes of the current line, including ';' and '\n'
+        int count = 0; // number of bytes of the current line stored in buf
+        int smcIndex = -1; // index of the ';' separator within buf
+        boolean skipping = nr > 0; // the partial first line is owned (and finished) by the previous chunk
+        for (int i = 1, lineStart = 0; i <= mapped && lineStart <= length; i++) { // i: offset just past b; only lines starting at offset <= length are owned
+            byte b = byteBuffer.get();
+            if (skipping) {
+                skipping = b != '\n';
+                lineStart = i;
+                continue;
+            }
+            buf[count++] = b;
+            if (b == ';') {
+                smcIndex = count - 1;
+            }
+            else if (b == '\n') {
+                String name = new String(buf, 0, smcIndex);
+                int value = fastParse(buf, smcIndex + 1, count - smcIndex - 2);
+                aggregatorMap.computeIfAbsent(name, _ -> new MeasurementAggregator()).newValue(value);
+                count = 0;
+                lineStart = i;
+            }
+        }
+        return aggregatorMap;
+    }
+
+    private static void printResults(Map<String, MeasurementAggregator> result) { // prints all stations in key order on one line of standard output
+        Map<String, ResultRow> measurements = new TreeMap<>(); // TreeMap keeps the keys in their natural String order
+        for (Map.Entry<String, MeasurementAggregator> entry : result.entrySet()) {
+            MeasurementAggregator value = entry.getValue();
+            measurements.put(entry.getKey(), new ResultRow(value.min, ((double) value.sum / value.count), value.max)); // mean computed in floating point
+        }
+        System.out.println(measurements);
+    }
+
+    public static int fastParse(byte[] buf, int start, int len) { // parses buf[start, start + len) as a signed integer, using only '-' and the digits
+        int i = 0; // accumulated magnitude
+        int sign = 1;
+        for (int index = start; index < start + len; index++) {
+            byte b = buf[index];
+            if (b == '-') { // a '-' anywhere in the range makes the result negative
+                sign = -1;
+            }
+            if (b >= '0' && b <= '9') { // every other byte (e.g. '.') is skipped, so "12.3" yields 123
+                i = i * 10 + (b - '0');
+            }
+        }
+        return i * sign;
+    }
+}

From 3c36b5b0a862d35b2d1879d329e3c13863092b6b Mon Sep 17 00:00:00 2001
From: Anita SV <anitasvasu@gmail.com>
Date: Sun, 14 Jan 2024 09:41:04 -0800
Subject: [PATCH 003/268] A SAFE and readable version (#388)

* A SAFE and readable version

* Remove unused functions

* Making it slower, removing custom hashMap
---
 calculate_average_anitasv.sh                  |  19 ++
 prepare_anitasv.sh                            |  19 ++
 .../onebrc/CalculateAverage_anitasv.java      | 215 ++++++++++++++++++
 3 files changed, 253 insertions(+)
 create mode 100755 calculate_average_anitasv.sh
 create mode 100755 prepare_anitasv.sh
 create mode 100644 src/main/java/dev/morling/onebrc/CalculateAverage_anitasv.java

diff --git a/calculate_average_anitasv.sh b/calculate_average_anitasv.sh
new file mode 100755
index 000000000..01d0d745b
--- /dev/null
+++ b/calculate_average_anitasv.sh
@@ -0,0 +1,19 @@
+#!/bin/sh
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+JAVA_OPTS="--enable-preview" # presumably needed because the Java class uses preview features (STR templates, java.lang.foreign) - confirm against the JDK in use
+java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_anitasv # $JAVA_OPTS is unquoted, so several options would split into separate words
diff --git a/prepare_anitasv.sh b/prepare_anitasv.sh
new file mode 100755
index 000000000..f83a3ff69
--- /dev/null
+++ b/prepare_anitasv.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+source "$HOME/.sdkman/bin/sdkman-init.sh" # presumably defines the `sdk` command used on the next line
+sdk use java 21.0.1-graal 1>&2 # selects the 21.0.1-graal Java candidate; 1>&2 redirects sdk's standard output to standard error
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_anitasv.java b/src/main/java/dev/morling/onebrc/CalculateAverage_anitasv.java
new file mode 100644
index 000000000..c15250d99
--- /dev/null
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_anitasv.java
@@ -0,0 +1,215 @@
+/*
+ *  Copyright 2023 The original authors
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package dev.morling.onebrc;
+
+import java.io.IOException;
+import java.lang.foreign.Arena;
+import java.lang.foreign.MemorySegment;
+import java.lang.foreign.ValueLayout;
+import java.nio.ByteBuffer;
+import java.nio.channels.FileChannel;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Path;
+import java.nio.file.StandardOpenOption;
+import java.util.*;
+import java.util.stream.Collectors;
+import java.util.stream.IntStream;
+
+public class CalculateAverage_anitasv {
+    private static final String FILE = "./measurements.txt";
+
+    private record Shard(MemorySegment mmapMemory, // the mapped memory; all offsets below are relative to its start
+                         long chunkStart, long chunkEnd) { // byte range [chunkStart, chunkEnd) scanned for this shard
+
+        byte getByte(long address) { // reads the single byte at offset address
+            return mmapMemory.get(ValueLayout.JAVA_BYTE, address);
+        }
+
+        long indexOf(long position, byte ch) { // offset of the first ch at or after position, looking at no more than 128 bytes; -1 if not found
+            ByteBuffer buf = mmapMemory.asSlice(position,
+                            Math.min(128, mmapMemory.byteSize() - position)) // window is capped at the end of the mapping
+                    .asByteBuffer();
+            while (buf.hasRemaining()) {
+                if (buf.get() == ch) {
+                    return position + buf.position() - 1; // get() already advanced past the match, hence - 1
+                }
+            }
+            return -1;
+        }
+
+        byte[] getRange(long start, long end) { // copies bytes [start, end) into a new array
+            return mmapMemory.asSlice(start, end - start).toArray(ValueLayout.JAVA_BYTE);
+        }
+
+        int parseDouble(long start, long end) { // parses the decimal text in [start, end) into an int scaled by 10, e.g. "-12.3" -> -123
+            int normalized = 0;
+            boolean sign = true; // true means non-negative
+            long index = start;
+            if (getByte(index) == '-') {
+                index++;
+                sign = false;
+            }
+            boolean hasDot = false;
+            for (; index < end; index++) {
+                byte ch = getByte(index);
+                if (ch != '.') { // every byte other than '.' is treated as a digit, without validation
+                    normalized = normalized * 10 + (ch - '0');
+                } else {
+                    hasDot = true;
+                }
+            }
+            if (!hasDot) { // whole numbers are scaled so that "12" also becomes 120
+                normalized *= 10;
+            }
+            if (!sign) {
+                normalized = -normalized;
+            }
+            return normalized;
+        }
+
+        public int computeHash(long position, long stationEnd) { // hash of bytes [position, stationEnd), computed by ByteBuffer.hashCode()
+            ByteBuffer buf2 = mmapMemory.asSlice(position, stationEnd - position)
+                    .asByteBuffer();
+            return buf2.hashCode();
+        }
+
+        public boolean matches(byte[] existingStation, long start, long end) { // true when bytes [start, end) equal existingStation
+            ByteBuffer buf1 = ByteBuffer.wrap(existingStation);
+            ByteBuffer buf2 = mmapMemory.asSlice(start, end - start).asByteBuffer();
+            return buf1.equals(buf2); // ByteBuffer.equals compares the remaining elements of both buffers
+        }
+    }
+
+    private record ResultRow(byte[] station, IntSummaryStatistics statistics) { // raw station-name bytes together with the statistics collected for them
+
+        public String toString() { // "<name decoded as UTF-8> : <min/mean/max as formatted by statToString>"
+            return STR."\{new String(station, StandardCharsets.UTF_8)} : \{statToString(statistics)}";
+        }
+    }
+
+    private static Map<String, IntSummaryStatistics> process(Shard shard) {
+        // Aggregates every line that starts in [chunkStart, chunkEnd). Buckets are keyed by the hash of the name bytes
+        // and hold all stations sharing that hash; names are decoded to String only once, when the result is built.
+        HashMap<Integer, List<ResultRow>> result = new HashMap<>(1 << 14);
+        boolean skip = shard.chunkStart != 0;
+        for (long position = shard.chunkStart; position < shard.chunkEnd; position++) {
+            if (skip) {
+                // A shard that does not start at offset 0 first jumps to the next '\n'; the loop increment then lands on a line start.
+                position = shard.indexOf(position, (byte) '\n');
+                skip = false;
+                if (position < 0) {
+                    break; // no '\n' found: without this guard the -1 would restart the scan at offset 0
+                }
+            }
+            else {
+                long stationEnd = shard.indexOf(position, (byte) ';');
+                int hash = shard.computeHash(position, stationEnd);
+                long temperatureEnd = shard.indexOf(stationEnd + 1, (byte) '\n');
+                if (temperatureEnd < 0) {
+                    temperatureEnd = shard.mmapMemory.byteSize(); // final line without trailing '\n': the value runs to the end of the mapping
+                }
+                int temperature = shard.parseDouble(stationEnd + 1, temperatureEnd);
+
+                List<ResultRow> collisions = result.computeIfAbsent(hash, ignore -> new ArrayList<>());
+                boolean found = false;
+                for (ResultRow existing : collisions) {
+                    if (shard.matches(existing.station(), position, stationEnd)) {
+                        existing.statistics.accept(temperature);
+                        found = true;
+                        break;
+                    }
+                }
+                if (!found) {
+                    IntSummaryStatistics stats = new IntSummaryStatistics();
+                    stats.accept(temperature);
+                    collisions.add(new ResultRow(shard.getRange(position, stationEnd), stats));
+                }
+                position = temperatureEnd;
+            }
+        }
+
+        return result.values()
+                .stream()
+                .flatMap(Collection::stream)
+                .map(rr -> new AbstractMap.SimpleImmutableEntry<>(
+                        new String(rr.station, StandardCharsets.UTF_8),
+                        rr.statistics))
+                .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
+    }
+
+    private static Map<String, IntSummaryStatistics> combineResults(List<Map<String, IntSummaryStatistics>> list) { // merges the per-shard maps into one map keyed by station name
+
+        Map<String, IntSummaryStatistics> output = HashMap.newHashMap(1024);
+        for (Map<String, IntSummaryStatistics> map : list) {
+            for (Map.Entry<String, IntSummaryStatistics> entry : map.entrySet()) {
+                output.compute(entry.getKey(), (ignore, val) -> {
+                    if (val == null) {
+                        return entry.getValue(); // first occurrence: the shard's own statistics object is stored as-is
+                    }
+                    else {
+                        val.combine(entry.getValue()); // later occurrences are folded into that stored object, which is therefore mutated
+                        return val;
+                    }
+                });
+            }
+        }
+
+        return output;
+    }
+
+    private static Map<String, IntSummaryStatistics> master(MemorySegment mmapMemory) { // builds one shard per available processor, processes them on a parallel stream and merges the results
+        long totalBytes = mmapMemory.byteSize();
+        int numWorkers = Runtime.getRuntime().availableProcessors();
+        long chunkSize = Math.ceilDiv(totalBytes, numWorkers); // rounded up so numWorkers chunks cover every byte
+        return combineResults(IntStream.range(0, numWorkers)
+                .parallel()
+                .mapToObj(workerId -> {
+                    long chunkStart = workerId * chunkSize;
+                    long chunkEnd = Math.min(chunkStart + chunkSize + 1, totalBytes); // one byte past the next shard's start, capped at the end of the mapping
+                    return new Shard(mmapMemory, chunkStart, chunkEnd);
+                })
+                .map(CalculateAverage_anitasv::process)
+                .toList());
+    }
+
+    public static Map<String, IntSummaryStatistics> start() throws IOException { // maps FILE read-only in full and returns the statistics per station name
+        try (FileChannel fileChannel = FileChannel.open(Path.of(FILE),
+                StandardOpenOption.READ)) {
+            long fileSize = fileChannel.size();
+            MemorySegment mmapMemory = fileChannel.map(
+                    FileChannel.MapMode.READ_ONLY,
+                    0, fileSize, Arena.global()); // mapping is tied to the global arena rather than to this try block
+            return master(mmapMemory);
+        }
+    }
+
+    private static Map<String, String> toPrintMap(Map<String, IntSummaryStatistics> output) { // converts the statistics into a key-sorted map of printable "min/mean/max" strings
+        Map<String, String> outputStr = new TreeMap<>(); // TreeMap keeps the station names in their natural String order
+        for (Map.Entry<String, IntSummaryStatistics> entry : output.entrySet()) {
+            IntSummaryStatistics stat = entry.getValue();
+            outputStr.put(entry.getKey(), statToString(stat));
+        }
+        return outputStr;
+    }
+
+    private static String statToString(IntSummaryStatistics stat) { // formats "min/mean/max"; each value is divided by 10, the mean being rounded to a whole number first
+        return STR."\{stat.getMin() / 10.0}/\{Math.round(stat.getAverage()) / 10.0}/\{stat.getMax() / 10.0}";
+    }
+
+    public static void main(String[] args) throws IOException { // args is not used; the input is always FILE
+        System.out.println(toPrintMap(start())); // prints the sorted map using Map.toString()
+    }
+}

From 0ca7c485aa5e9192cc3fa957d0c7c17bc94d2c76 Mon Sep 17 00:00:00 2001
From: Dmitry Bufistov <dmitry.bufistov@midokura.com>
Date: Thu, 4 Jan 2024 22:07:28 +0100
Subject: [PATCH 004/268] Dmitry challenge

---
 calculate_average_dmitry-midokura.sh          |  20 +
 .../onebrc/CalculateAverage_bufistov.java     | 398 ++++++++++++++++++
 2 files changed, 418 insertions(+)
 create mode 100755 calculate_average_dmitry-midokura.sh
 create mode 100644 src/main/java/dev/morling/onebrc/CalculateAverage_bufistov.java

diff --git a/calculate_average_dmitry-midokura.sh b/calculate_average_dmitry-midokura.sh
new file mode 100755
index 000000000..e4d1366db
--- /dev/null
+++ b/calculate_average_dmitry-midokura.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+
+#JAVA_OPTS="-verbose:gc" # optional GC logging; while this stays commented out, $JAVA_OPTS below comes from the caller's environment
+java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_bufistov "$@"
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_bufistov.java b/src/main/java/dev/morling/onebrc/CalculateAverage_bufistov.java
new file mode 100644
index 000000000..db6040385
--- /dev/null
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_bufistov.java
@@ -0,0 +1,398 @@
+/*
+ *  Copyright 2023 The original authors
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package dev.morling.onebrc;
+
+import static java.lang.Math.toIntExact;
+
+import java.nio.MappedByteBuffer;
+import java.nio.channels.FileChannel;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.concurrent.Callable;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.util.concurrent.Future;
+
+class ResultRow { // mutable min / max / count / sum accumulator for one station
+    byte[] station; // raw bytes of the station name
+
+    String stationString; // decoded name; only assigned inside toString()
+    long min, max, count, suma; // suma is the running sum; toString() divides the values by 10
+
+    ResultRow() { // leaves every field at its default value
+    }
+
+    ResultRow(byte[] station, long value) { // starts a row from a private copy of the name and a first value
+        this.station = new byte[station.length];
+        System.arraycopy(station, 0, this.station, 0, station.length);
+        this.min = value;
+        this.max = value;
+        this.count = 1;
+        this.suma = value;
+    }
+
+    ResultRow(long value) { // starts a row from a first value; station stays null until setStation is called
+        this.min = value;
+        this.max = value;
+        this.count = 1;
+        this.suma = value;
+    }
+
+    void setStation(MappedByteBuffer byteBuffer, int startPosition, int endPosition) { // copies bytes [startPosition, endPosition) of the buffer as the name
+        this.station = new byte[endPosition - startPosition];
+        byteBuffer.slice(startPosition, station.length).get(this.station, 0, station.length);
+    }
+
+    public String toString() { // "<name>=<min>/<mean>/<max>"; also stores the decoded name in stationString
+        stationString = new String(station, StandardCharsets.UTF_8);
+        return stationString + "=" + round(min / 10.0) + "/" + round(suma / 10.0 / count) + "/" + round(max / 10.0);
+    }
+
+    private double round(double value) { // rounds to one decimal place
+        return Math.round(value * 10.0) / 10.0;
+    }
+
+    ResultRow update(long newValue) { // folds one value into this row and returns this
+        this.count += 1;
+        this.suma += newValue;
+        if (newValue < this.min) {
+            this.min = newValue;
+        }
+        else if (newValue > this.max) { // else-if: relies on min <= max, which the value constructors establish
+            this.max = newValue;
+        }
+        return this;
+    }
+
+    ResultRow merge(ResultRow another) { // folds another row into this one and returns this; another is only read
+        this.count += another.count;
+        this.suma += another.suma;
+        this.min = Math.min(this.min, another.min);
+        this.max = Math.max(this.max, another.max);
+        return this;
+    }
+}
+
+class ByteArrayWrapper { // gives a byte[] content-based equals/hashCode so it can serve as a hash-map key
+    private final byte[] data; // wrapped array; stored by reference, not copied
+
+    public ByteArrayWrapper(byte[] data) {
+        this.data = data;
+    }
+
+    @Override
+    public boolean equals(Object other) { // false, rather than ClassCastException / NullPointerException, for foreign types and null
+        return other instanceof ByteArrayWrapper && Arrays.equals(data, ((ByteArrayWrapper) other).data);
+    }
+
+    @Override
+    public int hashCode() { // content hash, consistent with equals
+        return Arrays.hashCode(data);
+    }
+}
+
+class OpenHash { // array-backed hash table of ResultRow keyed by station bytes; collisions are resolved by probing the next slot
+    ResultRow[] data; // slots; null marks an empty slot
+    int dataSizeMask; // table size - 1; the size is a power of two, so this works as an index mask
+
+    // ResultRow metrics = new ResultRow();
+
+    public OpenHash(int capacityPow2) { // creates a table with 2^capacityPow2 slots
+        assert capacityPow2 <= 20; // only checked when assertions are enabled (-ea)
+        int dataSize = 1 << capacityPow2;
+        dataSizeMask = dataSize - 1;
+        data = new ResultRow[dataSize];
+    }
+
+    int hashByteArray(byte[] array) { // sums the bytes, each shifted left by (index mod 4), and masks the sum to the table size
+        int result = 0;
+        long mask = 0; // shift amount, cycling 0, 1, 2, 3
+        for (int i = 0; i < array.length; ++i, mask = ((mask + 1) & 3)) {
+            result += array[i] << mask;
+        }
+        return result & dataSizeMask;
+    }
+
+    void merge(byte[] station, long value, int hashValue) { // adds value to the row for station, probing from slot hashValue
+        while (data[hashValue] != null && !Arrays.equals(station, data[hashValue].station)) { // NOTE(review): never terminates if every slot is taken by other stations
+            hashValue += 1;
+            hashValue &= dataSizeMask; // wrap around at the end of the table
+        }
+        if (data[hashValue] == null) {
+            data[hashValue] = new ResultRow(station, value);
+        }
+        else {
+            data[hashValue].update(value);
+        }
+        // metrics.update(delta);
+    }
+
+    void merge(byte[] station, long value) { // same as above, with the hash computed by hashByteArray
+        merge(station, value, hashByteArray(station));
+    }
+
+    void merge(MappedByteBuffer byteBuffer, final int startPosition, final int endPosition, int hashValue, final long value) { // variant whose name is bytes [startPosition, endPosition) of byteBuffer; bytes are copied only for a new row
+        while (data[hashValue] != null && !equalsToStation(byteBuffer, startPosition, endPosition, data[hashValue].station)) {
+            hashValue += 1;
+            hashValue &= dataSizeMask; // wrap around at the end of the table
+        }
+        if (data[hashValue] == null) {
+            data[hashValue] = new ResultRow(value);
+            data[hashValue].setStation(byteBuffer, startPosition, endPosition);
+        }
+        else {
+            data[hashValue].update(value);
+        }
+    }
+
+    boolean equalsToStation(MappedByteBuffer byteBuffer, int startPosition, int endPosition, byte[] station) { // true when buffer bytes [startPosition, endPosition) equal station
+        if (endPosition - startPosition != station.length) { // different lengths can never match
+            return false;
+        }
+        for (int i = 0; i < station.length; ++i, ++startPosition) {
+            if (byteBuffer.get(startPosition) != station[i])
+                return false;
+        }
+        return true;
+    }
+
+    HashMap<ByteArrayWrapper, ResultRow> toJavaHashMap() { // copies every occupied slot into a HashMap keyed by the wrapped name bytes
+        HashMap<ByteArrayWrapper, ResultRow> result = new HashMap<>(20000);
+        for (int i = 0; i < data.length; ++i) {
+            if (data[i] != null) {
+                var key = new ByteArrayWrapper(data[i].station); // the key wraps the row's own array, no copy is made
+                result.put(key, data[i]);
+            }
+        }
+        return result;
+    }
+}
+
+public class CalculateAverage_bufistov {
+
+    static final long LINE_SEPARATOR = '\n';
+
+    public static class FileRead implements Callable<HashMap<ByteArrayWrapper, ResultRow>> {
+
+        private final FileChannel fileChannel;
+        private long currentLocation;
+        private int bytesToRead;
+
+        private final int hashCapacityPow2 = 18;
+        private final int hashCapacityMask = (1 << hashCapacityPow2) - 1;
+
+        public FileRead(long startLocation, int bytesToRead, FileChannel fileChannel) { // task covering bytesToRead bytes of fileChannel starting at startLocation
+            this.currentLocation = startLocation; // advanced as the file is consumed
+            this.bytesToRead = bytesToRead; // decremented as the file is consumed
+            this.fileChannel = fileChannel;
+        }
+
+        @Override
+        public HashMap<ByteArrayWrapper, ResultRow> call() throws IOException { // processes this task's byte range window by window and returns the per-station rows
+            try {
+                OpenHash openHash = new OpenHash(hashCapacityPow2);
+                log("Reading the channel: " + currentLocation + ":" + bytesToRead);
+                byte[] suffix = new byte[128]; // receives the tail of a line that is cut off at the end of a window
+                if (currentLocation > 0) { // not at the start of the file: move to the beginning of the next line first
+                    toLineBegin(suffix);
+                }
+                while (bytesToRead > 0) {
+                    int bufferSize = Math.min(1 << 24, bytesToRead); // map at most 16 MiB per iteration
+                    MappedByteBuffer byteBuffer = fileChannel.map(FileChannel.MapMode.READ_ONLY, currentLocation, bufferSize);
+                    bytesToRead -= bufferSize;
+                    currentLocation += bufferSize;
+                    int suffixBytes = 0;
+                    if (currentLocation < fileChannel.size()) { // more data follows: fetch the rest of the line cut at the window end
+                        suffixBytes = toLineBegin(suffix);
+                    }
+                    processChunk(byteBuffer, bufferSize, suffix, suffixBytes, openHash);
+                }
+                log("Done Reading the channel: " + currentLocation + ":" + bytesToRead);
+                return openHash.toJavaHashMap();
+            }
+            catch (Exception e) {
+                e.printStackTrace(); // the stack trace is printed here and the exception is rethrown unchanged
+                throw e;
+            }
+        }
+
+        byte getByte(long position) throws IOException { // returns the byte at file offset position
+            MappedByteBuffer byteBuffer = fileChannel.map(FileChannel.MapMode.READ_ONLY, position, 1); // a new one-byte mapping is created on every call
+            return byteBuffer.get();
+        }
+
+        int toLineBegin(byte[] suffix) throws IOException { // moves currentLocation to the start of the next line; returns how many bytes were copied into suffix
+            int bytesConsumed = 0;
+            if (getByte(currentLocation - 1) != LINE_SEPARATOR) { // previous byte is not a separator, so currentLocation is inside a line
+                while (getByte(currentLocation) != LINE_SEPARATOR) { // Small bug here if last chunk is less than a line and has no '\n' at the end. Valid input should have '\n' for all rows.
+                    suffix[bytesConsumed++] = getByte(currentLocation); // keep the remainder of the current line, without the separator
+                    ++currentLocation;
+                    --bytesToRead;
+                }
+                ++currentLocation; // step over the separator itself
+                --bytesToRead;
+            }
+            return bytesConsumed;
+        }
+
+        void processChunk(MappedByteBuffer byteBuffer, int bufferSize, byte[] suffix, int suffixBytes, OpenHash result) { // merges every line of the window into result; an unterminated last line is completed with suffix
+            int nameBegin = 0; // offset where the current line starts
+            int nameEnd = -1; // offset of the ';' that ends the station name
+            int numberBegin = -1; // offset of the first byte after the ';'
+            int currentHash = 0; // running hash over the bytes of the current line
+            int currentMask = 0; // shift amount, cycling 0, 1, 2, 3
+            int nameHash = 0; // hash of the name, captured when the ';' is reached
+            for (int currentPosition = 0; currentPosition < bufferSize; ++currentPosition) {
+                byte nextByte = byteBuffer.get(currentPosition);
+                if (nextByte == ';') {
+                    nameEnd = currentPosition;
+                    numberBegin = currentPosition + 1;
+                    nameHash = currentHash & hashCapacityMask; // snapshot before the value's bytes are added to currentHash
+                }
+                else if (nextByte == LINE_SEPARATOR) {
+                    long value = getValue(byteBuffer, numberBegin, currentPosition);
+                    // log("Station name: '" + getStationName(byteBuffer, nameBegin, nameEnd) + "' value: " + value + " hash: " + nameHash);
+                    result.merge(byteBuffer, nameBegin, nameEnd, nameHash, value);
+                    nameBegin = currentPosition + 1;
+                    currentHash = 0; // reset the hash state for the next line
+                    currentMask = 0;
+                }
+                else {
+                    currentHash += (nextByte << currentMask); // same shift-and-add scheme as OpenHash.hashByteArray
+                    currentMask = (currentMask + 1) & 3;
+                }
+            }
+            if (nameBegin < bufferSize) { // the window ended inside a line: join its head with the bytes held in suffix
+                byte[] lastLine = new byte[bufferSize - nameBegin + suffixBytes];
+                byte[] prefix = new byte[bufferSize - nameBegin];
+                byteBuffer.slice(nameBegin, prefix.length).get(prefix, 0, prefix.length);
+                System.arraycopy(prefix, 0, lastLine, 0, prefix.length);
+                System.arraycopy(suffix, 0, lastLine, prefix.length, suffixBytes);
+                processLastLine(lastLine, result);
+            }
+        }
+
+        void processLastLine(byte[] lastLine, OpenHash result) {
+            // Handles a record held in a plain array: the bytes before the first ';' are the station
+            // name, everything after it is the reading.
+            byte[] stationName = null;
+            int valueStart = -1;
+            for (int idx = 0; stationName == null && idx < lastLine.length; ++idx) {
+                if (lastLine[idx] == ';') {
+                    stationName = Arrays.copyOf(lastLine, idx);
+                    valueStart = idx + 1;
+                }
+            }
+            // valueStart stays -1 when no ';' exists, in which case getValue fails on the invalid index
+            long value = getValue(lastLine, valueStart);
+            result.merge(stationName, value);
+        }
+
+        long getValue(MappedByteBuffer byteBuffer, int startLocation, int endLocation) {
+            byte nextByte = byteBuffer.get(startLocation);
+            boolean negate = nextByte == '-';
+            long result = negate ? 0 : nextByte - '0';
+            for (int i = startLocation + 1; i < endLocation; ++i) {
+                nextByte = byteBuffer.get(i);
+                if (nextByte != '.') {
+                    result *= 10;
+                    result += nextByte - '0';
+                }
+            }
+            return negate ? -result : result;
+        }
+
+        long getValue(byte[] lastLine, int startLocation) {
+            byte nextByte = lastLine[startLocation];
+            boolean negate = nextByte == '-';
+            long result = negate ? 0 : nextByte - '0';
+            for (int i = startLocation + 1; i < lastLine.length; ++i) {
+                nextByte = lastLine[i];
+                if (nextByte != '.') {
+                    result *= 10;
+                    result += nextByte - '0';
+                }
+            }
+            return negate ? -result : result;
+        }
+
+        String getStationName(MappedByteBuffer byteBuffer, int from, int to) { // decodes the bytes in [from, to) as UTF-8
+            byte[] nameBytes = new byte[to - from];
+            byteBuffer.get(from, nameBytes); // absolute bulk read; the buffer's own position is left untouched
+            return new String(nameBytes, StandardCharsets.UTF_8);
+        }
+    }
+
+    /** Splits the input file into chunks, aggregates them on a fixed thread pool and prints the merged, sorted result. */
+    public static void main(String[] args) throws Exception {
+        String fileName = "measurements.txt";
+        if (args.length > 0 && args[0].length() > 0) {
+            fileName = args[0];
+        }
+        log("InputFile: " + fileName);
+        int numThreads = 32;
+        if (args.length > 1) {
+            numThreads = Integer.parseInt(args[1]);
+        }
+        log("NumThreads: " + numThreads);
+        ArrayList<Future<HashMap<ByteArrayWrapper, ResultRow>>> results = new ArrayList<>(numThreads);
+        // try-with-resources closes the stream (and with it the channel) even when sizing or scheduling fails
+        try (FileInputStream fileInputStream = new FileInputStream(fileName)) {
+            FileChannel channel = fileInputStream.getChannel();
+            final long fileSize = channel.size();
+            long remaining_size = fileSize;
+            long chunk_size = Math.min((fileSize + numThreads - 1) / numThreads, Integer.MAX_VALUE - 5);
+
+            ExecutorService executor = Executors.newFixedThreadPool(numThreads);
+            long startLocation = 0;
+            while (remaining_size > 0) {
+                long actualSize = Math.min(chunk_size, remaining_size);
+                results.add(executor.submit(new FileRead(startLocation, toIntExact(actualSize), channel)));
+                remaining_size -= actualSize;
+                startLocation += actualSize;
+            }
+            executor.shutdown();
+            // Block until every worker is done; a Thread.yield() spin loop would keep one core busy for the whole run.
+            executor.awaitTermination(Long.MAX_VALUE, java.util.concurrent.TimeUnit.NANOSECONDS);
+            log("Finished all threads");
+        }
+
+        HashMap<ByteArrayWrapper, ResultRow> result = new HashMap<>(20000);
+        for (var future : results) {
+            for (var entry : future.get().entrySet()) {
+                result.merge(entry.getKey(), entry.getValue(), ResultRow::merge);
+            }
+        }
+        ResultRow[] finalResult = result.values().toArray(new ResultRow[0]);
+        // NOTE(review): the return value is discarded; presumably toString() fills stationString, which the sort below reads - confirm.
+        for (var row : finalResult) {
+            row.toString();
+        }
+        Arrays.sort(finalResult, Comparator.comparing(a -> a.stationString));
+        System.out.println("{" + String.join(", ", Arrays.stream(finalResult).map(ResultRow::toString).toList()) + "}");
+        log("All done!");
+    }
+
+    static void log(String message) {
+        // Intentionally a no-op; restore this statement to trace progress on stderr: System.err.println(Instant.now() + "[" + Thread.currentThread().getName() + "]: " + message);
+    }
+}

From 32bb237091a5b97e6b8d6d6c1dc12a6045867ec3 Mon Sep 17 00:00:00 2001
From: Jin Cong Ho <hello@jincongho.com>
Date: Sun, 14 Jan 2024 17:50:24 +0000
Subject: [PATCH 005/268] Initial Submission (#389)

Co-authored-by: Gunnar Morling <gunnar.morling@googlemail.com>
---
 calculate_average_jincongho.sh                |  20 ++
 github_users.txt                              |   1 +
 .../onebrc/CalculateAverage_jincongho.java    | 285 ++++++++++++++++++
 3 files changed, 306 insertions(+)
 create mode 100755 calculate_average_jincongho.sh
 create mode 100644 src/main/java/dev/morling/onebrc/CalculateAverage_jincongho.java

diff --git a/calculate_average_jincongho.sh b/calculate_average_jincongho.sh
new file mode 100755
index 000000000..ec1ca426b
--- /dev/null
+++ b/calculate_average_jincongho.sh
@@ -0,0 +1,20 @@
+#!/bin/sh
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+JAVA_OPTS="--enable-preview --enable-native-access=ALL-UNNAMED" # preview features and native access, presumably for the java.lang.foreign API the class imports
+JAVA_OPTS="$JAVA_OPTS -XX:-TieredCompilation -XX:InlineSmallCode=10000 -XX:FreqInlineSize=10000" # JIT tuning: tiered compilation off, larger inlining limits
+java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_jincongho
\ No newline at end of file
diff --git a/github_users.txt b/github_users.txt
index ef5ef51b7..54c443995 100644
--- a/github_users.txt
+++ b/github_users.txt
@@ -50,3 +50,4 @@ yehwankim23;김예환 Ye-Hwan Kim (Sam)
 hundredwatt;Jason Nochlin
 gnmathur;Gaurav Mathur
 vemana;Subrahmanyam
+jincongho;Jin Cong Ho
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_jincongho.java b/src/main/java/dev/morling/onebrc/CalculateAverage_jincongho.java
new file mode 100644
index 000000000..01220ffbc
--- /dev/null
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_jincongho.java
@@ -0,0 +1,285 @@
+/*
+ *  Copyright 2023 The original authors
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package dev.morling.onebrc;
+
+import sun.misc.Unsafe;
+
+import java.io.IOException;
+import java.lang.foreign.Arena;
+import java.lang.foreign.MemorySegment;
+import java.lang.reflect.Field;
+import java.nio.channels.FileChannel;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.StandardOpenOption;
+import java.util.*;
+import java.util.concurrent.ConcurrentHashMap;
+
+/**
+ * Changelog (based on Macbook Pro Intel i7 6-cores 2.6GHz):
+ *
+ * Initial                          40000 ms
+ * Parse key as byte vs string      30000 ms
+ * Parse temp as fixed vs double    15000 ms
+ * HashMap optimization             10000 ms
+ *
+ */
+public class CalculateAverage_jincongho {
+
+    private static final String FILE = "./measurements.txt";
+
+    private static final Unsafe UNSAFE = initUnsafe();
+
+    private static Unsafe initUnsafe() {
+        try {
+            Field theUnsafe = Unsafe.class.getDeclaredField("theUnsafe");
+            theUnsafe.setAccessible(true);
+            return (Unsafe) theUnsafe.get(Unsafe.class);
+        }
+        catch (NoSuchFieldException | IllegalAccessException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    /**
+     * Measurement Hash Table (for each partition)
+     * Uses contiguous byte arrays to optimize for cache-line (hopefully)
+     *
+     * Each entry (open addressing, linear probing):
+     * - KEYS: keyLength (2 bytes) + key (up to 100 bytes), padded to a 128-byte slot
+     * - VALUES: min (2 bytes) + max (2 bytes) + count (4 bytes) + sum (8 bytes)
+     */
+    protected static class PartitionAggr {
+
+        private static final int MAP_SIZE = 1 << 14; // 2^14 = 16384, smallest power of two above 10000
+        private static final int KEY_SIZE = 128; // key length (2 bytes) + key (100 bytes), padded to a power of two
+        private static final int KEY_MASK = (MAP_SIZE - 1);
+        private static final int VALUE_SIZE = 16; // min (2 bytes) + max (2 bytes) + count (4 bytes) + sum (8 bytes)
+
+        private final byte[] KEYS = new byte[MAP_SIZE * KEY_SIZE];
+        private final byte[] VALUES = new byte[MAP_SIZE * VALUE_SIZE];
+
+        public PartitionAggr() {
+            // init min and max
+            for (int offset = Unsafe.ARRAY_BYTE_BASE_OFFSET; offset < Unsafe.ARRAY_BYTE_BASE_OFFSET + (MAP_SIZE * VALUE_SIZE); offset += VALUE_SIZE) {
+                UNSAFE.putShort(VALUES, offset, Short.MAX_VALUE);
+                UNSAFE.putShort(VALUES, offset + 2, Short.MIN_VALUE);
+            }
+        }
+
+        /** Folds one reading into the slot of station key[0..keyLength), claiming a free slot the first time the key is seen. */
+        public void update(byte[] key, int hash, short keyLength, short value) {
+            int index = hash & KEY_MASK;
+            int keyOffset = Unsafe.ARRAY_BYTE_BASE_OFFSET + (index * KEY_SIZE);
+            while (((UNSAFE.getShort(KEYS, keyOffset) != keyLength) ||
+                    !equals(KEYS, ((index * KEY_SIZE) + 2), key, 0, keyLength))) {
+                if (UNSAFE.getShort(KEYS, keyOffset) == 0) {
+                    // put key
+                    UNSAFE.putShort(KEYS, keyOffset, keyLength);
+                    UNSAFE.copyMemory(key, Unsafe.ARRAY_BYTE_BASE_OFFSET, KEYS, keyOffset + 2, keyLength);
+                    break;
+                }
+                // slot taken by another station: probe the next one, wrapping the byte offset together with the index
+                index = (index + 1) & KEY_MASK;
+                keyOffset = Unsafe.ARRAY_BYTE_BASE_OFFSET + (index * KEY_SIZE);
+            }
+
+            long valueOffset = Unsafe.ARRAY_BYTE_BASE_OFFSET + (index * VALUE_SIZE);
+            UNSAFE.putShort(VALUES, valueOffset, (short) Math.min(UNSAFE.getShort(VALUES, valueOffset), value));
+            valueOffset += 2;
+            UNSAFE.putShort(VALUES, valueOffset, (short) Math.max(UNSAFE.getShort(VALUES, valueOffset), value));
+            valueOffset += 2;
+            UNSAFE.putInt(VALUES, valueOffset, UNSAFE.getInt(VALUES, valueOffset) + 1);
+            valueOffset += 4;
+            UNSAFE.putLong(VALUES, valueOffset, UNSAFE.getLong(VALUES, valueOffset) + value);
+        }
+
+        /** Returns true when the len bytes of a starting at aOffset equal the len bytes of b starting at bOffset. */
+        private boolean equals(byte[] a, int aOffset, byte[] b, int bOffset, int len) {
+            // Range comparison with explicit end indexes, so the bound is correct for any bOffset
+            // (a loop bounded by "bOffset < len" would only be right for bOffset == 0).
+            return Arrays.equals(a, aOffset, aOffset + len, b, bOffset, bOffset + len);
+        }
+
+        /** Merges every occupied slot into result; each key is decoded as a UTF-8 station name. */
+        public void mergeTo(ResultAggr result) {
+            for (int i = 0; i < MAP_SIZE; i++) {
+                // extract key
+                final short keyLength = UNSAFE.getShort(KEYS, Unsafe.ARRAY_BYTE_BASE_OFFSET + (i * KEY_SIZE));
+                if (keyLength == 0) {
+                    continue; // a stored length of zero marks a slot that was never claimed
+                }
+
+                // extract values (if key is not null)
+                final long valueOffset = Unsafe.ARRAY_BYTE_BASE_OFFSET + (i * VALUE_SIZE);
+                result.compute(new String(KEYS, (i * KEY_SIZE) + 2, keyLength, StandardCharsets.UTF_8), (k, v) -> {
+                    short min = UNSAFE.getShort(VALUES, valueOffset);
+                    short max = UNSAFE.getShort(VALUES, valueOffset + 2);
+                    int count = UNSAFE.getInt(VALUES, valueOffset + 4);
+                    long sum = UNSAFE.getLong(VALUES, valueOffset + 8);
+
+                    if (v == null) {
+                        return new ResultAggr.Measurement(min, max, count, sum);
+                    }
+                    else {
+                        return v.update(min, max, count, sum);
+                    }
+                });
+            }
+        }
+
+    }
+
+    /**
+     * Measurement Aggregation (for all partitions)
+     * Simple Concurrent Hash Table so all partitions can merge concurrently
+     */
+    protected static class ResultAggr extends ConcurrentHashMap<String, ResultAggr.Measurement> {
+        /** Running aggregate of one station; min, max and sum are fixed-point values in tenths. */
+        protected static class Measurement {
+            public short min;
+            public short max;
+            public int count;
+            public long sum;
+
+            public Measurement(short min, short max, int count, long sum) {
+                this.min = min;
+                this.max = max;
+                this.count = count;
+                this.sum = sum;
+            }
+
+            /** Folds another partial aggregate into this one and returns this instance. */
+            public ResultAggr.Measurement update(short min, short max, int count, long sum) {
+                this.min = (short) Math.min(min, this.min);
+                this.max = (short) Math.max(max, this.max);
+                this.count += count;
+                this.sum += sum;
+                return this;
+            }
+
+            /** Renders "min/mean/max" scaled back by ten; the mean is rounded to one decimal. */
+            @Override
+            public String toString() {
+                return ((double) min / 10) + "/" + (Math.round((1.0 * sum) / count) / 10.0) + "/" + ((double) max / 10);
+            }
+        }
+
+        /** Returns a copy of the aggregates sorted by station name; typed, so callers need no raw-type casts. */
+        public Map<String, Measurement> toSorted() {
+            return new TreeMap<>(this);
+        }
+    }
+
+    protected static class Partition implements Runnable { // parses one byte range of the mapped file and merges its aggregates into result
+
+        private final MemorySegment data;
+        private long offset; // absolute address of the next byte to read
+        private final long limit; // absolute address at which parsing stops (exclusive)
+        private final ResultAggr result;
+
+        public Partition(MemorySegment data, long offset, long limit, ResultAggr result) {
+            this.data = data;
+            this.offset = data.address() + offset; // turn the segment-relative bounds into raw addresses for UNSAFE
+            this.limit = data.address() + limit;
+            this.result = result;
+        }
+
+        @Override
+        public void run() {
+            // measurement parsing
+            PartitionAggr aggr = new PartitionAggr();
+            byte[] stationName = new byte[128];
+            short stationLength;
+            int hash;
+            byte tempBuffer;
+            while (offset < limit) {
+                // find station name up to ";" while building a 31-based polynomial hash of its bytes
+                hash = 1;
+                stationLength = 0;
+                while ((stationName[stationLength] = UNSAFE.getByte(offset++)) != ';')
+                    hash = hash * 31 + stationName[stationLength++];
+
+                // find measurement up to "\n", accumulated as a fixed-point number (the '.' is dropped)
+                tempBuffer = UNSAFE.getByte(offset++);
+                boolean isNegative = (tempBuffer == '-');
+                short fixed = (short) (isNegative ? 0 : (tempBuffer - '0'));
+                while (true) {
+                    tempBuffer = UNSAFE.getByte(offset++);
+                    if (tempBuffer == '.') {
+                        fixed = (short) (fixed * 10 + (UNSAFE.getByte(offset) - '0'));
+                        offset += 2; // skip the fractional digit and the byte after it (the line terminator in well-formed input)
+                        break;
+                    }
+                    fixed = (short) (fixed * 10 + (tempBuffer - '0'));
+                }
+                fixed = isNegative ? (short) -fixed : fixed;
+
+                // update measurement
+                aggr.update(stationName, hash, stationLength, fixed);
+            }
+
+            // measurement result collection
+            aggr.mergeTo(result);
+        }
+
+    }
+
+    /** Maps the input file, splits it into one newline-aligned partition per processor and prints the merged, sorted result. */
+    public static void main(String[] args) throws IOException, InterruptedException {
+        // long startTime = System.currentTimeMillis();
+
+        try (FileChannel fileChannel = (FileChannel) Files.newByteChannel(Path.of(FILE), EnumSet.of(StandardOpenOption.READ));
+                Arena arena = Arena.ofShared()) {
+
+            // scan data
+            MemorySegment data = fileChannel.map(FileChannel.MapMode.READ_ONLY, 0, fileChannel.size(), arena);
+            final int processors = Runtime.getRuntime().availableProcessors();
+
+            // partition split
+            long[] partition = new long[processors + 1];
+            long partitionSize = Math.ceilDiv(data.byteSize(), processors);
+            for (int i = 0; i < processors; i++) {
+                partition[i + 1] = partition[i] + partitionSize;
+                if (partition[i + 1] >= data.byteSize()) {
+                    partition[i + 1] = data.byteSize();
+                    break;
+                }
+                // move the boundary just past the next '\n', but never read beyond the mapping (the file may lack a final newline)
+                while (partition[i + 1] < data.byteSize() && UNSAFE.getByte(data.address() + partition[i + 1]++) != '\n')
+                    ;
+            }
+
+            // partition aggregation
+            var threadList = new Thread[processors];
+            ResultAggr result = new ResultAggr();
+            for (int i = 0; i < processors; i++) {
+                threadList[i] = new Thread(new Partition(data, partition[i], partition[i + 1], result));
+                threadList[i].start();
+            }
+            for (var thread : threadList) {
+                thread.join();
+            }
+
+            System.out.println(result.toSorted());
+        }
+
+        // long elapsed = System.currentTimeMillis() - startTime;
+        // System.out.println("Elapsed: " + ((double) elapsed / 1000.0));
+    }
+
+}

From fc6fca43152b82acfbe927602c92abc3bcda1dd6 Mon Sep 17 00:00:00 2001
From: Arjen Wisse <arjenw@users.noreply.github.com>
Date: Sun, 14 Jan 2024 19:03:07 +0100
Subject: [PATCH 006/268] My attempt to parse it quickly (#401)

* My approach

* Update calculate_average_arjenw.sh

---------

Co-authored-by: Gunnar Morling <gunnar.morling@googlemail.com>
---
 calculate_average_arjenw.sh                   |  20 ++
 .../onebrc/CalculateAverage_arjenw.java       | 233 ++++++++++++++++++
 2 files changed, 253 insertions(+)
 create mode 100755 calculate_average_arjenw.sh
 create mode 100644 src/main/java/dev/morling/onebrc/CalculateAverage_arjenw.java

diff --git a/calculate_average_arjenw.sh b/calculate_average_arjenw.sh
new file mode 100755
index 000000000..73391a776
--- /dev/null
+++ b/calculate_average_arjenw.sh
@@ -0,0 +1,20 @@
+#!/bin/sh
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+JAVA_OPTS="-Xms500m -Xmx500m --enable-preview -dsa -XX:+UnlockExperimentalVMOptions -XX:+UseEpsilonGC -XX:-AlwaysPreTouch" # fixed 500 MB heap, preview features, system assertions off, no-op Epsilon GC
+
+java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_arjenw
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_arjenw.java b/src/main/java/dev/morling/onebrc/CalculateAverage_arjenw.java
new file mode 100644
index 000000000..0f5f3fe68
--- /dev/null
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_arjenw.java
@@ -0,0 +1,233 @@
+/*
+ *  Copyright 2023 The original authors
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package dev.morling.onebrc;
+
+import java.io.File;
+import java.io.IOException;
+import java.nio.channels.FileChannel;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.StandardOpenOption;
+import java.util.Arrays;
+import java.util.Comparator;
+import java.util.Iterator;
+import java.util.NoSuchElementException;
+import java.util.function.Consumer;
+import java.util.function.Supplier;
+import java.util.stream.IntStream;
+
+// Calculate Average
+// * baseline:                              3m7s
+// * single-threaded chunk-based reading:   0m45s
+// * multi-threaded chunk-based reading:    0m14s
+// * less branches in parsing:              0m12s
+// * list approach instead of map:          0m5.5s
+// * chunk finetuning:                      0m4.5s
+// * threadlocal result gathering:          0m4.3s (trying graalvm-ce)
+// * memory-mapped file approach:           0m3.2s (also way simpler and neater code; inspired by spullara)
+// * smarter number parsing:                0m2.95s (inspired by iziamos)
+// * switching back to 21-tem vm            0m2.6s
+
+public class CalculateAverage_arjenw {
+    private static final int TWO_BYTE_TO_INT = 480 + 48;
+    private static final int THREE_BYTE_TO_INT = 4800 + 480 + 48;
+    private static final String FILE = "./measurements.txt";
+
+    public static void main(String[] args) {
+        var file = new File(FILE);
+        var fileSize = file.length();
+        var numberOfProcessors = fileSize > 1_000_000 ? Runtime.getRuntime().availableProcessors() : 1; // files of at most 1,000,000 bytes are parsed as a single segment
+        var segmentSize = fileSize / numberOfProcessors; // integer division, so the segments may not add up to the full file size
+        var results = IntStream.range(0, numberOfProcessors)
+                .mapToObj(segmentNr -> parseSegment(file, fileSize, segmentSize, segmentNr))
+                .parallel()
+                .reduce(StationList::merge)
+                .orElseGet(StationList::new)
+                .toStringArray();
+        Arrays.sort(results, Comparator.comparing(o -> take(o, '='))); // order by the text before '=', i.e. the station name
+        System.out.format("{%s}%n", String.join(", ", results));
+    }
+
+    /** Parses the lines owned by segment segmentNr: every line that starts after the segment's first '\n' (or at offset 0) and no later than its end offset. */
+    private static StationList parseSegment(File file, long fileSize, long segmentSize, int segmentNr) {
+        long segmentStart = segmentNr * segmentSize;
+        long segmentEnd = Math.min(fileSize, segmentStart + segmentSize + 128); // overlap has to cover the longest line: 100-byte name + ";-99.9\n"
+        StationList stationList = new StationList();
+        try (var fileChannel = (FileChannel) Files.newByteChannel(file.toPath(), StandardOpenOption.READ)) {
+            var bb = fileChannel.map(FileChannel.MapMode.READ_ONLY, segmentStart, segmentEnd - segmentStart);
+            if (segmentStart > 0) {
+                while (bb.get() != '\n')
+                    ; // skip to first new line
+            }
+            // The next segment skips a line that starts on its very first byte, so that line is parsed here; the last segment also takes the division remainder.
+            long limit = fileSize - segmentStart < 2 * segmentSize ? bb.limit() : Math.min(bb.limit(), segmentSize + 1);
+            var buffer = new byte[100];
+            while (bb.position() < limit) {
+                var i = 0;
+                int hash = 0;
+                for (byte b = bb.get(); b != ';'; b = bb.get()) {
+                    hash = hash * 31 + b;
+                    buffer[i++] = b;
+                }
+                int value;
+                byte b1 = bb.get();
+                byte b2 = bb.get();
+                byte b3 = bb.get();
+                byte b4 = bb.get();
+                if (b2 == '.') {// value is n.n
+                    value = (b1 * 10 + b3 - TWO_BYTE_TO_INT); // b4 == \n
+                }
+                else {
+                    if (b4 == '.') { // value is -nn.n
+                        value = -(b2 * 100 + b3 * 10 + bb.get() - THREE_BYTE_TO_INT);
+                    }
+                    else if (b1 == '-') { // value is -n.n
+                        value = -(b2 * 10 + b4 - TWO_BYTE_TO_INT);
+                    }
+                    else { // value is nn.n
+                        value = (b1 * 100 + b2 * 10 + b4 - THREE_BYTE_TO_INT);
+                    }
+                    bb.get(); // new line
+                }
+
+                stationList.add(buffer, i, Math.abs(hash), value);
+            }
+            return stationList;
+        }
+        catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    /** Aggregate (min, max, total, count; values in tenths) of all readings seen for one station name. */
+    private static final class Station {
+        private final byte[] data;
+        private final int hash;
+        private final int length;
+
+        private int min;
+        private int max;
+        private long total; // long: at up to 999 tenths per row an int sum can overflow after about 2.1 million rows
+        private int count;
+
+        private Station(byte[] data, int length, int hash, int value) {
+            this.data = data;
+            this.hash = hash;
+            this.length = length;
+            total = min = max = value;
+            count = 1;
+        }
+
+        @Override
+        public String toString() {
+            return STR."\{new String(data, 0, length, StandardCharsets.UTF_8)}=\{min / 10.0}/\{Math.round(((double) total) / count) / 10.0}/\{max / 10.0}";
+        }
+
+        private void append(int min, int max, long total, int count) {
+            if (min < this.min)
+                this.min = min;
+            if (max > this.max)
+                this.max = max;
+            this.total += total;
+            this.count += count;
+        }
+
+        public void append(int value) {
+            append(value, value, value, 1);
+        }
+
+        public void merge(Station other) {
+            append(other.min, other.max, other.total, other.count);
+        }
+    }
+
+    private static class StationList implements Iterable<Station> {
+        private final static int MAX_ENTRY = 32767; // choose a value that is binary all 1's.
+        private final Station[] array = new Station[MAX_ENTRY + 1];
+        private int size = 0;
+
+        private void add(int hash, Supplier<Station> create, Consumer<Station> update) {
+            var position = hash & MAX_ENTRY;
+            Station existing;
+            while ((existing = array[position]) != null && existing.hash != hash) {
+                position = (position + 1) & MAX_ENTRY;
+            }
+            if (existing == null) {
+                array[position] = create.get();
+                size++;
+            }
+            else {
+                update.accept(existing);
+            }
+        }
+
+        public void add(byte[] data, int stationNameLength, int stationHash, int value) {
+            add(stationHash, () -> {
+                var stationName = new byte[stationNameLength];
+                System.arraycopy(data, 0, stationName, 0, stationNameLength);
+                return new Station(stationName, stationNameLength, stationHash, value);
+            }, existing -> existing.append(value));
+        }
+
+        public void add(Station station) {
+            add(station.hash, () -> station, existing -> existing.merge(station));
+        }
+
+        public String[] toStringArray() {
+            var destination = new String[size];
+
+            var i = 0;
+            for (Station station : this)
+                destination[i++] = station.toString();
+
+            return destination;
+        }
+
+        public StationList merge(StationList other) {
+            for (Station station : other)
+                add(station);
+            return this;
+        }
+
+        @Override
+        public Iterator<Station> iterator() {
+            return new Iterator<>() {
+                private int index = 0;
+
+                @Override
+                public boolean hasNext() {
+                    Station station = null;
+                    while (index <= MAX_ENTRY && (station = array[index]) == null)
+                        index++;
+                    return station != null;
+                }
+
+                @Override
+                public Station next() {
+                    if (hasNext()) {
+                        return array[index++];
+                    }
+                    throw new NoSuchElementException();
+                }
+            };
+        }
+    }
+
+    private static String take(String s, char c) {
+        var pos = s.indexOf(c);
+        return pos > -1 ? s.substring(0, pos) : s;
+    }
+}

From 3fbc4a2fa89e199ab1289cd8ac5d496009120c82 Mon Sep 17 00:00:00 2001
From: Stefan Sprenger <stefan@datacater.io>
Date: Sun, 14 Jan 2024 19:06:01 +0100
Subject: [PATCH 007/268] Update submission (#385)

* feat(flippingbits): Improve parsing of station names

* chore(flippingbits): Remove obsolete import

* feat(flippingbits): Use custom hash map

* feat(flippingbits): Use UNSAFE

* fix(flippingbits): Support very small files

* chore(flippingbits): Few cleanups

* chore(flippingbits): Align names

* fix(flippingbits): Initialize hash with first byte

* fix(flippingbits): Fix initialization of hash value
---
 calculate_average_flippingbits.sh             |   2 +-
 .../onebrc/CalculateAverage_flippingbits.java | 290 ++++++++++++------
 2 files changed, 202 insertions(+), 90 deletions(-)

diff --git a/calculate_average_flippingbits.sh b/calculate_average_flippingbits.sh
index b37baa0e5..7dcbe74bb 100755
--- a/calculate_average_flippingbits.sh
+++ b/calculate_average_flippingbits.sh
@@ -15,5 +15,5 @@
 #  limitations under the License.
 #
 
-JAVA_OPTS="--add-modules=jdk.incubator.vector"
+JAVA_OPTS="--add-modules=jdk.incubator.vector --enable-preview"
 java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_flippingbits
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_flippingbits.java b/src/main/java/dev/morling/onebrc/CalculateAverage_flippingbits.java
index 2510d8526..3489877f6 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_flippingbits.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_flippingbits.java
@@ -18,8 +18,13 @@
 import jdk.incubator.vector.ShortVector;
 import jdk.incubator.vector.VectorOperators;
 
+import sun.misc.Unsafe;
+import java.lang.foreign.Arena;
+import java.lang.reflect.Field;
+
 import java.io.IOException;
 import java.io.RandomAccessFile;
+import java.nio.channels.FileChannel;
 import java.nio.charset.StandardCharsets;
 import java.util.*;
 
@@ -34,14 +39,31 @@ public class CalculateAverage_flippingbits {
 
     private static final String FILE = "./measurements.txt";
 
-    private static final long CHUNK_SIZE = 10 * 1024 * 1024; // 10 MB
+    private static final long MINIMUM_FILE_SIZE_PARTITIONING = 10 * 1024 * 1024; // 10 MB
 
     private static final int SIMD_LANE_LENGTH = ShortVector.SPECIES_MAX.length();
 
-    private static final int MAX_STATION_NAME_LENGTH = 100;
+    private static final int NUM_STATIONS = 10_000;
+
+    private static final int HASH_MAP_OFFSET_CAPACITY = 200_000;
+
+    private static final Unsafe UNSAFE = initUnsafe();
+
+    private static final int HASH_PRIME_NUMBER = 31; // multiplier of the polynomial station-name hash; constant, hence final
+
+    private static Unsafe initUnsafe() { // looks up the Unsafe instance reflectively; failures surface as RuntimeException
+        try {
+            Field theUnsafe = Unsafe.class.getDeclaredField("theUnsafe");
+            theUnsafe.setAccessible(true); // suppress the access check before reading the field
+            return (Unsafe) theUnsafe.get(Unsafe.class);
+        }
+        catch (NoSuchFieldException | IllegalAccessException e) {
+            throw new RuntimeException(e); // keep the reflective failure as the cause
+        }
+    }
 
     public static void main(String[] args) throws IOException {
-        var result = Arrays.asList(getSegments()).stream()
+        var result = Arrays.asList(getSegments()).parallelStream()
                 .map(segment -> {
                     try {
                         return processSegment(segment[0], segment[1]);
@@ -50,126 +72,137 @@ public static void main(String[] args) throws IOException {
                         throw new RuntimeException(e);
                     }
                 })
-                .parallel()
-                .reduce((firstMap, secondMap) -> {
-                    for (var entry : secondMap.entrySet()) {
-                        PartitionAggregate firstAggregate = firstMap.get(entry.getKey());
-                        if (firstAggregate == null) {
-                            firstMap.put(entry.getKey(), entry.getValue());
-                        }
-                        else {
-                            firstAggregate.mergeWith(entry.getValue());
-                        }
-                    }
-                    return firstMap;
-                })
-                .map(TreeMap::new).get();
+                .reduce(FasterHashMap::mergeWith)
+                .get();
+
+        var sortedMap = new TreeMap<String, Station>();
+        for (Station station : result.getEntries()) {
+            sortedMap.put(station.getName(), station);
+        }
 
-        System.out.println(result);
+        System.out.println(sortedMap);
     }
 
     private static long[][] getSegments() throws IOException {
         try (var file = new RandomAccessFile(FILE, "r")) {
-            var fileSize = file.length();
+            var channel = file.getChannel();
+
+            var fileSize = channel.size();
+            var startAddress = channel
+                    .map(FileChannel.MapMode.READ_ONLY, 0, fileSize, Arena.global())
+                    .address();
+
             // Split file into segments, so we can work around the size limitation of channels
-            var numSegments = (int) (fileSize / CHUNK_SIZE);
+            var numSegments = (fileSize > MINIMUM_FILE_SIZE_PARTITIONING)
+                    ? Runtime.getRuntime().availableProcessors()
+                    : 1;
+            var segmentSize = fileSize / numSegments;
 
-            var boundaries = new long[numSegments + 1][2];
-            var endPointer = 0L;
+            var boundaries = new long[numSegments][2];
+            var endPointer = startAddress;
 
-            for (var i = 0; i < numSegments; i++) {
+            for (var i = 0; i < numSegments - 1; i++) {
                 // Start of segment
-                boundaries[i][0] = Math.min(Math.max(endPointer, i * CHUNK_SIZE), fileSize);
-
-                // Seek end of segment, limited by the end of the file
-                file.seek(Math.min(boundaries[i][0] + CHUNK_SIZE - 1, fileSize));
+                boundaries[i][0] = endPointer;
 
                 // Extend segment until end of line or file
-                while (file.read() != '\n') {
+                endPointer = endPointer + segmentSize;
+                while (UNSAFE.getByte(endPointer) != '\n') {
+                    endPointer++;
                 }
 
                 // End of segment
-                endPointer = file.getFilePointer();
-                boundaries[i][1] = endPointer;
+                boundaries[i][1] = endPointer++;
             }
 
-            boundaries[numSegments][0] = Math.max(endPointer, numSegments * CHUNK_SIZE);
-            boundaries[numSegments][1] = fileSize;
+            boundaries[numSegments - 1][0] = endPointer;
+            boundaries[numSegments - 1][1] = startAddress + fileSize;
 
             return boundaries;
         }
     }
 
-    private static Map<String, PartitionAggregate> processSegment(long startOfSegment, long endOfSegment)
-            throws IOException {
-        Map<String, PartitionAggregate> stationAggregates = new HashMap<>(50_000);
-        var byteChunk = new byte[(int) (endOfSegment - startOfSegment)];
-        var stationBuffer = new byte[MAX_STATION_NAME_LENGTH];
-        try (var file = new RandomAccessFile(FILE, "r")) {
-            file.seek(startOfSegment);
-            file.read(byteChunk);
-            var i = 0;
-            while (i < byteChunk.length) {
-                // Station name has at least one byte
-                stationBuffer[0] = byteChunk[i];
-                i++;
-                // Read station name
-                var j = 1;
-                while (byteChunk[i] != ';') {
-                    stationBuffer[j] = byteChunk[i];
-                    j++;
-                    i++;
-                }
-                var station = new String(stationBuffer, 0, j, StandardCharsets.UTF_8);
+    private static FasterHashMap processSegment(long startOfSegment, long endOfSegment) throws IOException {
+        var fasterHashMap = new FasterHashMap();
+        for (var i = startOfSegment; i < endOfSegment; i += 3) {
+            // Read station name
+            int nameHash = UNSAFE.getByte(i);
+            final var nameStartAddress = i++;
+            var character = UNSAFE.getByte(i);
+            while (character != ';') {
+                nameHash = nameHash * HASH_PRIME_NUMBER + character;
                 i++;
+                character = UNSAFE.getByte(i);
+            }
+            var nameLength = (int) (i - nameStartAddress);
+            i++;
 
-                // Read measurement
-                var isNegative = byteChunk[i] == '-';
-                var measurement = 0;
-                if (isNegative) {
+            // Read measurement
+            var isNegative = UNSAFE.getByte(i) == '-';
+            var measurement = 0;
+            if (isNegative) {
+                i++;
+                character = UNSAFE.getByte(i);
+                while (character != '.') {
+                    measurement = measurement * 10 + character - '0';
                     i++;
-                    while (byteChunk[i] != '.') {
-                        measurement = measurement * 10 + byteChunk[i] - '0';
-                        i++;
-                    }
-                    measurement = (measurement * 10 + byteChunk[i + 1] - '0') * -1;
+                    character = UNSAFE.getByte(i);
                 }
-                else {
-                    while (byteChunk[i] != '.') {
-                        measurement = measurement * 10 + byteChunk[i] - '0';
-                        i++;
-                    }
-                    measurement = measurement * 10 + byteChunk[i + 1] - '0';
+                measurement = (measurement * 10 + UNSAFE.getByte(i + 1) - '0') * -1;
+            }
+            else {
+                character = UNSAFE.getByte(i);
+                while (character != '.') {
+                    measurement = measurement * 10 + character - '0';
+                    i++;
+                    character = UNSAFE.getByte(i);
                 }
-
-                // Update aggregate
-                var aggregate = stationAggregates.computeIfAbsent(station, x -> new PartitionAggregate());
-                aggregate.addMeasurementAndComputeAggregate((short) measurement);
-                i += 3;
+                measurement = measurement * 10 + UNSAFE.getByte(i + 1) - '0';
             }
-            stationAggregates.values().forEach(PartitionAggregate::aggregateRemainingMeasurements);
+
+            fasterHashMap.addEntry(nameHash, nameLength, nameStartAddress, (short) measurement);
+        }
+
+        for (Station station : fasterHashMap.getEntries()) {
+            station.aggregateRemainingMeasurements();
         }
 
-        return stationAggregates;
+        return fasterHashMap;
     }
 
-    private static class PartitionAggregate {
-        final short[] doubleLane = new short[SIMD_LANE_LENGTH * 2];
+    private static class Station {
+        final short[] measurements = new short[SIMD_LANE_LENGTH * 2];
         // Assume that we do not have more than Integer.MAX_VALUE measurements for the same station per partition
-        int count = 0;
+        int count = 1;
         long sum = 0;
         short min = Short.MAX_VALUE;
         short max = Short.MIN_VALUE;
+        final long nameAddress;
+        final int nameLength;
+        final int nameHash;
+
+        public Station(int nameHash, int nameLength, long nameAddress, short measurement) {
+            this.nameHash = nameHash;
+            this.nameLength = nameLength;
+            this.nameAddress = nameAddress;
+            measurements[0] = measurement;
+        }
+
+        public String getName() {
+            byte[] name = new byte[nameLength];
+            UNSAFE.copyMemory(null, nameAddress, name, Unsafe.ARRAY_BYTE_BASE_OFFSET, nameLength);
+            return new String(name, StandardCharsets.UTF_8);
+        }
 
         public void addMeasurementAndComputeAggregate(short measurement) {
             // Add measurement to buffer, which is later processed by SIMD instructions
-            doubleLane[count % doubleLane.length] = measurement;
+            measurements[count % measurements.length] = measurement;
             count++;
 
             // Once lane is full, use SIMD instructions to calculate aggregates
-            if (count % doubleLane.length == 0) {
-                var firstVector = ShortVector.fromArray(ShortVector.SPECIES_MAX, doubleLane, 0);
-                var secondVector = ShortVector.fromArray(ShortVector.SPECIES_MAX, doubleLane, SIMD_LANE_LENGTH);
+            if (count % measurements.length == 0) {
+                var firstVector = ShortVector.fromArray(ShortVector.SPECIES_MAX, measurements, 0);
+                var secondVector = ShortVector.fromArray(ShortVector.SPECIES_MAX, measurements, SIMD_LANE_LENGTH);
 
                 var simdMin = firstVector.min(secondVector).reduceLanes(VectorOperators.MIN);
                 min = (short) Math.min(min, simdMin);
@@ -182,19 +215,35 @@ public void addMeasurementAndComputeAggregate(short measurement) {
         }
 
         public void aggregateRemainingMeasurements() {
-            for (var i = 0; i < count % doubleLane.length; i++) {
-                var measurement = doubleLane[i];
+            for (var i = 0; i < count % measurements.length; i++) {
+                var measurement = measurements[i];
                 min = (short) Math.min(min, measurement);
                 max = (short) Math.max(max, measurement);
                 sum += measurement;
             }
         }
 
-        public void mergeWith(PartitionAggregate otherAggregate) {
-            min = (short) Math.min(min, otherAggregate.min);
-            max = (short) Math.max(max, otherAggregate.max);
-            count = count + otherAggregate.count;
-            sum = sum + otherAggregate.sum;
+        public void mergeWith(Station otherStation) {
+            min = (short) Math.min(min, otherStation.min);
+            max = (short) Math.max(max, otherStation.max);
+            count = count + otherStation.count;
+            sum = sum + otherStation.sum;
+        }
+
+        public boolean nameEquals(long otherNameAddress) {
+            var swarLimit = (nameLength / Long.BYTES) * Long.BYTES;
+            var i = 0;
+            for (; i < swarLimit; i += Long.BYTES) {
+                if (UNSAFE.getLong(nameAddress + i) != UNSAFE.getLong(otherNameAddress + i)) {
+                    return false;
+                }
+            }
+            for (; i < nameLength; i++) {
+                if (UNSAFE.getByte(nameAddress + i) != UNSAFE.getByte(otherNameAddress + i)) {
+                    return false;
+                }
+            }
+            return true;
         }
 
         public String toString() {
@@ -206,4 +255,67 @@ public String toString() {
                     (max / 10.0));
         }
     }
+
+    /**
+     * Use two arrays for implementing the hash map:
+     * - The array `entries` holds the map values, in our case instances of the class Station.
+     * - The array `offsets` maps hashes of the keys to indexes in the `entries` array.
+     *
+     * We create `offsets` with a much larger capacity than `entries`, so we minimize collisions.
+     */
+    private static class FasterHashMap {
+        // Offsets are 16-bit integers (shorts): 0 marks a free slot, 1..32,767 (2^15 - 1) index into `entries`.
+        // If you need to store more entries, consider replacing short with int
+        short[] offsets = new short[HASH_MAP_OFFSET_CAPACITY];
+        Station[] entries = new Station[NUM_STATIONS + 1];
+        int slotsInUse = 0;
+
+        private int getOffsetIdx(int nameHash, int nameLength, long nameAddress) {
+            // offsets.length is not a power of two, so reduce the hash with floorMod instead of a (length - 1) mask
+            var offsetIdx = Math.floorMod(nameHash, offsets.length);
+            var offset = offsets[offsetIdx];
+            while (offset != 0 &&
+                    (nameLength != entries[offset].nameLength || !entries[offset].nameEquals(nameAddress))) {
+                offsetIdx = (offsetIdx + 1) % offsets.length;
+                offset = offsets[offsetIdx];
+            }
+            // Either the slot of the matching station or the first free slot of the probe sequence
+            return offsetIdx;
+        }
+
+        public void addEntry(int nameHash, int nameLength, long nameAddress, short measurement) {
+            // Records one measurement, creating the station on its first occurrence
+            var offsetIdx = getOffsetIdx(nameHash, nameLength, nameAddress);
+            var offset = offsets[offsetIdx];
+            if (offset == 0) {
+                slotsInUse++;
+                entries[slotsInUse] = new Station(nameHash, nameLength, nameAddress, measurement);
+                offsets[offsetIdx] = (short) slotsInUse;
+            }
+            else {
+                entries[offset].addMeasurementAndComputeAggregate(measurement);
+            }
+        }
+
+        public FasterHashMap mergeWith(FasterHashMap otherMap) {
+            // Folds otherMap into this map (stations not seen here are adopted as-is) and returns this map
+            for (Station station : otherMap.getEntries()) {
+                var offsetIdx = getOffsetIdx(station.nameHash, station.nameLength, station.nameAddress);
+                var offset = offsets[offsetIdx];
+                if (offset == 0) {
+                    slotsInUse++;
+                    entries[slotsInUse] = station;
+                    offsets[offsetIdx] = (short) slotsInUse;
+                }
+                else {
+                    entries[offset].mergeWith(station);
+                }
+            }
+            return this;
+        }
+
+        public List<Station> getEntries() {
+            return Arrays.asList(entries).subList(1, slotsInUse + 1); // entries[0] is never populated
+        }
+    }
 }

From 30987d778c4be47f975af275f9cb121cb3149fe5 Mon Sep 17 00:00:00 2001
From: Jesse Van Rooy <jessevanrooy@hotmail.com>
Date: Sun, 14 Jan 2024 19:09:58 +0100
Subject: [PATCH 008/268] CalculateAverage_JesseVanRooy (Submission 1) (#335)

* Submission #1

* Submission #1 (Fixed casing of file names)

* Submission #1 (Added executable to Git permissions)

* Submission 1 (Fixed incorrect map size)

* Submission 1 (Fixed output problems on Windows)
---
 calculate_average_JesseVanRooy.sh             |  20 ++
 .../onebrc/CalculateAverage_JesseVanRooy.java | 256 ++++++++++++++++++
 2 files changed, 276 insertions(+)
 create mode 100755 calculate_average_JesseVanRooy.sh
 create mode 100644 src/main/java/dev/morling/onebrc/CalculateAverage_JesseVanRooy.java

diff --git a/calculate_average_JesseVanRooy.sh b/calculate_average_JesseVanRooy.sh
new file mode 100755
index 000000000..c680e974c
--- /dev/null
+++ b/calculate_average_JesseVanRooy.sh
@@ -0,0 +1,20 @@
+#!/bin/sh
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+
+JAVA_OPTS="--enable-preview -XX:-TieredCompilation -Dsun.stdout.encoding=UTF-8"
+java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_JesseVanRooy
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_JesseVanRooy.java b/src/main/java/dev/morling/onebrc/CalculateAverage_JesseVanRooy.java
new file mode 100644
index 000000000..ba0475e35
--- /dev/null
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_JesseVanRooy.java
@@ -0,0 +1,256 @@
+/*
+ *  Copyright 2023 The original authors
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package dev.morling.onebrc;
+
+import sun.misc.Unsafe;
+
+import java.io.IOException;
+import java.lang.foreign.Arena;
+import java.lang.foreign.MemorySegment;
+import java.lang.foreign.ValueLayout;
+import java.lang.reflect.Field;
+import java.nio.channels.FileChannel;
+import java.nio.file.Path;
+import java.nio.file.StandardOpenOption;
+import java.util.*;
+import java.util.stream.IntStream;
+
+// Disclaimer: the idea of splitting the input into one chunk per available core came from previously submitted solutions.
+public class CalculateAverage_JesseVanRooy {
+
+    private static final String FILE = "./measurements.txt";
+
+    private static final ValueLayout.OfByte DATA_LAYOUT = ValueLayout.JAVA_BYTE;
+
+    private static final Unsafe UNSAFE = initUnsafe();
+
+    private static Unsafe initUnsafe() { // looks up the Unsafe instance reflectively; failures surface as RuntimeException
+        try {
+            Field theUnsafe = Unsafe.class.getDeclaredField("theUnsafe");
+            theUnsafe.setAccessible(true); // suppress the access check before reading the field
+            return (Unsafe) theUnsafe.get(Unsafe.class);
+        }
+        catch (NoSuchFieldException | IllegalAccessException e) {
+            throw new RuntimeException(e); // keep the reflective failure as the cause
+        }
+    }
+
+    public static class Result { // running aggregate for one station
+        long nameStart; // address of the first byte of the station name (assigned in process)
+        long nameSize; // length of the station name in bytes
+        String name; // decoded station name
+        int min; // min, max and sum are stored as integers in tenths of a degree
+        int max;
+        long sum;
+        int count; // number of measurements aggregated
+
+        double min() { // minimum converted from tenths to whole degrees
+            return min / 10.0;
+        }
+
+        double max() { // maximum converted from tenths to whole degrees
+            return max / 10.0;
+        }
+
+        double mean() { // arithmetic mean in degrees: (sum of tenths / 10) / count
+            return (sum / 10.0) / count;
+        }
+    }
+
+    public static class ThreadResult { // output holder that process fills for one chunk
+        Result[] results; // the non-null hash-table values collected at the end of process
+    }
+
+    static final int MAP_SIZE = 16384; // hash-table slot count; a power of two so MAP_MASK can be used as an index mask
+    static final int MAP_MASK = MAP_SIZE - 1; // ANDed with the probe position to obtain a slot index
+    static final int VALUE_CAPACITY = 10000; // number of Result objects pre-created by each process call
+
+    /**
+     * Parses every {@code name;temperature\n} record in {@code memorySegment} and stores one aggregated
+     * {@link Result} per station in {@code threadResult.results}; temperatures are kept in tenths of a degree.
+     */
+    static void process(MemorySegment memorySegment, ThreadResult threadResult) {
+        // open-addressing table; a slot is free while values[slot] == null, so any int is a legal hash
+        final int[] keys = new int[MAP_SIZE];
+        final Result[] values = new Result[MAP_SIZE];
+
+        // pre-create the result objects
+        final Result[] preCreatedResults = new Result[VALUE_CAPACITY];
+        int usedPreCreatedResults = 0;
+        for (int i = 0; i < VALUE_CAPACITY; i++)
+            preCreatedResults[i] = new Result();
+
+        // load address info
+        final long size = memorySegment.byteSize();
+        final long address = memorySegment.address();
+        final long end = address + size;
+
+        for (long index = address; index < end;) {
+            final long nameStart = index;
+            byte next = UNSAFE.getByte(index);
+
+            // hash the city name
+            int hash = 0;
+            while (next != ';') {
+                hash = (hash * 33) + next;
+                next = UNSAFE.getByte(++index);
+            }
+            final long nameSize = index - nameStart;
+
+            // skip the separator and check for negative
+            next = UNSAFE.getByte(++index);
+            boolean negative = next == '-';
+            if (negative) {
+                next = UNSAFE.getByte(++index);
+            }
+
+            // first integer digit, optionally followed by a second one
+            int temperature = next - '0';
+            next = UNSAFE.getByte(++index);
+            if (next != '.') {
+                temperature = (temperature * 10) + (next - '0');
+                index++;
+            }
+
+            // skip the . and add the last digit to temperature
+            next = UNSAFE.getByte(++index);
+            temperature = (temperature * 10) + (next - '0');
+            if (negative) {
+                temperature = -temperature;
+            }
+
+            // step over the last digit and the newline
+            index += 2;
+
+            // insert into map; probes are counted because hash + MAP_SIZE can overflow int
+            for (int probe = 0; probe < MAP_SIZE; probe++) {
+                int mapIndex = (hash + probe) & MAP_MASK;
+                Result result = values[mapIndex];
+                if (result == null) {
+                    result = preCreatedResults[usedPreCreatedResults++];
+                    result.nameStart = nameStart;
+                    result.nameSize = nameSize;
+                    result.min = temperature;
+                    result.max = temperature;
+                    result.sum = temperature;
+                    result.count = 1;
+                    keys[mapIndex] = hash;
+                    values[mapIndex] = result;
+                    break;
+                }
+                // an equal hash is not proof of an equal name, so compare the name bytes as well
+                if (keys[mapIndex] == hash && sameName(result, nameStart, nameSize)) {
+                    result.min = Math.min(result.min, temperature);
+                    result.max = Math.max(result.max, temperature);
+                    result.sum += temperature;
+                    result.count++;
+                    break;
+                }
+            }
+        }
+
+        threadResult.results = Arrays.stream(values).filter(Objects::nonNull).toArray(Result[]::new);
+        for (Result result : threadResult.results) {
+            byte[] nameBytes = memorySegment.asSlice(result.nameStart - address, result.nameSize).toArray(DATA_LAYOUT);
+            result.name = new String(nameBytes, java.nio.charset.StandardCharsets.UTF_8);
+        }
+    }
+
+    /** Returns whether the name stored for {@code result} equals the {@code nameSize} bytes at {@code nameStart}. */
+    private static boolean sameName(Result result, long nameStart, long nameSize) {
+        if (result.nameSize != nameSize) {
+            return false;
+        }
+        for (long offset = 0; offset < nameSize; offset++) {
+            if (UNSAFE.getByte(result.nameStart + offset) != UNSAFE.getByte(nameStart + offset)) {
+                return false;
+            }
+        }
+        return true;
+    }
+
+    public static void main(String[] args) throws IOException, InterruptedException {
+        int numberOfChunks = Runtime.getRuntime().availableProcessors(); // one chunk per available processor
+        // Flow: split the file into chunks that start on line boundaries, aggregate each chunk in parallel, merge by name, print
+        try (var fileChannel = FileChannel.open(Path.of(FILE), StandardOpenOption.READ)) {
+            // map the whole file once; the chunks below are slices of this mapping
+            long fileSize = fileChannel.size();
+            MemorySegment allData = fileChannel.map(FileChannel.MapMode.READ_ONLY, 0, fileSize, Arena.global());
+
+            long segmentSize = (fileSize + numberOfChunks - 1) / numberOfChunks; // ceiling division
+            long[] segmentBounds = new long[numberOfChunks + 1];
+            // segmentBounds[i] is the start offset of chunk i; interior bounds are moved to just past the next '\n'
+            segmentBounds[0] = 0;
+            for (int i = 1; i < numberOfChunks; i++) {
+                long chunkAddress = i * segmentSize;
+                while (chunkAddress < fileSize && allData.getAtIndex(DATA_LAYOUT, chunkAddress++) != '\n') {
+                }
+                segmentBounds[i] = Math.min(chunkAddress, fileSize); // clamp so that trailing chunks may be empty
+            }
+            segmentBounds[numberOfChunks] = fileSize;
+
+            ThreadResult[] threadResults = IntStream.range(0, numberOfChunks)
+                    .parallel()
+                    .mapToObj(i -> {
+                        long size = segmentBounds[i + 1] - segmentBounds[i];
+                        long offset = segmentBounds[i];
+                        MemorySegment segment = allData.asSlice(offset, size);
+                        ThreadResult result = new ThreadResult();
+                        process(segment, result); // fills result.results for this chunk
+                        return result;
+                    })
+                    .toArray(ThreadResult[]::new);
+
+            HashMap<String, Result> combinedResults = new HashMap<>(1024);
+            // merge the per-chunk aggregates by station name
+            for (int i = 0; i < numberOfChunks; i++) {
+                for (Result result : threadResults[i].results) {
+                    if (!combinedResults.containsKey(result.name)) {
+                        Result newResult = new Result(); // copy, so the per-chunk object is not mutated by later merges
+                        newResult.name = result.name;
+                        newResult.min = result.min;
+                        newResult.max = result.max;
+                        newResult.sum = result.sum;
+                        newResult.count = result.count;
+                        combinedResults.put(result.name, newResult);
+                    }
+                    else {
+                        Result existingResult = combinedResults.get(result.name);
+                        existingResult.min = Math.min(existingResult.min, result.min);
+                        existingResult.max = Math.max(existingResult.max, result.max);
+                        existingResult.sum += result.sum;
+                        existingResult.count += result.count;
+                    }
+                }
+            }
+
+            Result[] sortedResults = combinedResults.values().toArray(Result[]::new);
+            Arrays.sort(sortedResults, Comparator.comparing(result -> result.name)); // natural String order of the names
+
+            System.out.print("{");
+            // entries are printed as name=min/mean/max with one decimal, separated by ", "
+            for (int i = 0; i < sortedResults.length; i++) {
+                Result sortedResult = sortedResults[i];
+                if (i != 0) {
+                    System.out.print(", ");
+                }
+                System.out.printf(Locale.US, "%s=%.1f/%.1f/%.1f", sortedResult.name, sortedResult.min(), sortedResult.mean(), sortedResult.max());
+            }
+
+            System.out.printf("}\n");
+        }
+    }
+}

From f9fb9bb3848d65da796592e98e9daa77ba5b886f Mon Sep 17 00:00:00 2001
From: unbounded <haakhi@gmail.com>
Date: Sun, 14 Jan 2024 19:11:57 +0100
Subject: [PATCH 009/268] Add implementation for user unbounded (#394)

Implementation that uses the Vector API for the following
 - scan for separators
 - calculate hash
 - n-way lookup in hash table
 - parse digits

Edit: fix queue size
---
 calculate_average_unbounded.sh                |  19 +
 .../onebrc/CalculateAverage_unbounded.java    | 437 ++++++++++++++++++
 2 files changed, 456 insertions(+)
 create mode 100755 calculate_average_unbounded.sh
 create mode 100644 src/main/java/dev/morling/onebrc/CalculateAverage_unbounded.java

diff --git a/calculate_average_unbounded.sh b/calculate_average_unbounded.sh
new file mode 100755
index 000000000..ab874052e
--- /dev/null
+++ b/calculate_average_unbounded.sh
@@ -0,0 +1,19 @@
+#!/bin/sh
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+JAVA_OPTS="--enable-preview --add-modules jdk.incubator.vector -XX:-TieredCompilation  -XX:InlineSmallCode=10000 -XX:FreqInlineSize=10000"
+java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_unbounded
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_unbounded.java b/src/main/java/dev/morling/onebrc/CalculateAverage_unbounded.java
new file mode 100644
index 000000000..351dc49a5
--- /dev/null
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_unbounded.java
@@ -0,0 +1,437 @@
+/*
+ *  Copyright 2023 The original authors
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package dev.morling.onebrc;
+
+import jdk.incubator.vector.*;
+
+import java.io.IOException;
+import java.lang.foreign.Arena;
+import java.lang.foreign.MemorySegment;
+import java.nio.ByteOrder;
+import java.nio.channels.FileChannel;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.StandardOpenOption;
+import java.util.*;
+import java.util.concurrent.ArrayBlockingQueue;
+import java.util.function.Consumer;
+import java.util.stream.Collectors;
+import java.util.stream.IntStream;
+
+import static java.lang.foreign.ValueLayout.*;
+import static java.nio.ByteOrder.BIG_ENDIAN;
+
+public class CalculateAverage_unbounded {
+    private static final Path FILE = Path.of("./measurements.txt");
+    private static final int MAX_STATION_NAME_LEN = 100;
+    private static final int MAX_UNIQUE_STATIONS = 10000;
+
+    // this is *really* expensive
+    private static final OfInt BIG_ENDIAN_INT = JAVA_INT_UNALIGNED.withOrder(BIG_ENDIAN);
+    private static final VectorSpecies<Byte> LINE_SCAN_SPECIES = ByteVector.SPECIES_256;
+    private static final int LINE_SCAN_LEN = LINE_SCAN_SPECIES.length();
+    private static final VectorSpecies<Integer> NAME_HASH_SPECIES = IntVector.SPECIES_256;
+    private static final VectorSpecies<Short> HASH_LOOKUP_SPECIES = ShortVector.SPECIES_256;
+    private static final VectorSpecies<Long> ACCUMULATOR_SPECIES = LongVector.SPECIES_256;
+
+    private static final int CHUNK_SIZE = 16 * 1024 * 1024;
+
+    // Arbitrarily chosen primes
+    private static final int[] HASH_PRIMES = { 661, 1663, 2293, 3581, 5449, 5953, 6311, 6841, 7573, 7669, 7703, 7789, 7901, 8887, 8581, 8831 };
+    private static final byte[] PREFIX_MASK = { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+            -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, };
+    private static final int[] DIGIT_MULTIPLIERS = { // weights for the 4 bytes preceding the newline; one row of 4 per number format
+            0, 10, 1, 1, // format 0: 9.9
+            100, 10, 1, 1, // format 1: 99.9
+            0, -10, 1, -1, // format 2: -9.9
+            -100, -10, 1, -1, // format 3: -99.9
+    };
+    private static final int[] DIGIT_MASK = { // applied to the same 4 bytes read as one big-endian int; indexed by number format
+            0x000fff0f, // format 0: 9.9
+            0x0f0fff0f, // format 1: 99.9
+            0x000fff0f, // format 2: -9.9
+            0x0f0fff0f, // format 3: -99.9
+    };
+    private static final int[] DIGIT_FLIPS = { 0, 0, -1, -1 }; // XORed onto the masked int; -1 inverts every bit for the two negative formats
+
+    record Segment(long start, int len) { // one unit of work for a ProcessorThread: start offset in the file and length in bytes
+    }
+
+    static class StationStat {
+        int min; // smallest value seen, in tenths of a degree
+        int max; // largest value seen, in tenths of a degree
+        long totalTemp; // sum of all values, in tenths of a degree
+        long count; // number of values aggregated
+
+        StationStat(long count, long totalTemp, int min, int max) {
+            this.min = min;
+            this.max = max;
+            this.totalTemp = totalTemp;
+            this.count = count;
+        }
+
+        StationStat merge(StationStat other) { // folds other into this instance and returns this
+            min = Math.min(min, other.min);
+            max = Math.max(max, other.max);
+            totalTemp += other.totalTemp;
+            count += other.count;
+            return this;
+        }
+
+        @Override
+        public String toString() {
+            return STR."\{min/10.0}/\{Math.round(1.0 * totalTemp / count)/10.0}/\{max/10.0}";
+        }
+    }
+
+    // Splits the file into chunks for one worker thread per available processor, parses the final bytes on the calling thread, then merges and prints
+    public static void main(String[] args) throws IOException, InterruptedException {
+        long fileSize = Files.size(FILE);
+        int lastChunkSize = (int) Math.min(200, fileSize); // tail of the file that is kept away from the vectorized workers
+
+        var segments = new ArrayBlockingQueue<Segment>((int) (fileSize / CHUNK_SIZE + 10));
+        for (long i = 0; i < fileSize - lastChunkSize; i += CHUNK_SIZE) {
+            segments.put(new Segment(i, (int) Math.min(CHUNK_SIZE, fileSize - i - lastChunkSize)));
+        }
+
+        int numThreads = Runtime.getRuntime().availableProcessors();
+        var results = new ArrayBlockingQueue<Map<String, StationStat>>(numThreads);
+        var toMerge = new ArrayList<Map<String, StationStat>>(numThreads + 1);
+        try (var ch = FileChannel.open(FILE, StandardOpenOption.READ); var arena = Arena.ofConfined()) {
+            var threads = IntStream.range(0, numThreads).mapToObj((ignored) -> new ProcessorThread(segments, ch, results::add)).toList();
+            threads.forEach(Thread::start);
+
+            // The workers map 64 bytes beyond each segment, so the end of the file is parsed here with the byte-wise parser instead
+            int margin = lastChunkSize < fileSize ? 1 : 0; // one extra leading byte so the first record boundary can be detected
+            var mem = ch.map(FileChannel.MapMode.READ_ONLY, fileSize - lastChunkSize - margin, lastChunkSize + margin, arena);
+            slowProcessChunk(mem, margin, lastChunkSize, toMerge::add);
+
+            for (var thread : threads) {
+                thread.join();
+            }
+        }
+
+        results.drainTo(toMerge);
+        var merged = toMerge.stream().reduce((a, b) -> {
+            b.forEach((k, v) -> a.merge(k, v, StationStat::merge));
+            return a;
+        }).get();
+        printResult(merged);
+    }
+
+    // Byte-at-a-time scalar parser for the tail of the file: it never reads beyond the newline of the record it is parsing
+    private static void slowProcessChunk(MemorySegment mem, int startPos, int endPos, Consumer<Map<String, StationStat>> report) {
+        byte[] stationBytes = new byte[MAX_STATION_NAME_LEN];
+        int pos = scanForStartPos(mem, startPos);
+        while (pos < endPos) {
+            // Station name: every byte up to the ';' separator
+            int stationLen = 0;
+            for (byte b = mem.get(JAVA_BYTE, pos); b != ';'; b = mem.get(JAVA_BYTE, ++pos)) {
+                stationBytes[stationLen++] = b;
+            }
+            String station = new String(stationBytes, 0, stationLen);
+            // Temperature: sign and digits up to the newline; the '.' is dropped so the value is parsed in tenths
+            StringBuilder digits = new StringBuilder(5);
+            for (byte b = mem.get(JAVA_BYTE, ++pos); b != '\n'; b = mem.get(JAVA_BYTE, ++pos)) {
+                if (b != '.') {
+                    digits.append((char) b);
+                }
+            }
+            pos++;
+            int tenths = Integer.parseInt(digits.toString());
+            // Every record is reported as its own single-entry map
+            Map<String, StationStat> single = new HashMap<>(1);
+            single.put(station, new StationStat(1, tenths, tenths, tenths));
+            report.accept(single);
+        }
+    }
+
+    static class ProcessorThread extends Thread {
+
+        static final int NUM_BUCKETS = 1024;
+        static final int BUCKET_MASK = 0x3ff; // NUM_BUCKETS - 1
+        static final int BUCKET_SIZE = 16; // slots per bucket
+
+        // n-way hash table state
+        // per bucket: 16 hash slots (0 = empty), then 16 name pointers (offset into nameTable divided by 32)
+        private final short[] hashTable = new short[2 * BUCKET_SIZE * NUM_BUCKETS];
+        // storage of station name keys for hash collision check
+        private final byte[] nameTable = new byte[MAX_UNIQUE_STATIONS * (MAX_STATION_NAME_LEN + 1)];
+        // station index for every hash slot (the values of the hash table)
+        private final short[] stationIndexes = new short[BUCKET_SIZE * NUM_BUCKETS];
+        private final int[] nextNamePos = { 0 }; // next free offset in nameTable; single-element array so static helpers can advance it
+        private final int[] nextStationIndex = { 0 }; // next unused station index; single-element array for the same reason
+
+        // Four running sums per station: lanes 0, 1 and 3 hold weighted digit sums (added up to the total when decoding), lane 2 gets -2 per record
+        private final long[] accumulators = new long[4 * MAX_UNIQUE_STATIONS];
+        // min and max per station, kept in the encoded form written by processChunk
+        private final int[] minMax = new int[2 * MAX_UNIQUE_STATIONS];
+
+        private final Queue<Segment> segments;
+        private final FileChannel channel;
+        private final Consumer<Map<String, StationStat>> report;
+
+        ProcessorThread(Queue<Segment> segments, FileChannel channel, Consumer<Map<String, StationStat>> report) {
+            this.report = report;
+            this.channel = channel;
+            this.segments = segments;
+            for (int station = 0; station < minMax.length / 2; station++) { // seed each (min, max) pair so the first value replaces both
+                minMax[2 * station] = Integer.MAX_VALUE;
+                minMax[2 * station + 1] = Integer.MIN_VALUE;
+            }
+        }
+
+        @Override
+        public void run() {
+            try {
+                // Drain the shared queue; poll() returns null once no segments are left
+                for (Segment next = segments.poll(); next != null; next = segments.poll()) {
+                    // One leading byte (except at file start) to find the record boundary; 64 trailing bytes so vector loads stay inside the mapping
+                    int lead = next.start == 0 ? 0 : 1;
+                    int trail = 64;
+                    long mapOffset = next.start - lead;
+                    long mapSize = next.len + trail + lead;
+                    try (var arena = Arena.ofConfined()) {
+                        var mem = channel.map(FileChannel.MapMode.READ_ONLY, mapOffset, mapSize, arena);
+                        processChunk(mem, lead, next.len + lead, hashTable, nameTable, stationIndexes, minMax, accumulators, nextNamePos, nextStationIndex);
+                    }
+                }
+                report.accept(decodeResult(hashTable, nameTable, stationIndexes, accumulators, minMax));
+            } catch (IOException e) {
+                System.err.println(STR."I/O Exception: \{e}");
+                throw new RuntimeException(e);
+            }
+        }
+
+        private static void processChunk(MemorySegment mem, int startPos, int endPos, short[] hashTable, byte[] nameTable, short[] stationIndexes, int[] minMax,
+                                         long[] accumulators, int[] nextNamePos, int[] nextStationIndex) { // parses every record starting before endPos into the tables
+            int index = scanForStartPos(mem, startPos); // offset of the first record to parse
+            var primeVec = IntVector.fromArray(NAME_HASH_SPECIES, HASH_PRIMES, 0);
+            while (index < endPos) { // one record per iteration; index is the offset of its first byte
+                var lineVec = ByteVector.fromMemorySegment(LINE_SCAN_SPECIES, mem, index, ByteOrder.LITTLE_ENDIAN);
+                int numPos = lineVec.eq((byte) ';').firstTrue() + 1; // offset of the first number character; LINE_SCAN_LEN + 1 if no ';' is in the vector
+                int nlPos = 0; // offset of the newline that ends the record
+                int stationIndex;
+                if (numPos != LINE_SCAN_LEN + 1) {
+                    // Fast path, station name fits in one SIMD register
+                    nlPos = lineVec.eq((byte) '\n').firstTrue();
+                    if (nlPos == LINE_SCAN_LEN) { // newline is not inside the vector: continue byte by byte
+                        while (mem.get(JAVA_BYTE, index + nlPos) != '\n') {
+                            nlPos++;
+                        }
+                    }
+                    var nameVec = lineVec.and(ByteVector.fromArray(LINE_SCAN_SPECIES, PREFIX_MASK, 33 - numPos)); // zero everything from the ';' onwards
+                    int nameHash = nameVec.reinterpretAsInts().mul(primeVec).reduceLanes(VectorOperators.ADD); // weighted sum of the int lanes
+
+                    stationIndex = fastLookupHash(nameHash, nameVec, hashTable, nameTable, stationIndexes, nextNamePos, nextStationIndex);
+                }
+                else {
+                    // Slow path, station name larger than SIMD register
+                    while (mem.get(JAVA_BYTE, index + numPos - 1) != ';')
+                        numPos++;
+                    while (mem.get(JAVA_BYTE, index + nlPos) != '\n')
+                        nlPos++;
+
+                    int nameHash = lineVec.reinterpretAsInts().mul(primeVec).reduceLanes(VectorOperators.ADD); // hash of the first LINE_SCAN_LEN bytes
+                    for (int i = LINE_SCAN_LEN; i < numPos - 1; i++) { // fold in the remaining name bytes one at a time
+                        nameHash = nameHash * 33 + mem.get(JAVA_BYTE, index + i);
+                    }
+                    stationIndex = lookupHash(nameHash, mem.asSlice(index, numPos - 1), hashTable, nameTable, stationIndexes, nextNamePos, nextStationIndex);
+                }
+                boolean isNegative = mem.get(JAVA_BYTE, index + numPos) == '-';
+                // format; 0: 9.9, 1: 99.9, 2: -9.9, 3: -99.9
+                int numFormat = nlPos - numPos - 3 + (isNegative ? 1 : 0); // length of the number text minus 3, plus 1 when negative
+
+                // accumulate sums for mean
+                var numPartsVec = ByteVector.fromMemorySegment(ByteVector.SPECIES_128, mem, index + nlPos - 4, ByteOrder.LITTLE_ENDIAN)
+                        .sub((byte) '0')
+                        .convert(VectorOperators.B2I, 0); // the 4 bytes before the newline as ints relative to '0' ('.' becomes -2)
+                var multiplyVec = IntVector.fromArray(IntVector.SPECIES_128, DIGIT_MULTIPLIERS, 4 * numFormat);
+                var toAdd = numPartsVec.mul(multiplyVec).castShape(ACCUMULATOR_SPECIES, 0);
+                var acc = LongVector.fromArray(ACCUMULATOR_SPECIES, accumulators, 4 * stationIndex);
+                acc.add(toAdd).intoArray(accumulators, 4 * stationIndex);
+
+                // record min/max
+                // encode ASCII value to sortable format without parsing
+                int encoded = (mem.get(BIG_ENDIAN_INT, index + nlPos - 4) & DIGIT_MASK[numFormat]) ^ DIGIT_FLIPS[numFormat];
+                minMax[2 * stationIndex] = Math.min(minMax[2 * stationIndex], encoded);
+                minMax[2 * stationIndex + 1] = Math.max(minMax[2 * stationIndex + 1], encoded);
+
+                index += nlPos + 1; // continue with the byte after the newline
+            }
+        }
+
+        // Find or insert a station whose zero-padded name fits in one vector; returns the station index
+        private static int fastLookupHash(int nameHash, ByteVector nameVec, short[] hashTable, byte[] nameTable, short[] stationIndexes, int[] nextNamePos,
+                                          int[] nextStationIndex) {
+            int bucketIdx = nameHash & BUCKET_MASK; // low bits select the starting bucket
+            short shortHash = (short) (0x8000 | (nameHash >> 16)); // high bits stored in the slot; the forced top bit keeps it different from 0 (= empty)
+
+            // Look up the station name to find the index
+            while (true) {
+                var bucketVec = ShortVector.fromArray(HASH_LOOKUP_SPECIES, hashTable, 2 * BUCKET_SIZE * bucketIdx); // the 16 hash slots of this bucket
+                var bucketPos = bucketVec.eq(shortHash).firstTrue();
+                if (bucketPos != HASH_LOOKUP_SPECIES.length()) {
+                    int slotNamePos = 32 * Short.toUnsignedInt(hashTable[2 * BUCKET_SIZE * bucketIdx + BUCKET_SIZE + bucketPos]);
+                    var slotNameVec = ByteVector.fromArray(LINE_SCAN_SPECIES, nameTable, slotNamePos);
+                    if (nameVec.eq(slotNameVec).allTrue()) {
+                        // Hit
+                        return stationIndexes[BUCKET_SIZE * bucketIdx + bucketPos];
+                    }
+                    else {
+                        // First slot with this hash holds another name: check the remaining slots of the bucket
+                        bucketPos = handleHashCollision(shortHash, bucketIdx, MemorySegment.ofArray(nameVec.toArray()), hashTable, nameTable);
+                        if (bucketPos != -1) {
+                            return stationIndexes[BUCKET_SIZE * bucketIdx + bucketPos];
+                        }
+                    }
+                }
+                var emptyPos = bucketVec.eq((short) 0).firstTrue();
+                if (emptyPos != HASH_LOOKUP_SPECIES.length()) {
+                    // Miss, insert
+                    int stationIndex = nextStationIndex[0]++;
+                    nameVec.intoArray(nameTable, nextNamePos[0]); // stores the whole zero-padded vector
+                    hashTable[2 * BUCKET_SIZE * bucketIdx + emptyPos] = shortHash;
+                    hashTable[2 * BUCKET_SIZE * bucketIdx + BUCKET_SIZE + emptyPos] = (short) (nextNamePos[0] / 32);
+                    stationIndexes[BUCKET_SIZE * bucketIdx + emptyPos] = (short) stationIndex;
+                    nextNamePos[0] += nameVec.length();
+                    return stationIndex;
+                }
+                // Bucket is full and holds no match: try next bucket (wraps around)
+                bucketIdx = (bucketIdx + 1) & BUCKET_MASK;
+            }
+        }
+
+        // Find or insert a station whose name does not fit in one vector; nameSeg holds exactly the name bytes; returns the station index
+        private static int lookupHash(int nameHash, MemorySegment nameSeg, short[] hashTable, byte[] nameTable, short[] stationIndexes, int[] nextNamePos,
+                                      int[] nextStationIndex) {
+            int bucketIdx = nameHash & BUCKET_MASK; // low bits select the starting bucket
+            short shortHash = (short) (0x8000 | (nameHash >> 16)); // high bits stored in the slot; the forced top bit keeps it different from 0 (= empty)
+
+            // Look up the station name to find the index
+            while (true) {
+                var bucketVec = ShortVector.fromArray(HASH_LOOKUP_SPECIES, hashTable, 2 * BUCKET_SIZE * bucketIdx); // the 16 hash slots of this bucket
+                var bucketPos = bucketVec.eq(shortHash).firstTrue();
+                if (bucketPos != HASH_LOOKUP_SPECIES.length()) {
+                    int slotNamePos = 32 * Short.toUnsignedInt(hashTable[2 * BUCKET_SIZE * bucketIdx + BUCKET_SIZE + bucketPos]);
+                    boolean match = true;
+                    for (int i = 0; i < nameSeg.byteSize(); i++) { // byte-wise comparison with the stored name
+                        if (nameSeg.get(JAVA_BYTE, i) != nameTable[slotNamePos + i]) {
+                            match = false;
+                        }
+                    }
+                    match = match && nameTable[slotNamePos + (int) nameSeg.byteSize()] == '\0'; // stored name must end exactly here
+                    if (match) {
+                        // Hit
+                        return stationIndexes[BUCKET_SIZE * bucketIdx + bucketPos];
+                    }
+                    else {
+                        // First slot with this hash holds another name: check the remaining slots of the bucket
+                        bucketPos = handleHashCollision(shortHash, bucketIdx, nameSeg, hashTable, nameTable);
+                        if (bucketPos != -1) {
+                            return stationIndexes[BUCKET_SIZE * bucketIdx + bucketPos];
+                        }
+                    }
+                }
+                var emptyPos = bucketVec.eq((short) 0).firstTrue();
+                if (emptyPos != HASH_LOOKUP_SPECIES.length()) {
+                    // Miss, insert
+                    int stationIndex = nextStationIndex[0]++;
+                    hashTable[2 * BUCKET_SIZE * bucketIdx + emptyPos] = shortHash;
+                    hashTable[2 * BUCKET_SIZE * bucketIdx + BUCKET_SIZE + emptyPos] = (short) (nextNamePos[0] / 32);
+                    stationIndexes[BUCKET_SIZE * bucketIdx + emptyPos] = (short) stationIndex;
+                    for (int i = 0; i < nameSeg.byteSize(); i++) {
+                        nameTable[nextNamePos[0]++] = nameSeg.get(JAVA_BYTE, i);
+                    }
+                    nameTable[nextNamePos[0]++] = '\0'; // terminator used by the match checks
+                    while (nextNamePos[0] % 32 != 0) // keep every name start on a multiple of 32 so it can be stored as offset / 32
+                        nextNamePos[0]++;
+                    return stationIndex;
+                }
+                // Bucket is full and holds no match: try next bucket (wraps around)
+                bucketIdx = (bucketIdx + 1) & BUCKET_MASK;
+            }
+        }
+
+        private static int handleHashCollision(short shortHash, int bucketIdx, MemorySegment nameSeg, short[] hashTable, byte[] nameTable) {
+            int nameLen = 0; // fast-path callers pass the name zero-padded to 32 bytes, so the real name ends at the first NUL (or at byteSize)
+            while (nameLen < nameSeg.byteSize() && nameSeg.get(JAVA_BYTE, nameLen) != '\0') nameLen++;
+            for (int i = 0; i < BUCKET_SIZE; i++) { // returns the slot whose hash and NUL-terminated stored name both match, or -1
+                int namePos = 32 * Short.toUnsignedInt(hashTable[2 * BUCKET_SIZE * bucketIdx + BUCKET_SIZE + i]);
+                if (hashTable[2 * BUCKET_SIZE * bucketIdx + i] == shortHash && nameTable[namePos + nameLen] == '\0'
+                        && Arrays.equals(nameSeg.toArray(JAVA_BYTE), 0, nameLen, nameTable, namePos, namePos + nameLen)) {
+                    return i;
+                }
+            }
+            return -1;
+        }
+    }
+
+    // Return the first position at or after startPos that directly follows a newline; position 0 always counts as a record start
+    private static int scanForStartPos(MemorySegment mem, int startPos) {
+        int pos = startPos;
+        if (pos != 0) {
+            while (mem.get(JAVA_BYTE, pos - 1) != '\n') {
+                pos++;
+            }
+        }
+        return pos;
+    }
+
+    // Walk every occupied hash slot and turn the raw per-station accumulators into a name -> StationStat map
+    private static Map<String, StationStat> decodeResult(short[] hashTable, byte[] nameTable, short[] stationIndexes, long[] accumulators, int[] minMax) {
+        Map<String, StationStat> stats = new HashMap<>(MAX_UNIQUE_STATIONS);
+        // Each bucket occupies 32 shorts: 16 hash slots followed by the 16 matching name pointers
+        for (int bucketStart = 0; bucketStart < hashTable.length; bucketStart += 32) {
+            for (int slot = 0; slot < 16; slot++) {
+                if (hashTable[bucketStart + slot] == 0) {
+                    continue; // empty slot
+                }
+                int nameStart = 32 * Short.toUnsignedInt(hashTable[bucketStart + slot + 16]);
+                int nameEnd = nameStart + 1;
+                while (nameTable[nameEnd] != '\0') {
+                    nameEnd++;
+                }
+                int station = stationIndexes[bucketStart / 2 + slot];
+                // Lane 2 received -2 for the '.' of every record, so it doubles as the record count
+                long count = accumulators[4 * station + 2] / -2;
+                long total = accumulators[4 * station] + accumulators[4 * station + 1] + accumulators[4 * station + 3];
+                int min = decodeInteger(minMax[2 * station]);
+                int max = decodeInteger(minMax[2 * station + 1]);
+                stats.put(new String(nameTable, nameStart, nameEnd - nameStart), new StationStat(count, total, min, max));
+            }
+        }
+        return stats;
+    }
+
+    private static int decodeInteger(int encoded) {
+        int sign = encoded >> 31; // 0 for a non-negative value, -1 when the encoding was bit-flipped for a negative one
+        int digits = (encoded ^ sign) & 0x7fffffff; // undo the flip: bits 0-7 hold the tenths digit, 16-23 the units digit, 24-31 the tens digit
+        int magnitude = ((digits >>> 24) & 0xff) * 100 + ((digits >>> 16) & 0xff) * 10 + (digits & 0xff);
+        return sign == 0 ? magnitude : -magnitude;
+    }
+
+    // Print all stations in natural String order as {name=min/mean/max, ...} on a single line
+    private static void printResult(Map<String, StationStat> stats) {
+        System.out.print("{");
+        // The StationStat embedded in the template renders itself through its toString()
+        String entries = stats.keySet()
+                .stream()
+                .sorted()
+                .map(station -> STR."\{station}=\{stats.get(station)}")
+                .collect(Collectors.joining(", "));
+        System.out.print(entries);
+        System.out.println("}");
+    }
+}

From 5fb121806461364e7c16b14c1d8f282e532e1f4a Mon Sep 17 00:00:00 2001
From: Gunnar Morling <gunnar.morling@googlemail.com>
Date: Sun, 14 Jan 2024 19:12:29 +0100
Subject: [PATCH 010/268] Leaderboard update

---
 README.md | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 083400dee..9cfc930ed 100644
--- a/README.md
+++ b/README.md
@@ -48,12 +48,15 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:03.321 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java)| 21.0.1-graal | [Artsiom Korzun](https://github.com/artsiomkorzun) |  |
 |   | 00:03.539 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_mtopolnik.java)| 21.0.1-graal | [Marko Topolnik](https://github.com/mtopolnik) |  |
 |   | 00:03.714 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_hundredwatt.java)| 21.0.1-graal | [Jason Nochlin](https://github.com/hundredwatt) |  |
+|   | 00:04.066 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_JesseVanRooy.java)| 21.0.1-open | [JesseVanRooy](https://github.com/JesseVanRooy) |  |
 |   | 00:04.362 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java)| 21.0.1-open | [Van Phu DO](https://github.com/abeobk) |  |
 |   | 00:04.726 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ebarlas.java)| 21.0.1-graal | [Elliot Barlas](https://github.com/ebarlas) |  | 
 |   | 00:04.741 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_cliffclick.java)| 21.0.1-open | [Cliff Click](https://github.com/cliffclick) |  |
 |   | 00:04.823 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_JamalMulla.java)| 21.0.1-graal | [Jamal Mulla](https://github.com/JamalMulla) |  |
 |   | 00:04.959 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_yavuztas.java)| 21.0.1-graal | [Yavuz Tas](https://github.com/yavuztas) |  |
 |   | 00:05.218 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_iziamos.java)| 21.0.1-open | [John Ziamos](https://github.com/iziamos) |  |
+|   | 00:05.235 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_unbounded.java)| 21.0.1-open | [unbounded](https://github.com/unbounded) |  |
+|   | 00:05.339 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_arjenw.java)| 21.0.1-open | [Arjen Wisse](https://github.com/arjenw) |  |
 |   | 00:05.478 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_obourgain.java)| 21.0.1-open | [Olivier Bourgain](https://github.com/obourgain) |  |
 |   | 00:05.530 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_vemana.java)| 21.0.1-graal | [Subrahmanyam](https://github.com/vemana) |  |
 |   | 00:05.887 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_charlibot.java)| 21.0.1-graal | [Charlie Evans](https://github.com/charlibot) |  |
@@ -61,9 +64,11 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:05.979 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_spullara.java)| 21.0.1-graal | [Sam Pullara](https://github.com/spullara) |  |
 |   | 00:06.140 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_zerninv.java)| 21.0.1-open | [zerninv](https://github.com/zerninv) |  |
 |   | 00:06.166 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_isolgpus.java)| 21.0.1-open | [Jamie Stansfield](https://github.com/isolgpus) |  |
+|   | 00:06.257 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_flippingbits.java)| 21.0.1-graal | [Stefan Sprenger](https://github.com/flippingbits) |  |
 |   | 00:06.654 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jbachorik.java)| 21.0.1-graal | [Jaroslav Bachorik](https://github.com/jbachorik) |  |
 |   | 00:06.576 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_as-com.java)| 21.0.1-open | [Andrew Sun](https://github.com/as-com) |  |
 |   | 00:06.715 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_algirdasrascius.java)| 21.0.1-open | [Algirdas Raščius](https://github.com/algirdasrascius) |  |
+|   | 00:06.946 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jincongho.java)| 21.0.1-open | [Jin Cong Ho](https://github.com/jincongho) |  |
 |   | 00:07.730 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jotschi.java)| 21.0.1-open | [Johannes Schüth](https://github.com/jotschi) |  |
 |   | 00:07.809 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_roman-r-m.java)| 21.0.1-graal | [Roman Musin](https://github.com/roman-r-m) |  |
 |   | 00:07.925 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ricardopieper.java)| 21.0.1-graal | [Ricardo Pieper](https://github.com/ricardopieper) |  |
@@ -74,6 +79,7 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:08.489 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_gnabyl.java)| 21.0.1-graal | [Bang NGUYEN](https://github.com/gnabyl) |  |
 |   | 00:08.517 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ags313.java)| 21.0.1-graal | [ags](https://github.com/ags313) |  |
 |   | 00:08.689 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_gamlerhart.java)| 21.0.1-open | [Roman Stoffel](https://github.com/gamlerhart) |  |
+|   | 00:08.752 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_anitasv.java)| 21.0.1-graal | [Anita SV](https://github.com/anitasv) |  |
 |   | 00:08.892 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_fatroom.java)| 21.0.1-open | [Roman Romanchuk](https://github.com/fatroom) |  |
 |   | 00:09.020 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_yemreinci.java)| 21.0.1-open | [yemreinci](https://github.com/yemreinci) |  |
 |   | 00:09.071 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_gabrielreid.java)| 21.0.1-open | [Gabriel Reid](https://github.com/gabrielreid) |  |
@@ -89,7 +95,7 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:11.433 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jatingala.java)| 21.0.1-graal | [Jatin Gala](https://github.com/jatingala) |  |
 |   | 00:11.805 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_coolmineman.java)| 21.0.1-graal | [Cool_Mineman](https://github.com/coolmineman) |  |
 |   | 00:11.934 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_arjenvaneerde.java)| 21.0.1-open | [arjenvaneerde](https://github.com/arjenvaneerde) |  |
-|   | 00:11.987 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_flippingbits.java)| 21.0.1-graal | [Stefan Sprenger](https://github.com/flippingbits) |  |
+|   | 00:12.051 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_dmitry-midokura.java)| 21.0.1-open | [Dmitry Bufistov](https://github.com/dmitry-midokura) |  |
 |   | 00:12.220 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_richardstartin.java)| 21.0.1-open | [Richard Startin](https://github.com/richardstartin) |  |
 |   | 00:12.495 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_SamuelYvon.java)| 21.0.1-graal | [Samuel Yvon](https://github.com/SamuelYvon) | GraalVM native binary |
 |   | 00:12.565 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_japplis.java)| 21.0.1-open | [Anthony Goubard](https://github.com/japplis) |  |
@@ -102,6 +108,7 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:15.662 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_semotpan.java)| 21.0.1-open | [Serghei Motpan](https://github.com/semotpan) |  |
 |   | 00:16.379 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ianopolous.java)| 21.0.1-open | [Dr Ian Preston](https://github.com/ianopolous) |  |
 |   | 00:17.490 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_kgeri.java)| 21.0.1-open | [Gergely Kiss](https://github.com/kgeri) |  |
+|   | 00:17.255 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_tkosachev.java)| 21.0.1-open | [tkosachev](https://github.com/tkosachev) |  |
 |   | 00:17.717 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_omarchenko4j.java)| 21.0.1-open | [Oleh Marchenko](https://github.com/omarchenko4j) |  |
 |   | 00:17.815 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_hallvard.java)| 21.0.1-open | [Hallvard Trætteberg](https://github.com/hallvard) |  |
 |   | 00:18.251 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_seijikun.java)| 21.0.1-graal | [Markus Ebner](https://github.com/seijikun) |  |

From 990f884ff1ba243ed752e7118fa07770870bc426 Mon Sep 17 00:00:00 2001
From: zerninv <zerninvasilii@yandex.ru>
Date: Sun, 14 Jan 2024 19:47:42 +0000
Subject: [PATCH 011/268] change temperature parsing approach (#405)

---
 .../onebrc/CalculateAverage_zerninv.java      | 50 ++++++++++++-------
 1 file changed, 33 insertions(+), 17 deletions(-)

diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_zerninv.java b/src/main/java/dev/morling/onebrc/CalculateAverage_zerninv.java
index cd4e3d74e..789db7398 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_zerninv.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_zerninv.java
@@ -33,11 +33,18 @@
 public class CalculateAverage_zerninv {
     private static final String FILE = "./measurements.txt";
     private static final int MIN_FILE_SIZE = 1024 * 1024 * 16;
-    private static final char DELIMITER = ';';
-    private static final char LINE_SEPARATOR = '\n';
-    private static final char ZERO = '0';
-    private static final char NINE = '9';
-    private static final char MINUS = '-';
+
+    // #.##
+    private static final int THREE_DIGITS_MASK = 0x2e0000;
+    // #.#
+    private static final int TWO_DIGITS_MASK = 0x2e00;
+    // #.#-
+    private static final int TWO_NEGATIVE_DIGITS_MASK = 0x2e002d;
+    private static final int BYTE_MASK = 0xff;
+    private static final int ZERO = '0';
+
+    private static final byte DELIMITER = ';';
+    private static final byte LINE_SEPARATOR = '\n';
 
     private static final Unsafe UNSAFE = initUnsafe();
 
@@ -111,7 +118,7 @@ private static Map<String, MeasurementAggregation> calcForChunk(long offset, lon
         var results = new MeasurementContainer();
 
         long cityOffset;
-        int hashCode, temperature, multiplier;
+        int hashCode, temperature, word;
         byte cityNameSize, b;
 
         while (offset < end) {
@@ -122,18 +129,27 @@ private static Map<String, MeasurementAggregation> calcForChunk(long offset, lon
             }
             cityNameSize = (byte) (offset - cityOffset - 1);
 
-            multiplier = 1;
-            temperature = UNSAFE.getByte(offset++) - ZERO;
-            if (temperature == MINUS - ZERO) {
-                multiplier = -1;
-                temperature = 0;
+            word = UNSAFE.getInt(offset);
+            offset += 4;
+
+            if ((word & TWO_NEGATIVE_DIGITS_MASK) == TWO_NEGATIVE_DIGITS_MASK) {
+                word >>>= 8;
+                temperature = ZERO * 11 - ((word & BYTE_MASK) * 10 + ((word >>> 16) & BYTE_MASK));
             }
-            while ((b = UNSAFE.getByte(offset++)) != LINE_SEPARATOR) {
-                if (b >= ZERO && b <= NINE) {
-                    temperature = temperature * 10 + (b - ZERO);
-                }
+            else if ((word & THREE_DIGITS_MASK) == THREE_DIGITS_MASK) {
+                temperature = (word & BYTE_MASK) * 100 + ((word >>> 8) & BYTE_MASK) * 10 + ((word >>> 24) & BYTE_MASK) - ZERO * 111;
+            }
+            else if ((word & TWO_DIGITS_MASK) == TWO_DIGITS_MASK) {
+                temperature = (word & BYTE_MASK) * 10 + ((word >>> 16) & BYTE_MASK) - ZERO * 11;
+                offset--;
+            }
+            else {
+                // #.##-
+                word = (word >>> 8) | (UNSAFE.getByte(offset++) << 24);
+                temperature = ZERO * 111 - ((word & BYTE_MASK) * 100 + ((word >>> 8) & BYTE_MASK) * 10 + ((word >>> 24) & BYTE_MASK));
             }
-            results.put(cityOffset, cityNameSize, hashCode, (short) (temperature * multiplier));
+            offset++;
+            results.put(cityOffset, cityNameSize, hashCode, (short) temperature);
         }
         return results.toStringMap();
     }
@@ -255,4 +271,4 @@ private String createString(long address, byte size) {
             return new String(arr);
         }
     }
-}
+}
\ No newline at end of file

From bb5679f46318279e10d68a69b3da8e2c2b1274fd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bruno=20F=C3=A9lix?= <felix19350@gmail.com>
Date: Sun, 14 Jan 2024 20:56:11 +0100
Subject: [PATCH 012/268] Further improved performance by improving the parsing
 logic so that strings for city names are not allocated with each row. (#323)

Co-authored-by: Bruno Felix <bruno.felix@klarna.com>
---
 calculate_average_felix19350.sh               |  12 +-
 .../onebrc/CalculateAverage_felix19350.java   | 230 +++++++++++-------
 2 files changed, 154 insertions(+), 88 deletions(-)

diff --git a/calculate_average_felix19350.sh b/calculate_average_felix19350.sh
index 4007d7564..e84f8371c 100755
--- a/calculate_average_felix19350.sh
+++ b/calculate_average_felix19350.sh
@@ -15,6 +15,16 @@
 #  limitations under the License.
 #
 
+# ParallelGC test - Time (measured by evaluate2.sh): 00:33.130
+# JAVA_OPTS="--enable-preview -XX:+UseParallelGC -XX:+UseTransparentHugePages"
+
+# G1GC test - Time (measured by evaluate2.sh):  00:26.447
+# JAVA_OPTS="--enable-preview -XX:+UseG1GC -XX:+UseTransparentHugePages"
+
+# ZGC test - Time (measured by evaluate2.sh): 00:22.813
+JAVA_OPTS="--enable-preview -XX:+UseZGC -XX:+UseTransparentHugePages"
+
+# EpsilonGC test - for now doesn't work because heap space gets exhausted
+#JAVA_OPTS="--enable-preview -XX:+UnlockExperimentalVMOptions -XX:+UseEpsilonGC -XX:+AlwaysPreTouch"
 
-JAVA_OPTS="--enable-preview -XX:+UseParallelGC -Xms4g -Xmx4g"
 java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_felix19350
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_felix19350.java b/src/main/java/dev/morling/onebrc/CalculateAverage_felix19350.java
index c54976d58..8c047b72a 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_felix19350.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_felix19350.java
@@ -16,17 +16,16 @@
 package dev.morling.onebrc;
 
 import java.io.IOException;
-import java.io.RandomAccessFile;
 import java.lang.foreign.Arena;
-import java.lang.foreign.MemorySegment;
 import java.lang.foreign.ValueLayout;
 import java.nio.ByteBuffer;
 import java.nio.channels.FileChannel;
 import java.nio.charset.StandardCharsets;
+import java.nio.file.Paths;
 import java.util.ArrayList;
+import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
-import java.util.TreeMap;
 import java.util.concurrent.CompletableFuture;
 import java.util.concurrent.Executors;
 import java.util.stream.Collectors;
@@ -36,6 +35,55 @@ public class CalculateAverage_felix19350 {
     private static final String FILE = "./measurements.txt";
     private static final int NEW_LINE_SEEK_BUFFER_LEN = 128;
 
+    private static final int EXPECTED_MAX_NUM_CITIES = 15_000; // 10K cities + a buffer so as not to trigger the load factor
+
+    private static class CityRef {
+
+        final int length;
+        final int fingerprint;
+        final byte[] stringBytes;
+
+        public CityRef(ByteBuffer byteBuffer, int startIdx, int length, int fingerprint) {
+            this.length = length;
+            this.stringBytes = new byte[length];
+            byteBuffer.get(startIdx, this.stringBytes, 0, this.stringBytes.length);
+            this.fingerprint = fingerprint;
+        }
+
+        public String cityName() {
+            return new String(stringBytes, StandardCharsets.UTF_8);
+        }
+
+        @Override
+        public int hashCode() {
+            return fingerprint;
+        }
+
+        @Override
+        public boolean equals(Object other) {
+            if (other instanceof CityRef otherRef) {
+                if (fingerprint != otherRef.fingerprint) {
+                    return false;
+                }
+
+                if (this.length != otherRef.length) {
+                    return false;
+                }
+
+                for (var i = 0; i < this.length; i++) {
+                    if (this.stringBytes[i] != otherRef.stringBytes[i]) {
+                        return false;
+                    }
+                }
+                return true;
+            }
+            else {
+                return false;
+            }
+        }
+
+    }
+
     private static class ResultRow {
 
         private int min;
@@ -73,95 +121,104 @@ public void mergeResult(ResultRow value) {
         }
     }
 
-  private record AverageAggregatorTask(MemorySegment memSegment) {
+  private record AverageAggregatorTask(ByteBuffer byteBuffer) {
+    private static final int HASH_FACTOR = 31; // Mersenne prime
 
-    public static Stream<AverageAggregatorTask> createStreamOf(List<MemorySegment> memorySegments) {
-      return memorySegments.stream().map(AverageAggregatorTask::new);
+
+    public static Stream<AverageAggregatorTask> createStreamOf(List<ByteBuffer> byteBuffers) {
+      return byteBuffers.stream().map(AverageAggregatorTask::new);
     }
 
-    public Map<String, ResultRow> processChunk() {
-      final var result = new TreeMap<String, ResultRow>();
-      var offset = 0L;
-      var lineStart = 0L;
-      while (offset < memSegment.byteSize()) {
-        byte nextByte = memSegment.get(ValueLayout.OfByte.JAVA_BYTE, offset);
-        if ((char) nextByte == '\n') {
-          this.processLine(result, memSegment.asSlice(lineStart, (offset - lineStart)).asByteBuffer());
-          lineStart = offset + ValueLayout.JAVA_BYTE.byteSize();
-        }
-        offset += ValueLayout.OfByte.JAVA_BYTE.byteSize();
+    public Map<CityRef, ResultRow> processChunk() {
+      final var measurements = new HashMap<CityRef, ResultRow>(EXPECTED_MAX_NUM_CITIES);
+      var lineStart = 0;
+      // process line by line playing with the fact that a line is no longer than 106 bytes
+      // 100 bytes for city name + 1 byte for separator + 1 byte for negative sign + 4 bytes for number
+      while (lineStart < byteBuffer.limit()) {
+        lineStart = this.processLine(measurements, byteBuffer, lineStart);
       }
-
-      return result;
+      return measurements;
     }
 
-    private void processLine(Map<String, ResultRow> result, ByteBuffer lineBytes) {
+    private int processLine(Map<CityRef, ResultRow> measurements, ByteBuffer byteBuffer, int start) {
+      var fingerPrint = 0;
       var separatorIdx = -1;
-      for (int i = 0; i < lineBytes.limit(); i++) {
-        if ((char) lineBytes.get() == ';') {
-          separatorIdx = i;
-          lineBytes.clear();
-          break;
+      var sign = 1;
+      var value = 0;
+      var lineEnd = -1;
+      // Lines are processed in two stages:
+      // 1 - prior to the city name separator
+      // 2 - after the separator
+      // this ensures less if clauses
+
+      // stage 1 loop
+      {
+        for (int i = 0; i < NEW_LINE_SEEK_BUFFER_LEN; i++) {
+          final var currentByte = byteBuffer.get(start + i);
+          if (currentByte == ';') {
+            separatorIdx = i;
+            break;
+          } else {
+            fingerPrint = HASH_FACTOR * fingerPrint + currentByte;
+          }
         }
       }
-      assert (separatorIdx > 0);
 
-      var valueCapacity = lineBytes.capacity() - (separatorIdx + 1);
-      var cityBytes = new byte[separatorIdx];
-      var valueBytes = new byte[valueCapacity];
-      lineBytes.get(cityBytes, 0, separatorIdx);
-      lineBytes.get(separatorIdx + 1, valueBytes);
+      // stage 2 loop:
+      {
+        for (int i = separatorIdx + 1; i < NEW_LINE_SEEK_BUFFER_LEN; i++) {
+          final var currentByte = byteBuffer.get(start + i);
+          switch (currentByte) {
+            case '-':
+              sign = -1;
+              break;
+            case '.':
+              break;
+            case '\n':
+              lineEnd = start + i + 1;
+              break;
+            default:
+              // only digits are expected here
+              value = value * 10 + (currentByte - '0');
+          }
+
+          if (lineEnd != -1) {
+            break;
+          }
+        }
+      }
 
-      var city = new String(cityBytes, StandardCharsets.UTF_8);
-      var value = parseInt(valueBytes);
+      assert (separatorIdx > 0);
+      final var cityRef = new CityRef(byteBuffer, start, separatorIdx,fingerPrint);
+      value = sign * value;
 
-      var latestValue = result.get(city);
-      if (latestValue != null) {
-        latestValue.mergeValue(value);
+      final var existingMeasurement = measurements.get(cityRef);
+      if (existingMeasurement == null) {
+        measurements.put(cityRef, new ResultRow(value));
       } else {
-        result.put(city, new ResultRow(value));
+        existingMeasurement.mergeValue(value);
       }
-    }
 
-    private static int parseInt(byte[] valueBytes) {
-      int multiplier = 1;
-      int digitValue = 0;
-      var numDigits = valueBytes.length-1; // there is always one decimal place
-      var ds = new int[]{1,10,100};
-
-      for (byte valueByte : valueBytes) {
-        switch ((char) valueByte) {
-          case '-':
-            multiplier = -1;
-            numDigits -= 1;
-            break;
-          case '.':
-            break;
-          default:
-            digitValue += ((int) valueByte - 48) * (ds[numDigits - 1]);
-            numDigits -= 1;
-            break;// TODO continue here
-        }
-      }
-      return multiplier*digitValue;
+      return lineEnd; //to account for the line end
     }
   }
 
     public static void main(String[] args) throws IOException {
         // memory map the files and divide by number of cores
-        var numProcessors = Runtime.getRuntime().availableProcessors();
-        var memorySegments = calculateMemorySegments(numProcessors);
-        var tasks = AverageAggregatorTask.createStreamOf(memorySegments);
-        assert (memorySegments.size() == numProcessors);
+        final var numProcessors = Runtime.getRuntime().availableProcessors();
+        final var byteBuffers = calculateMemorySegments(numProcessors);
+        final var tasks = AverageAggregatorTask.createStreamOf(byteBuffers);
+        assert (byteBuffers.size() <= numProcessors);
+        assert (!byteBuffers.isEmpty());
 
         try (var pool = Executors.newFixedThreadPool(numProcessors)) {
-            var results = tasks
+            final Map<CityRef, ResultRow> aggregatedCities = tasks
                     .parallel()
                     .map(task -> CompletableFuture.supplyAsync(task::processChunk, pool))
                     .map(CompletableFuture::join)
-                    .reduce(new TreeMap<>(), (partialMap, accumulator) -> {
-                        partialMap.forEach((key, value) -> {
-                            var prev = accumulator.get(key);
+                    .reduce(new HashMap<>(EXPECTED_MAX_NUM_CITIES), (currentMap, accumulator) -> {
+                        currentMap.forEach((key, value) -> {
+                            final var prev = accumulator.get(key);
                             if (prev == null) {
                                 accumulator.put(key, value);
                             }
@@ -172,6 +229,9 @@ public static void main(String[] args) throws IOException {
                         return accumulator;
                     });
 
+            var results = new HashMap<String, ResultRow>(EXPECTED_MAX_NUM_CITIES);
+            aggregatedCities.forEach((key, value) -> results.put(key.cityName(), value));
+
             System.out.print("{");
             String output = results.keySet()
                     .stream()
@@ -183,16 +243,16 @@ public static void main(String[] args) throws IOException {
         }
     }
 
-    private static List<MemorySegment> calculateMemorySegments(int numChunks) throws IOException {
-        try (RandomAccessFile raf = new RandomAccessFile(FILE, "r")) {
-            var result = new ArrayList<MemorySegment>(numChunks);
-            var chunks = new ArrayList<long[]>(numChunks);
+    private static List<ByteBuffer> calculateMemorySegments(int numChunks) throws IOException {
+        try (FileChannel fc = FileChannel.open(Paths.get(FILE))) {
+            var memMappedFile = fc.map(FileChannel.MapMode.READ_ONLY, 0L, fc.size(), Arena.ofAuto());
+            var result = new ArrayList<ByteBuffer>(numChunks);
 
-            var fileSize = raf.length();
-            var chunkSize = fileSize / numChunks;
+            var fileSize = fc.size();
+            var chunkSize = fileSize / numChunks; // TODO: if chunksize > MAX INT we will need to adjust
+            var previousChunkEnd = 0L;
 
             for (int i = 0; i < numChunks; i++) {
-                var previousChunkEnd = i == 0 ? 0L : chunks.get(i - 1)[1];
                 if (previousChunkEnd >= fileSize) {
                     // There is a scenario for very small files where the number of chunks may be greater than
                     // the number of lines.
@@ -205,31 +265,27 @@ private static List<MemorySegment> calculateMemorySegments(int numChunks) throws
                 }
                 else {
                     // all other chunks are end at a new line (\n)
-                    var theoreticalEnd = previousChunkEnd + chunkSize;
-                    var buffer = new byte[NEW_LINE_SEEK_BUFFER_LEN];
-                    raf.seek(theoreticalEnd);
-                    raf.read(buffer, 0, NEW_LINE_SEEK_BUFFER_LEN);
-
+                    var theoreticalEnd = Math.min(previousChunkEnd + chunkSize, fileSize);
                     var newLineOffset = 0;
-                    for (byte b : buffer) {
+                    for (int j = 0; j < NEW_LINE_SEEK_BUFFER_LEN; j++) {
+                        var candidateOffset = theoreticalEnd + j;
+                        if (candidateOffset >= fileSize) {
+                            break;
+                        }
+                        byte b = memMappedFile.get(ValueLayout.OfByte.JAVA_BYTE, candidateOffset);
                         newLineOffset += 1;
                         if ((char) b == '\n') {
                             break;
                         }
                     }
                     chunk[1] = Math.min(fileSize, theoreticalEnd + newLineOffset);
+                    previousChunkEnd = chunk[1];
                 }
 
-                assert (chunk[0] >= 0L);
-                assert (chunk[0] <= fileSize);
                 assert (chunk[1] > chunk[0]);
                 assert (chunk[1] <= fileSize);
 
-                var memMappedFile = raf.getChannel()
-                        .map(FileChannel.MapMode.READ_ONLY, chunk[0], (chunk[1] - chunk[0]), Arena.ofAuto());
-                memMappedFile.load();
-                chunks.add(chunk);
-                result.add(memMappedFile);
+                result.add(memMappedFile.asSlice(chunk[0], (chunk[1] - chunk[0])).asByteBuffer());
             }
             return result;
         }

From d262b3d1995fbd60e1865ecfeb94279de8e75c16 Mon Sep 17 00:00:00 2001
From: Gunnar Morling <gunnar.morling@googlemail.com>
Date: Sun, 14 Jan 2024 21:06:37 +0100
Subject: [PATCH 013/268] Leaderboard

---
 README.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 9cfc930ed..0fe49aa5e 100644
--- a/README.md
+++ b/README.md
@@ -59,10 +59,10 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:05.339 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_arjenw.java)| 21.0.1-open | [Arjen Wisse](https://github.com/arjenw) |  |
 |   | 00:05.478 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_obourgain.java)| 21.0.1-open | [Olivier Bourgain](https://github.com/obourgain) |  |
 |   | 00:05.530 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_vemana.java)| 21.0.1-graal | [Subrahmanyam](https://github.com/vemana) |  |
+|   | 00:05.351 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_zerninv.java)| 21.0.1-open | [zerninv](https://github.com/zerninv) |  |
 |   | 00:05.887 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_charlibot.java)| 21.0.1-graal | [Charlie Evans](https://github.com/charlibot) |  |
 |   | 00:05.960 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_vaidhy.java)| 21.0.1-graal | [Vaidhy Mayilrangam](https://github.com/vaidhy) |  |
 |   | 00:05.979 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_spullara.java)| 21.0.1-graal | [Sam Pullara](https://github.com/spullara) |  |
-|   | 00:06.140 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_zerninv.java)| 21.0.1-open | [zerninv](https://github.com/zerninv) |  |
 |   | 00:06.166 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_isolgpus.java)| 21.0.1-open | [Jamie Stansfield](https://github.com/isolgpus) |  |
 |   | 00:06.257 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_flippingbits.java)| 21.0.1-graal | [Stefan Sprenger](https://github.com/flippingbits) |  |
 |   | 00:06.654 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jbachorik.java)| 21.0.1-graal | [Jaroslav Bachorik](https://github.com/jbachorik) |  |
@@ -120,6 +120,7 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:20.691 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_Kidlike.java)| 21.0.1-graal | [Kidlike](https://github.com/Kidlike) | GraalVM native binary |
 |   | 00:21.989 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_couragelee.java)| 21.0.1-open | [couragelee](https://github.com/couragelee) |  |
 |   | 00:22.457 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_rby.java)| 21.0.1-open | [Ramzi Ben Yahya](https://github.com/rby) |  |
+|   | 00:26.500 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_felix19350.java)| 21.0.1-open | [Bruno Félix](https://github.com/felix19350) |  |
 |   | 00:28.381 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_bjhara.java)| 21.0.1-open | [Hampus](https://github.com/bjhara) |  |
 |   | 00:32.018 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_padreati.java)| 21.0.1-open | [Aurelian Tutuianu](https://github.com/padreati) |  |
 |   | 00:34.388 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_twobiers.java)| 21.0.1-tem | [Tobi](https://github.com/twobiers) |  |

From 61f5618ff259dca58ee334ecf3e491688215e339 Mon Sep 17 00:00:00 2001
From: Gunnar Morling <gunnar.morling@googlemail.com>
Date: Sun, 14 Jan 2024 22:02:21 +0100
Subject: [PATCH 014/268] Adding list of external resources

---
 README.md | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/README.md b/README.md
index 0fe49aa5e..d9f882328 100644
--- a/README.md
+++ b/README.md
@@ -372,6 +372,21 @@ A: Probably not :) 1BRC results are reported in wallclock time, thus results of
 _Q: Why_ 1️⃣🐝🏎️ _?_\
 A: It's the abbreviation of the project name: **One** **B**illion **R**ow **C**hallenge.
 
+## 1BRC on the Web
+
+A list of external resources such as blog posts and videos, discussing 1BRC and specific implementations:
+
+* [Cliff Click discussing his 1BRC solution on the Coffee Compiler Club](https://www.youtube.com/watch?v=NJNIbgV6j-Y) (video)
+* [1️⃣🐝🏎️🦆 (1BRC in SQL with DuckDB)](https://rmoff.net/2024/01/03/1%EF%B8%8F%E2%83%A3%EF%B8%8F-1brc-in-sql-with-duckdb/), by Robin Moffatt (blog post)
+* [1 billion rows challenge in PostgreSQL and ClickHouse](https://ftisiot.net/posts/1brows/), by Francesco Tisiot (blog post)
+* [The One Billion Row Challenge with Snowflake](https://medium.com/snowflake/the-one-billion-row-challenge-with-snowflake-f612ae76dbd5), by Sean Falconer (blog post)
+* [One billion row challenge using base R](https://www.r-bloggers.com/2024/01/one-billion-row-challenge-using-base-r/), by  David Schoch (blog post)
+* [1 Billion Row Challenge with Apache Pinot](https://hubertdulay.substack.com/p/1-billion-row-challenge-in-apache), by Hubert Dulay (blog post)
+* [One Billion Row Challenge In C](https://www.dannyvankooten.com/blog/2024/1brc/), by Danny Van Kooten (blog post)
+* [One Billion Row Challenge in Racket](https://defn.io/2024/01/10/one-billion-row-challenge-in-racket/), by Bogdan Popa (blog post)
+* [The One Billion Row Challenge - .NET Edition](https://dev.to/mergeconflict/392-the-one-billion-row-challenge-net-edition), by Frank A. Krueger (podcast)
+* [One Billion Row Challenge](https://curiouscoding.nl/posts/1brc/), by Ragnar Groot Koerkamp (blog post)
+
 ## Sponsorship
 
 A big thank you to my employer [Decodable](https://www.decodable.co/) for funding the evaluation environment and supporting this challenge!

From cd0e20b304d92b2e0774cf20bd6b3c0ccae9b21f Mon Sep 17 00:00:00 2001
From: Eve <139727413+netrunnereve@users.noreply.github.com>
Date: Mon, 15 Jan 2024 17:39:36 +0000
Subject: [PATCH 015/268] multithreaded version! (#415)

---
 .../onebrc/CalculateAverage_netrunnereve.java | 284 ++++++++++++------
 1 file changed, 185 insertions(+), 99 deletions(-)

diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_netrunnereve.java b/src/main/java/dev/morling/onebrc/CalculateAverage_netrunnereve.java
index 7ff3cdd16..e323a32ad 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_netrunnereve.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_netrunnereve.java
@@ -22,10 +22,14 @@
 import java.nio.channels.FileChannel;
 import java.nio.charset.StandardCharsets;
 import java.lang.Math;
+import java.util.Map;
+import java.util.TreeMap;
 
 public class CalculateAverage_netrunnereve {
 
     private static final String FILE = "./measurements.txt";
+    private static final int NUM_THREADS = 8; // test machine
+    private static final int LEN_EXTEND = 200; // guarantees a newline
 
     private static class MeasurementAggregator { // min, max, sum stored as 0.1/unit
         private MeasurementAggregator next = null; // linked list of entries for handling hash colisions
@@ -36,6 +40,12 @@ private static class MeasurementAggregator { // min, max, sum stored as 0.1/unit
         private int count = 0;
     }
 
+    private static class ThreadCalcs {
+        private MeasurementAggregator[] hashSpace = null;
+        private String[] staArr = null;
+        private int numStations = 0;
+    }
+
     // djb2 hash
     private static int calc_hash(byte[] input, int len) {
         int hash = 5831;
@@ -45,23 +55,139 @@ private static int calc_hash(byte[] input, int len) {
         return Math.abs(hash % 16384);
     }
 
-    public static void main(String[] args) {
-        try {
-            RandomAccessFile mraf = new RandomAccessFile(FILE, "r");
-            long fileSize = mraf.getChannel().size();
-            long bufSize = Integer.MAX_VALUE; // Java requirement is <= Integer.MAX_VALUE
-            int numStations = 0;
+    private static class ThreadedParser extends Thread {
+        private MappedByteBuffer mbuf;
+        private int mbs;
+        private ThreadCalcs[] threadOut;
+        private int threadID;
 
+        private ThreadedParser(MappedByteBuffer mbuf, int mbs, ThreadCalcs[] threadOut, int threadID) {
+            this.mbuf = mbuf;
+            this.mbs = mbs;
+            this.threadOut = threadOut;
+            this.threadID = threadID;
+        }
+
+        public void run() {
             MeasurementAggregator[] hashSpace = new MeasurementAggregator[16384]; // 14-bit hash
             byte[] scratch = new byte[100]; // <= 100 characters in station name
             String[] staArr = new String[10000]; // max 10000 station names
             MeasurementAggregator ma = null;
 
+            int numStations = 0;
+            boolean state = false; // 0 for station pickup, 1 for measurement pickup
+            int negMul = 1;
+            int head = 0;
+            int tempCnt = -1; // 0 if 1 digit measurement, 1 if 2 digit
+
+            for (int i = 0; i < mbs; i++) {
+                byte cur = mbuf.get(i);
+                if (state == true) {
+                    if (cur == 46) { // .
+                        int tempa = mbuf.get(i + 1) - 48;
+                        tempa += (scratch[0] - 48) * (10 + 90 * tempCnt) + (scratch[1] - 48) * (10 * tempCnt); // branchless
+                        tempa *= negMul;
+
+                        if (tempa < ma.min) {
+                            ma.min = tempa;
+                        }
+                        if (tempa > ma.max) {
+                            ma.max = tempa;
+                        }
+                        ma.sum += tempa;
+                        ma.count++;
+
+                        i += 2; // go to start of new line
+                        state = false;
+                        negMul = 1;
+                        head = i + 1;
+                        tempCnt = -1;
+                    }
+                    else if (cur == 45) { // ascii -
+                        negMul = -1;
+                    }
+                    else {
+                        scratch[tempCnt + 1] = cur;
+                        tempCnt++;
+                    }
+                }
+                else if (cur == 59) { // ;
+                    int len = i - head;
+
+                    // this is faster than filling scratch immediately after each byte is read
+                    mbuf.position(head);
+                    mbuf.get(scratch, 0, len);
+
+                    int hash = calc_hash(scratch, len);
+                    ma = hashSpace[hash];
+                    MeasurementAggregator prev = null;
+
+                    while (true) {
+                        if (ma == null) {
+                            ma = new MeasurementAggregator();
+                            ma.station = Arrays.copyOfRange(scratch, 0, len);
+                            staArr[numStations] = new String(scratch, 0, len, StandardCharsets.UTF_8);
+
+                            if (prev != null) {
+                                prev.next = ma;
+                            }
+                            else {
+                                hashSpace[hash] = ma;
+                            }
+
+                            numStations++;
+                            break;
+                        }
+                        else if ((len != ma.station.length) || (Arrays.compare(scratch, 0, len, ma.station, 0, len) != 0)) { // hash collision
+                            prev = ma;
+                            ma = ma.next;
+                        }
+                        else { // hit
+                            break;
+                        }
+                    }
+                    state = true;
+                    head = i + 1;
+                }
+            }
+            threadOut[threadID] = new ThreadCalcs();
+            threadOut[threadID].hashSpace = hashSpace;
+            threadOut[threadID].staArr = staArr;
+            threadOut[threadID].numStations = numStations;
+        }
+    }
+
+    public static void main(String[] args) {
+        try {
+            RandomAccessFile mraf = new RandomAccessFile(FILE, "r");
+            long fileSize = mraf.getChannel().size();
+            long threadNum = NUM_THREADS;
+
+            long minThreads = (fileSize / Integer.MAX_VALUE) + 1; // minimum # of threads required due to MappedByteBuffer size limit
+            if (threadNum < minThreads) {
+                threadNum = minThreads;
+            }
+            long bufSize = fileSize / threadNum;
+
+            // don't bother multithreading for small files
+            if (bufSize < 1000000) {
+                threadNum = 1;
+                bufSize = Integer.MAX_VALUE;
+            }
+
+            ThreadedParser[] myThreads = new ThreadedParser[(int) threadNum];
+            ThreadCalcs[] threadOut = new ThreadCalcs[(int) threadNum];
+            int threadID = 0;
+
             long h = 0;
             while (h < fileSize) {
                 long length = bufSize;
                 boolean finished = false;
-                if (h + length > fileSize) {
+
+                if ((h == 0) && (length + LEN_EXTEND < Integer.MAX_VALUE)) { // add a bit of extra bytes to first thread to avoid generating new thread for the remainder
+                    length += LEN_EXTEND; // arbitrary bytes to guarantee a newline somewhere
+                }
+                if (h + length > fileSize) { // past the end
                     length = fileSize - h;
                     finished = true;
                 }
@@ -80,109 +206,69 @@ public static void main(String[] args) {
                     }
                 }
 
-                boolean state = false; // 0 for station pickup, 1 for measurement pickup
-                int negMul = 1;
-                int head = 0;
-                int tempCnt = -1; // 0 if 1 digit measurement, 1 if 2 digit
-
-                for (int i = 0; i < mbs; i++) {
-                    byte cur = mbuf.get(i);
-                    if (state == true) {
-                        if (cur == 46) { // .
-                            int tempa = mbuf.get(i + 1) - 48;
-                            tempa += (scratch[0] - 48) * (10 + 90 * tempCnt) + (scratch[1] - 48) * (10 * tempCnt); // branchless
-                            tempa *= negMul;
-
-                            if (tempa < ma.min) {
-                                ma.min = tempa;
-                            }
-                            if (tempa > ma.max) {
-                                ma.max = tempa;
-                            }
-                            ma.sum += tempa;
-                            ma.count++;
-
-                            i += 2; // go to start of new line
-                            state = false;
-                            negMul = 1;
-                            head = i + 1;
-                            tempCnt = -1;
-                        }
-                        else if (cur == 45) { // ascii -
-                            negMul = -1;
-                        }
-                        else {
-                            scratch[tempCnt + 1] = cur;
-                            tempCnt++;
-                        }
-                    }
-                    else if (cur == 59) { // ;
-                        int len = i - head;
-
-                        // this is faster than filling scratch immediately after each byte is read
-                        mbuf.position(head);
-                        mbuf.get(scratch, 0, len);
-
-                        int hash = calc_hash(scratch, len);
-                        ma = hashSpace[hash];
-                        MeasurementAggregator prev = null;
-
-                        while (true) {
-                            if (ma == null) {
-                                ma = new MeasurementAggregator();
-                                ma.station = Arrays.copyOfRange(scratch, 0, len);
-                                staArr[numStations] = new String(scratch, 0, len, StandardCharsets.UTF_8);
-
-                                if (prev != null) {
-                                    prev.next = ma;
-                                }
-                                else {
-                                    hashSpace[hash] = ma;
-                                }
-
-                                numStations++;
-                                break;
-                            }
-                            else if ((len != ma.station.length) || (Arrays.compare(scratch, 0, len, ma.station, 0, len) != 0)) { // hash collision
-                                prev = ma;
-                                ma = ma.next;
-                            }
-                            else { // hit
-                                break;
-                            }
-                        }
-                        state = true;
-                        head = i + 1;
-                    }
-                }
+                myThreads[threadID] = new ThreadedParser(mbuf, mbs, threadOut, threadID);
+                myThreads[threadID].start();
+
                 h += mbs;
+                threadID++;
             }
 
-            Arrays.sort(staArr, 0, numStations);
+            for (int i = 0; i < threadID; i++) {
+                try {
+                    myThreads[i].join();
+                }
+                catch (InterruptedException ex) {
+                    System.exit(1);
+                }
+            }
 
+            // use treemap to sort and uniquify
+            Map<String, Integer> staMap = new TreeMap<>();
+            for (int i = 0; i < threadID; i++) {
+                for (int j = 0; j < threadOut[i].numStations; j++) {
+                    staMap.put(threadOut[i].staArr[j], 0);
+                }
+            }
+
+            boolean started = false;
             String out = "{";
-            for (int i = 0; i < numStations; i++) {
-                byte[] strBuf = staArr[i].getBytes(StandardCharsets.UTF_8);
+            for (String i : staMap.keySet()) {
+                if (started) {
+                    out += ", ";
+                }
+                else {
+                    started = true;
+                }
+
+                byte[] strBuf = i.getBytes(StandardCharsets.UTF_8);
 
                 int hash = calc_hash(strBuf, strBuf.length);
-                ma = hashSpace[hash];
+                MeasurementAggregator mSum = new MeasurementAggregator();
+                for (int j = 0; j < threadID; j++) {
+                    MeasurementAggregator ma = threadOut[j].hashSpace[hash];
 
-                while (true) {
-                    if ((strBuf.length != ma.station.length) || (Arrays.compare(strBuf, ma.station) != 0)) { // hash collision
-                        ma = ma.next;
-                        continue;
-                    }
-                    else { // hit
-                        double min = Math.round(Double.valueOf(ma.min)) / 10.0;
-                        double avg = Math.round(Double.valueOf(ma.sum) / Double.valueOf(ma.count)) / 10.0;
-                        double max = Math.round(Double.valueOf(ma.max)) / 10.0;
-                        out += staArr[i] + "=" + min + "/" + avg + "/" + max;
-                        if (i != (numStations - 1)) {
-                            out += ", ";
+                    while (true) {
+                        if ((strBuf.length != ma.station.length) || (Arrays.compare(strBuf, ma.station) != 0)) { // hash collision
+                            ma = ma.next;
+                            continue;
+                        }
+                        else { // hit
+                            if (ma.min < mSum.min) {
+                                mSum.min = ma.min;
+                            }
+                            if (ma.max > mSum.max) {
+                                mSum.max = ma.max;
+                            }
+                            mSum.sum += ma.sum;
+                            mSum.count += ma.count;
+                            break;
                         }
-                        break;
                     }
                 }
+                double min = Math.round(Double.valueOf(mSum.min)) / 10.0;
+                double avg = Math.round(Double.valueOf(mSum.sum) / Double.valueOf(mSum.count)) / 10.0;
+                double max = Math.round(Double.valueOf(mSum.max)) / 10.0;
+                out += i + "=" + min + "/" + avg + "/" + max;
             }
             out += "}\n";
             System.out.print(out);

From 6c7012a43e92cc699042f63226be66f0ddbe8c89 Mon Sep 17 00:00:00 2001
From: Pratham <prathamd94@gmail.com>
Date: Mon, 15 Jan 2024 12:47:06 -0500
Subject: [PATCH 016/268] Add improvements (#412)

- custom hashmap
- avoid string creation
- use graal
---
 prepare_phd3.sh                               |  19 ++
 .../morling/onebrc/CalculateAverage_phd3.java | 172 ++++++++++++++----
 2 files changed, 154 insertions(+), 37 deletions(-)
 create mode 100755 prepare_phd3.sh

diff --git a/prepare_phd3.sh b/prepare_phd3.sh
new file mode 100755
index 000000000..f83a3ff69
--- /dev/null
+++ b/prepare_phd3.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+source "$HOME/.sdkman/bin/sdkman-init.sh"
+sdk use java 21.0.1-graal 1>&2
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_phd3.java b/src/main/java/dev/morling/onebrc/CalculateAverage_phd3.java
index e3d1cdbef..97f832b30 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_phd3.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_phd3.java
@@ -15,18 +15,24 @@
  */
 package dev.morling.onebrc;
 
-import static java.nio.charset.StandardCharsets.*;
 import static java.util.stream.Collectors.*;
 
 import java.io.File;
 import java.io.RandomAccessFile;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.HashMap;
+import java.util.List;
 import java.util.Map;
 import java.util.TreeMap;
-import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.Callable;
+import java.util.concurrent.ExecutionException;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
 import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
 import java.util.stream.IntStream;
 
 public class CalculateAverage_phd3 {
@@ -34,12 +40,16 @@ public class CalculateAverage_phd3 {
     private static final int NUM_THREADS = Runtime.getRuntime().availableProcessors() * 2;
     private static final String FILE = "./measurements.txt";
     private static final long FILE_SIZE = new File(FILE).length();
+    // A chunk is a unit for processing, the file will be divided in chunks of the following size
     private static final int CHUNK_SIZE = 65536 * 1024;
+    // Read a little more data into the buffer to finish processing current line
     private static final int PADDING = 512;
+    // Minor : Precompute powers to avoid recalculating while parsing doubles (temperatures)
     private static final double[] POWERS_OF_10 = IntStream.range(0, 6).mapToDouble(x -> Math.pow(10.0, x)).toArray();
 
-    private static final Map<String, AggregationInfo> globalMap = new ConcurrentHashMap<>();
-
+    /**
+     * A Utility to print aggregated information in the desired format
+     */
     private record ResultRow(double min, double mean, double max) {
 
         public String toString() {
@@ -52,7 +62,7 @@ private double round(double value) {
     };
 
     public static ResultRow resultRow(AggregationInfo aggregationInfo) {
-        return new ResultRow(aggregationInfo.min, aggregationInfo.sum / aggregationInfo.count, aggregationInfo.max);
+        return new ResultRow(aggregationInfo.min, (Math.round(aggregationInfo.sum * 10.0) / 10.0) / (aggregationInfo.count), aggregationInfo.max);
     }
 
     public static void main(String[] args) throws Exception {
@@ -60,19 +70,37 @@ public static void main(String[] args) throws Exception {
         int numChunks = (int) Math.ceil(fileLength * 1.0 / CHUNK_SIZE);
         ExecutorService executorService = Executors.newFixedThreadPool(NUM_THREADS);
         BufferDataProvider provider = new RandomAccessBasedProvider(FILE, FILE_SIZE);
+        List<Future<LinearProbingHashMap>> futures = new ArrayList<>();
+        // Process chunks in parallel
         for (int chunkIndex = 0; chunkIndex < numChunks; chunkIndex++) {
-            executorService.submit(new Aggregator(chunkIndex, provider));
+            futures.add(executorService.submit(new Aggregator(chunkIndex, provider)));
         }
 
         executorService.shutdown();
         executorService.awaitTermination(10, TimeUnit.MINUTES);
 
-        Map<String, ResultRow> measurements = new TreeMap<>(globalMap.entrySet().stream()
+        Map<String, AggregationInfo> info = futures.stream().map(f -> {
+            try {
+                return f.get();
+            }
+            catch (ExecutionException | InterruptedException e) {
+                throw new RuntimeException(e);
+            }
+        })
+                .map(LinearProbingHashMap::toMap)
+                .flatMap(map -> map.entrySet().stream())
+                .sequential()
+                .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue, AggregationInfo::update));
+
+        Map<String, ResultRow> measurements = new TreeMap<>(info.entrySet().stream()
                 .collect(toMap(Map.Entry::getKey, e -> resultRow(e.getValue()))));
 
         System.out.println(measurements);
     }
 
+    /**
+     * Stores required running aggregation information to be able to compute min/max/average at the end
+     */
     private static class AggregationInfo {
         double min = Double.POSITIVE_INFINITY;
         double max = Double.NEGATIVE_INFINITY;
@@ -108,13 +136,14 @@ private interface BufferDataProvider {
         int read(byte[] buffer, long offset) throws Exception;
     }
 
+    /**
+     * uses RandomAccessFile seek and read APIs to load data into a buffer.
+     */
     private static class RandomAccessBasedProvider implements BufferDataProvider {
         private final String filePath;
-        private final long fileSize;
 
         RandomAccessBasedProvider(String filePath, long fileSize) {
             this.filePath = filePath;
-            this.fileSize = fileSize;
         }
 
         @Override
@@ -133,7 +162,10 @@ public int read(byte[] buffer, long offset) throws Exception {
         }
     }
 
-    private static class Aggregator implements Runnable {
+    /**
+     * Task that processes a chunk of the file and returns its partial aggregation in a custom LinearProbingHashMap
+     */
+    private static class Aggregator implements Callable<LinearProbingHashMap> {
         private final long startByte;
         private final BufferDataProvider dataProvider;
 
@@ -143,7 +175,7 @@ public Aggregator(long chunkIndex, BufferDataProvider dataProvider) {
         }
 
         @Override
-        public void run() {
+        public LinearProbingHashMap call() {
             try {
                 // offset for the last byte to be processed (excluded)
                 long endByte = Math.min(startByte + CHUNK_SIZE, FILE_SIZE);
@@ -151,25 +183,15 @@ public void run() {
                 long bufferSize = endByte - startByte + ((endByte == FILE_SIZE) ? 0 : PADDING);
                 byte[] buffer = new byte[(int) bufferSize];
                 int bytes = dataProvider.read(buffer, startByte);
-                // Partial aggregation to avoid accessing global concurrent map for every entry
-                Map<String, AggregationInfo> updated = processBuffer(
-                        buffer, startByte == 0, endByte - startByte);
-                // Full aggregation with global map
-                updated.entrySet().forEach(entry -> {
-                    globalMap.compute(entry.getKey(), (k, v) -> {
-                        if (v == null) {
-                            return entry.getValue();
-                        }
-                        return v.update(entry.getValue());
-                    });
-                });
+                // Partial aggregation in a hashmap
+                return processBuffer(buffer, startByte == 0, endByte - startByte);
             }
             catch (Throwable e) {
                 throw new RuntimeException(e);
             }
         }
 
-        private static Map<String, AggregationInfo> processBuffer(byte[] buffer, boolean isFileStart, long nextChunkStart) {
+        private static LinearProbingHashMap processBuffer(byte[] buffer, boolean isFileStart, long nextChunkStart) {
             int start = 0;
             // Move to the next entry after '\n'. Don't do this if we're at the start of
             // the file to avoid missing first entry.
@@ -180,13 +202,15 @@ private static Map<String, AggregationInfo> processBuffer(byte[] buffer, boolean
                 start += 1;
             }
 
-            // local map for this thread, don't need thread safety
-            Map<String, AggregationInfo> chunkMap = new HashMap<>();
+            LinearProbingHashMap chunkLocalMap = new LinearProbingHashMap();
             while (true) {
                 LineInfo lineInfo = getNextLine(buffer, start);
-                String key = new String(buffer, start, lineInfo.semicolonIndex - start);
+                byte[] keyBytes = new byte[lineInfo.semicolonIndex - start];
+                System.arraycopy(buffer, start, keyBytes, 0, keyBytes.length);
                 double value = parseDouble(buffer, lineInfo.semicolonIndex + 1, lineInfo.nextStart - 1);
-                update(chunkMap, key, value);
+                // Update aggregated value for the given key with the new line
+                AggregationInfo info = chunkLocalMap.get(keyBytes, lineInfo.keyHash);
+                info.update(value);
 
                 if ((lineInfo.nextStart > nextChunkStart) || (lineInfo.nextStart >= buffer.length)) {
                     // we are already at a point where the next line will be processed in the next chunk,
@@ -196,9 +220,12 @@ private static Map<String, AggregationInfo> processBuffer(byte[] buffer, boolean
 
                 start = lineInfo.nextStart();
             }
-            return chunkMap;
+            return chunkLocalMap;
         }
 
+        /**
+         * Converts bytes to double value without intermediate string conversion, faster than Double.parseDouble.
+         */
         private static double parseDouble(byte[] bytes, int offset, int end) {
             boolean negative = (bytes[offset] == '-');
             int current = negative ? offset + 1 : offset;
@@ -216,26 +243,97 @@ private static double parseDouble(byte[] bytes, int offset, int end) {
             return (preFloat + ((postFloat) / POWERS_OF_10[end - postFloatStart])) * (negative ? -1 : 1);
         }
 
-        private static void update(Map<String, AggregationInfo> state, String key, double value) {
-            AggregationInfo info = state.computeIfAbsent(key, k -> new AggregationInfo());
-            info.update(value);
-        }
-
-        // identifies indexes of the next ';' and '\n', which will be used to get entry key and value from line
+        /**
+         * Identifies indexes of the next ';' and '\n', which will be used to get entry key and value from line. Also
+         * computes the hash value for the key while iterating.
+         */
         private static LineInfo getNextLine(byte[] buffer, int start) {
             // caller guarantees that the access is in bounds, so no index check
+            int hash = 0;
             while (buffer[start] != ';') {
                 start++;
+                hash = hash * 31 + buffer[start];
             }
+            // The following is just to further reduce the probability of collisions
+            hash = hash ^ (hash << 16);
             int semicolonIndex = start;
             // caller guarantees that the access is in bounds, so no index check
             while (buffer[start] != '\n') {
                 start++;
             }
-            return new LineInfo(semicolonIndex, start + 1);
+            return new LineInfo(semicolonIndex, start + 1, hash);
+        }
+    }
+
+    private record LineInfo(int semicolonIndex, int nextStart, int keyHash) {
+    }
+
+    /**
+     * A simple map with a fixed bucket count. With 2^13 buckets and current hash function, seeing 4 collisions which is
+     * not too bad. Despite the name, every bucket is a linked list (chaining). The map is NOT thread safe.
+     */
+    private static class LinearProbingHashMap {
+        private final static int BUCKET_COUNT = 8191;
+        private final Node[] buckets;
+
+        LinearProbingHashMap() {
+            this.buckets = new Node[BUCKET_COUNT + 1]; // BUCKET_COUNT doubles as the index mask, so index BUCKET_COUNT must exist
+        }
+
+        /**
+         * Given a key and its precomputed hash, returns the current value of AggregationInfo. If not present, creates a
+         * new empty node at the front of the bucket and returns its (empty) AggregationInfo
+         */
+        public AggregationInfo get(byte[] key, int keyHash) {
+            // find bucket index through bitwise AND with the mask (2^p - 1); the result lies in [0, BUCKET_COUNT] inclusive
+            int bucketIndex = BUCKET_COUNT & keyHash;
+            Node current = buckets[bucketIndex];
+            while (current != null) {
+                if (Arrays.equals(current.entry.key(), key)) {
+                    return current.entry.aggregationInfo();
+                }
+                current = current.next;
+            }
+
+            // Entry does not exist, so add a new node in the linked list
+            AggregationInfo newInfo = new AggregationInfo();
+            KeyValuePair pair = new KeyValuePair(key, keyHash, newInfo);
+            Node newNode = new Node(pair, buckets[bucketIndex]);
+            buckets[bucketIndex] = newNode;
+            return newNode.entry.aggregationInfo();
+        }
+
+        /**
+         * A helper to convert to Java's hash map to build the final aggregation after partial aggregations
+         */
+        private Map<String, AggregationInfo> toMap() {
+            Map<String, AggregationInfo> map = new HashMap<>();
+            for (Node bucket : buckets) {
+                while (bucket != null) {
+                    map.put(new String(bucket.entry.key, StandardCharsets.UTF_8), bucket.entry.aggregationInfo());
+                    bucket = bucket.next;
+                }
+            }
+            return map;
+        }
+    }
+
+    /**
+     * Singly linked list node; each bucket of the custom hash map is a chain of these nodes
+     */
+    private static class Node {
+        KeyValuePair entry;
+        Node next;
+
+        public Node(KeyValuePair entry, Node next) {
+            this.entry = entry;
+            this.next = next;
         }
     }
 
-    private record LineInfo(int semicolonIndex, int nextStart) {
+    /**
+     * A wrapper record for what the hash map stores per station: the key bytes, the key's hash and its aggregation info
+     */
+    private record KeyValuePair(byte[] key, int keyHash, AggregationInfo aggregationInfo) {
     }
 }

From ecab306338e802cfc3fc808a4b87bc5b0a179b13 Mon Sep 17 00:00:00 2001
From: Marko Topolnik <marko.topolnik@gmail.com>
Date: Mon, 15 Jan 2024 18:49:32 +0100
Subject: [PATCH 017/268] 10k improvement (#419)

* Remove commented-out params from the script

* General cleanup and refactoring

* Deoptimize parseTemperatureSimple

* Optimize nameEquals()
---
 calculate_average_mtopolnik.sh                |   2 -
 .../onebrc/CalculateAverage_mtopolnik.java    | 191 ++++++++++--------
 2 files changed, 110 insertions(+), 83 deletions(-)

diff --git a/calculate_average_mtopolnik.sh b/calculate_average_mtopolnik.sh
index e48711a19..24b5a1cb4 100755
--- a/calculate_average_mtopolnik.sh
+++ b/calculate_average_mtopolnik.sh
@@ -15,7 +15,5 @@
 #  limitations under the License.
 #
 
-# -XX:+UnlockDiagnosticVMOptions -XX:PrintAssemblyOptions=intel -XX:CompileCommand=print,*.CalculateAverage_mtopolnik::recordMeasurementAndAdvanceCursor"
-# -XX:InlineSmallCode=10000 -XX:-TieredCompilation -XX:CICompilerCount=2 -XX:CompileThreshold=1000\
 java --enable-preview \
   --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_mtopolnik
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_mtopolnik.java b/src/main/java/dev/morling/onebrc/CalculateAverage_mtopolnik.java
index fe487fcda..51ea41516 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_mtopolnik.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_mtopolnik.java
@@ -155,39 +155,52 @@ public void run() {
             }
         }
 
+        private static final int MAX_TEMPERATURE_LEN = 5;
+        private static final int MAX_ROW_LEN = MAX_NAME_LEN + 1 + MAX_TEMPERATURE_LEN + 1;
+        private static final long DANGER_ZONE_LENGTH = ((MAX_ROW_LEN - 1) / 8 * 8 + 8);
+
         private void processChunk() {
             while (cursor < inputSize) {
+                boolean withinSafeZone;
                 long word1;
                 long word2;
-                if (cursor + 2 * Long.BYTES <= inputSize) {
-                    word1 = UNSAFE.getLong(inputBase + cursor);
-                    word2 = UNSAFE.getLong(inputBase + cursor + Long.BYTES);
+                long nameLen;
+                long nameStartAddress = inputBase + cursor;
+                if (cursor + DANGER_ZONE_LENGTH <= inputSize) {
+                    withinSafeZone = true;
+                    word1 = UNSAFE.getLong(nameStartAddress);
+                    word2 = UNSAFE.getLong(nameStartAddress + Long.BYTES);
+                    nameLen = nameLen(word1, word2, withinSafeZone);
+                    word1 = maskWord(word1, nameLen);
+                    word2 = maskWord(word2, nameLen - Long.BYTES);
                 }
                 else {
+                    withinSafeZone = false;
                     UNSAFE.putLong(nameBufBase, 0);
                     UNSAFE.putLong(nameBufBase + Long.BYTES, 0);
-                    UNSAFE.copyMemory(inputBase + cursor, nameBufBase, Long.min(NAMEBUF_SIZE, inputSize - cursor));
+                    UNSAFE.copyMemory(nameStartAddress, nameBufBase, Long.min(NAMEBUF_SIZE, inputSize - cursor));
                     word1 = UNSAFE.getLong(nameBufBase);
                     word2 = UNSAFE.getLong(nameBufBase + Long.BYTES);
+                    nameLen = nameLen(word1, word2, withinSafeZone);
                 }
-                long posOfSemicolon = posOfSemicolon(word1, word2);
-                word1 = maskWord(word1, posOfSemicolon - cursor);
-                word2 = maskWord(word2, posOfSemicolon - cursor - Long.BYTES);
                 long hash = hash(word1);
-                long namePos = cursor;
-                long nameLen = posOfSemicolon - cursor;
-                assert nameLen <= 100 : "nameLen > 100";
-                int temperature = parseTemperatureAndAdvanceCursor(posOfSemicolon);
-                updateStats(hash, namePos, nameLen, word1, word2, temperature);
+                assert nameLen > 0 && nameLen <= 100 : nameLen;
+                long tempStartAddress = nameStartAddress + nameLen + 1;
+                int temperature = withinSafeZone
+                        ? parseTemperatureSwarAndAdvanceCursor(tempStartAddress)
+                        : parseTemperatureSimpleAndAdvanceCursor(tempStartAddress);
+                updateStats(hash, nameStartAddress, nameLen, word1, word2, temperature, withinSafeZone);
             }
         }
 
-        private void updateStats(long hash, long namePos, long nameLen, long nameWord1, long nameWord2, int temperature) {
+        private void updateStats(
+                                 long hash, long nameStartAddress, long nameLen, long nameWord1, long nameWord2,
+                                 int temperature, boolean withinSafeZone) {
             int tableIndex = (int) (hash & TABLE_INDEX_MASK);
             while (true) {
                 stats.gotoIndex(tableIndex);
-                if (stats.hash() == hash && stats.nameLen() == nameLen
-                        && nameEquals(stats.nameAddress(), inputBase + namePos, nameLen, nameWord1, nameWord2)) {
+                if (stats.hash() == hash && stats.nameLen() == nameLen && nameEquals(
+                        stats.nameAddress(), nameStartAddress, nameLen, nameWord1, nameWord2, withinSafeZone)) {
                     stats.setSum(stats.sum() + temperature);
                     stats.setCount(stats.count() + 1);
                     stats.setMin((short) Integer.min(stats.min(), temperature));
@@ -204,72 +217,58 @@ && nameEquals(stats.nameAddress(), inputBase + namePos, nameLen, nameWord1, name
                 stats.setCount(1);
                 stats.setMin((short) temperature);
                 stats.setMax((short) temperature);
-                UNSAFE.copyMemory(inputBase + namePos, stats.nameAddress(), nameLen);
+                UNSAFE.copyMemory(nameStartAddress, stats.nameAddress(), nameLen);
                 return;
             }
         }
 
-        private int parseTemperatureAndAdvanceCursor(long semicolonPos) {
-            long startOffset = semicolonPos + 1;
-            if (startOffset <= inputSize - Long.BYTES) {
-                return parseTemperatureSwarAndAdvanceCursor(startOffset);
-            }
-            return parseTemperatureSimpleAndAdvanceCursor(startOffset);
-        }
-
         // Credit: merykitty
-        private int parseTemperatureSwarAndAdvanceCursor(long startOffset) {
-            long word = UNSAFE.getLong(inputBase + startOffset);
+        private int parseTemperatureSwarAndAdvanceCursor(long tempStartAddress) {
+            long word = UNSAFE.getLong(tempStartAddress);
             final long negated = ~word;
             final int dotPos = Long.numberOfTrailingZeros(negated & 0x10101000);
+            cursor = (tempStartAddress + (dotPos / 8) + 3) - inputBase;
             final long signed = (negated << 59) >> 63;
             final long removeSignMask = ~(signed & 0xFF);
             final long digits = ((word & removeSignMask) << (28 - dotPos)) & 0x0F000F0F00L;
             final long absValue = ((digits * 0x640a0001) >>> 32) & 0x3FF;
-            final int temperature = (int) ((absValue ^ signed) - signed);
-            cursor = startOffset + (dotPos / 8) + 3;
-            return temperature;
+            return (int) ((absValue ^ signed) - signed);
         }
 
-        private int parseTemperatureSimpleAndAdvanceCursor(long startOffset) {
+        private int parseTemperatureSimpleAndAdvanceCursor(long tempStartAddress) {
             final byte minus = (byte) '-';
             final byte zero = (byte) '0';
             final byte dot = (byte) '.';
 
-            // Temperature plus the following newline is at least 4 chars, so this is always safe:
-            int fourCh = UNSAFE.getInt(inputBase + startOffset);
-            final int mask = 0xFF;
-            byte ch = (byte) (fourCh & mask);
-            int shift = 0;
+            byte ch = UNSAFE.getByte(tempStartAddress);
+            long address = tempStartAddress;
             int temperature;
             int sign;
             if (ch == minus) {
                 sign = -1;
-                shift += 8;
-                ch = (byte) ((fourCh & (mask << shift)) >>> shift);
+                address++;
+                ch = UNSAFE.getByte(address);
             }
             else {
                 sign = 1;
             }
             temperature = ch - zero;
-            shift += 8;
-            ch = (byte) ((fourCh & (mask << shift)) >>> shift);
+            address++;
+            ch = UNSAFE.getByte(address);
             if (ch == dot) {
-                shift += 8;
-                ch = (byte) ((fourCh & (mask << shift)) >>> shift);
+                address++;
+                ch = UNSAFE.getByte(address);
             }
             else {
                 temperature = 10 * temperature + (ch - zero);
-                shift += 16;
-                // The last character may be past the four loaded bytes, load it from memory.
-                // Checking that with another `if` is self-defeating for performance.
-                ch = UNSAFE.getByte(inputBase + startOffset + (shift / 8));
+                address += 2;
+                ch = UNSAFE.getByte(address);
             }
             temperature = 10 * temperature + (ch - zero);
-            // `shift` holds the number of bits in the temperature field.
+            // `address` now points at the last (fractional) digit of the temperature field.
             // A newline character follows the temperature, and so we advance
             // the cursor past the newline to the start of the next line.
-            cursor = startOffset + (shift / 8) + 2;
+            cursor = (address + 2) - inputBase;
             return sign * temperature;
         }
 
@@ -286,15 +285,27 @@ private static long hash(long word1) {
             return hash;
         }
 
-        private static boolean nameEquals(long statsAddr, long inputAddr, long len, long inputWord1, long inputWord2) {
+        private static boolean nameEquals(long statsAddr, long inputAddr, long len, long inputWord1, long inputWord2,
+                                          boolean withinSafeZone) {
             boolean mismatch1 = maskWord(inputWord1, len) != UNSAFE.getLong(statsAddr);
             boolean mismatch2 = maskWord(inputWord2, len - Long.BYTES) != UNSAFE.getLong(statsAddr + Long.BYTES);
-            if (mismatch1 | mismatch2) {
-                return false;
+            if (len <= 2 * Long.BYTES) {
+                return !(mismatch1 | mismatch2);
             }
-            for (int i = 2 * Long.BYTES; i < len; i++) {
-                if (UNSAFE.getByte(inputAddr + i) != UNSAFE.getByte(statsAddr + i)) {
-                    return false;
+            if (withinSafeZone) {
+                int i = 2 * Long.BYTES;
+                for (; i <= len - Long.BYTES; i += Long.BYTES) {
+                    if (UNSAFE.getLong(inputAddr + i) != UNSAFE.getLong(statsAddr + i)) {
+                        return false;
+                    }
+                }
+                return maskWord(UNSAFE.getLong(inputAddr + i), len - i) == UNSAFE.getLong(statsAddr + i);
+            }
+            else {
+                for (int i = 2 * Long.BYTES; i < len; i++) {
+                    if (UNSAFE.getByte(inputAddr + i) != UNSAFE.getByte(statsAddr + i)) {
+                        return false;
+                    }
                 }
             }
             return true;
@@ -311,44 +322,62 @@ private static long maskWord(long word, long len) {
 
         // Adapted from https://jameshfisher.com/2017/01/24/bitwise-check-for-zero-byte/
         // and https://github.com/ashvardanian/StringZilla/blob/14e7a78edcc16b031c06b375aac1f66d8f19d45a/stringzilla/stringzilla.h#L139-L169
-        long posOfSemicolon(long word1, long word2) {
-            long diff = word1 ^ BROADCAST_SEMICOLON;
-            long matchBits1 = (diff - BROADCAST_0x01) & ~diff & BROADCAST_0x80;
-            diff = word2 ^ BROADCAST_SEMICOLON;
-            long matchBits2 = (diff - BROADCAST_0x01) & ~diff & BROADCAST_0x80;
-            if ((matchBits1 | matchBits2) != 0) {
-                int trailing1 = Long.numberOfTrailingZeros(matchBits1);
-                int match1IsNonZero = trailing1 & 63;
-                match1IsNonZero |= match1IsNonZero >>> 3;
-                match1IsNonZero |= match1IsNonZero >>> 1;
-                match1IsNonZero |= match1IsNonZero >>> 1;
-                // Now match1IsNonZero is 1 if it's non-zero, else 0. Use it to
-                // raise the lowest bit in traling2 if trailing1 is nonzero. This forces
-                // trailing2 to be zero if trailing1 is non-zero.
-                int trailing2 = Long.numberOfTrailingZeros(matchBits2 | match1IsNonZero) & 63;
-                return cursor + ((trailing1 | trailing2) >> 3);
+        long nameLen(long word1, long word2, boolean withinSafeZone) {
+            {
+                long matchBits1 = matchBits(word1);
+                long matchBits2 = matchBits(word2);
+                if ((matchBits1 | matchBits2) != 0) {
+                    int trailing1 = Long.numberOfTrailingZeros(matchBits1);
+                    int match1IsNonZero = trailing1 & 63;
+                    match1IsNonZero |= match1IsNonZero >>> 3;
+                    match1IsNonZero |= match1IsNonZero >>> 1;
+                    match1IsNonZero |= match1IsNonZero >>> 1;
+                    // Now match1IsNonZero is 1 if it's non-zero, else 0. Use it to
+                    // raise the lowest bit in trailing2 if trailing1 is nonzero. This forces
+                    // trailing2 to be zero if trailing1 is non-zero.
+                    int trailing2 = Long.numberOfTrailingZeros(matchBits2 | match1IsNonZero) & 63;
+                    // trailing1 | trailing2 works like trailing1 + trailing2 because if trailing2 is non-zero,
+                    // then trailing1 is 64, and since trailing2 is < 64, there's no bit overlap.
+                    return (trailing1 | trailing2) >> 3;
+                }
             }
-            long offset = cursor + 2 * Long.BYTES;
-            for (; offset <= inputSize - Long.BYTES; offset += Long.BYTES) {
-                var block = UNSAFE.getLong(inputBase + offset);
-                diff = block ^ BROADCAST_SEMICOLON;
-                long matchBits = (diff - BROADCAST_0x01) & ~diff & BROADCAST_0x80;
-                if (matchBits != 0) {
-                    return offset + Long.numberOfTrailingZeros(matchBits) / 8;
+            long nameStartAddress = inputBase + cursor;
+            long address = nameStartAddress + 2 * Long.BYTES;
+            long limit = inputBase + inputSize;
+            if (withinSafeZone) {
+                for (; address < limit; address += Long.BYTES) {
+                    var block = maskWord(UNSAFE.getLong(address), limit - address);
+                    long matchBits = matchBits(block);
+                    if (matchBits != 0) {
+                        return address + (Long.numberOfTrailingZeros(matchBits) >> 3) - nameStartAddress;
+                    }
                 }
+                throw new RuntimeException("Semicolon not found");
             }
-            return posOfSemicolonSimple(offset);
+            return addrOfSemicolonSafe(address, limit) - nameStartAddress;
         }
 
-        private long posOfSemicolonSimple(long offset) {
-            for (; offset < inputSize; offset++) {
-                if (UNSAFE.getByte(inputBase + offset) == SEMICOLON) {
-                    return offset;
+        private static long addrOfSemicolonSafe(long address, long limit) {
+            for (; address < limit - Long.BYTES + 1; address += Long.BYTES) {
+                var block = UNSAFE.getLong(address);
+                long matchBits = matchBits(block);
+                if (matchBits != 0) {
+                    return address + (Long.numberOfTrailingZeros(matchBits) >> 3);
+                }
+            }
+            for (; address < limit; address++) {
+                if (UNSAFE.getByte(address) == SEMICOLON) {
+                    return address;
                 }
             }
             throw new RuntimeException("Semicolon not found");
         }
 
+        private static long matchBits(long word) {
+            long diff = word ^ BROADCAST_SEMICOLON;
+            return (diff - BROADCAST_0x01) & ~diff & BROADCAST_0x80;
+        }
+
         // Copies the results from native memory to Java heap and puts them into the results array.
         private void exportResults() {
             var exportedStats = new ArrayList<StationStats>(10_000);

From 677d94e5cf9769d02843edf06686abfea7112d1d Mon Sep 17 00:00:00 2001
From: Van Phu DO <abeobk@gmail.com>
Date: Tue, 16 Jan 2024 02:53:31 +0900
Subject: [PATCH 018/268] Optimized with less constructor args + low collision
 mixer (#420)

* use all CPUs

* use graal

* optimized with less constructor arg

* optimized with low collision mixer
---
 .../onebrc/CalculateAverage_abeobk.java       | 169 ++++++++++--------
 1 file changed, 97 insertions(+), 72 deletions(-)

diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java b/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java
index 1a71349b3..34a5552a5 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java
@@ -28,6 +28,8 @@
 import sun.misc.Unsafe;
 
 public class CalculateAverage_abeobk {
+    private static final boolean SHOW_COLLISIONS = false;
+
     private static final String FILE = "./measurements.txt";
     private static final int BUCKET_SIZE = 1 << 16;
     private static final int BUCKET_MASK = BUCKET_SIZE - 1;
@@ -55,69 +57,55 @@ private static Unsafe initUnsafe() {
         }
     }
 
-    // stat
-    private static class Stat {
-        private int min;
-        private int max;
-        private long sum;
-        private int count;
+    static class Node {
+        long addr;
+        long tail;
+        int min, max;
+        int count;
+        long sum;
+
+        String key() {
+            byte[] sbuf = new byte[MAX_STR_LEN];
+            int keylen = (int) (tail >>> 56);
+            UNSAFE.copyMemory(null, addr, sbuf, Unsafe.ARRAY_BYTE_BASE_OFFSET, keylen);
+            return new String(sbuf, 0, keylen, StandardCharsets.UTF_8);
+        }
+
+        public String toString() {
+            return String.format("%.1f/%.1f/%.1f", min * 0.1, sum * 0.1 / count, max * 0.1);
+        }
 
-        Stat(int v) {
-            sum = min = max = v;
+        Node(long a, long t, int val) {
+            addr = a;
+            tail = t;
+            sum = min = max = val;
             count = 1;
         }
 
         void add(int val) {
-            min = Math.min(val, min);
-            max = Math.max(val, max);
+            min = Math.min(min, val);
+            max = Math.max(max, val);
             sum += val;
             count++;
         }
 
-        void merge(Stat other) {
-            min = Math.min(other.min, min);
-            max = Math.max(other.max, max);
+        void merge(Node other) {
+            min = Math.min(min, other.min);
+            max = Math.max(max, other.max);
             sum += other.sum;
             count += other.count;
         }
 
-        public String toString() {
-            return String.format("%.1f/%.1f/%.1f", min * 0.1, sum * 0.1 / count, max * 0.1);
-        }
-    }
-
-    static class Node {
-        long addr;
-        int keylen;
-        int hash;
-        long[] buf = new long[13];
-        Stat stat;
-
-        String key() {
-            byte[] buf = new byte[MAX_STR_LEN];
-            UNSAFE.copyMemory(null, addr, buf, Unsafe.ARRAY_BYTE_BASE_OFFSET, keylen);
-            return new String(buf, 0, keylen, StandardCharsets.UTF_8);
-        }
-
-        Node(long a, int kl, int h, int v, long[] b) {
-            stat = new Stat(v);
-            addr = a;
-            keylen = kl;
-            hash = h;
-            System.arraycopy(b, 0, buf, 0, Math.ceilDiv(kl, 8));
-        }
-
-        boolean contentEquals(final long[] other_buf) {
-            int k = keylen / 8;
-            int r = keylen % 8;
-            // Since the city name is most likely shorter than 16 characters
-            // this should be faster than typical conditional checks
-            long sum = 0;
-            for (int i = 0; i < k; i++) {
-                sum += buf[i] ^ other_buf[i];
+        boolean contentEquals(long other_addr, long other_tail) {
+            if (tail != other_tail) // compare tail & length at the same time
+                return false;
+            long my_addr = addr;
+            int nl = (int) (tail >> 59);
+            for (int i = 0; i < nl; i++, my_addr += 8, other_addr += 8) {
+                if (UNSAFE.getLong(my_addr) != UNSAFE.getLong(other_addr))
+                    return false;
             }
-            sum += (buf[k] ^ other_buf[k]) & HASH_MASKS[r];
-            return sum == 0;
+            return true;
         }
     }
 
@@ -135,55 +123,83 @@ static long[] slice(long start_addr, long end_addr, long chunk_size, int cpu_cnt
         return ptrs;
     }
 
+    static final long getSemiPosCode(final long word) {
+        long xor_semi = word ^ 0x3b3b3b3b3b3b3b3bL; // xor with ;;;;;;;;
+        return (xor_semi - 0x0101010101010101L) & (~xor_semi & 0x8080808080808080L);
+    }
+
+    // very low collision mixer
+    // idea from https://github.com/Cyan4973/xxHash/tree/dev
+    // zero collision on test data
+    static final int xxh32(long hash) {
+        final int p1 = 0x85EBCA77; // prime
+        final int p2 = 0xC2B2AE3D; // prime
+        int low = (int) hash;
+        int high = (int) (hash >>> 32);
+        low ^= low >> 15;
+        low *= p1;
+        high ^= high >> 13;
+        high *= p2;
+        var h = low ^ high;
+        return h;
+    }
+
     public static void main(String[] args) throws InterruptedException, IOException {
-        int cpu_cnt = Runtime.getRuntime().availableProcessors();
         try (var file = FileChannel.open(Path.of(FILE), StandardOpenOption.READ)) {
             long start_addr = file.map(MapMode.READ_ONLY, 0, file.size(), Arena.global()).address();
             long file_size = file.size();
             long end_addr = start_addr + file_size;
+
+            // only use all cpus on large file
+            int cpu_cnt = file_size < 1e6 ? 1 : Runtime.getRuntime().availableProcessors();
             long chunk_size = Math.ceilDiv(file_size, cpu_cnt);
 
             // processing
             var threads = new Thread[cpu_cnt];
             var maps = new Node[cpu_cnt][];
             var ptrs = slice(start_addr, end_addr, chunk_size, cpu_cnt);
+            int[] cls = new int[cpu_cnt];
 
             for (int i = 0; i < cpu_cnt; i++) {
                 int thread_id = i;
                 long start = ptrs[i];
                 long end = ptrs[i + 1];
-                maps[i] = new Node[BUCKET_SIZE + 16]; // extra space for collisions
+                maps[i] = new Node[BUCKET_SIZE + 10000]; // extra space for collisions
 
                 (threads[i] = new Thread(() -> {
                     long addr = start;
                     var map = maps[thread_id];
-                    long[] buf = new long[13];
                     // parse loop
                     while (addr < end) {
-                        int idx = 0;
                         long hash = 0;
                         long word = 0;
                         long row_addr = addr;
                         int semi_pos = 8;
-                        while (semi_pos == 8) {
+                        word = UNSAFE.getLong(addr);
+                        long semipos_code = getSemiPosCode(word);
+
+                        while (semipos_code == 0) {
+                            hash ^= word;
+                            addr += 8;
                             word = UNSAFE.getLong(addr);
-                            buf[idx++] = word;
-                            // idea from thomaswue & royvanrijn
-                            long xor_semi = word ^ 0x3b3b3b3b3b3b3b3bL; // xor with ;;;;;;;;
-                            long semipos_code = (xor_semi - 0x0101010101010101L) & ~xor_semi & 0x8080808080808080L;
-                            semi_pos = Long.numberOfTrailingZeros(semipos_code) >>> 3;
-                            addr += semi_pos;
-                            hash ^= word & HASH_MASKS[semi_pos];
+                            semipos_code = getSemiPosCode(word);
                         }
 
-                        int hash32 = (int) (hash ^ (hash >>> 31));
-                        int keylen = (int) (addr - row_addr);
+                        semi_pos = Long.numberOfTrailingZeros(semipos_code) >>> 3;
+                        long tail = word & HASH_MASKS[semi_pos];
+                        hash ^= tail;
+                        addr += semi_pos;
+
+                        int hash32 = xxh32(hash);
+                        long keylen = (addr - row_addr);
+                        tail = tail | (keylen << 56);
+
+                        addr++;
 
                         // great idea from merykitty (Quan Anh Mai)
-                        long num_word = UNSAFE.getLong(++addr);
+                        long num_word = UNSAFE.getLong(addr);
                         int dot_pos = Long.numberOfTrailingZeros(~num_word & 0x10101000);
                         addr += (dot_pos >>> 3) + 3;
-
                         int shift = 28 - dot_pos;
                         long signed = (~num_word << 59) >> 63;
                         long dsmask = ~(signed & 0xFF);
@@ -195,14 +211,16 @@ public static void main(String[] args) throws InterruptedException, IOException
                         while (true) {
                             var node = map[bucket];
                             if (node == null) {
-                                map[bucket] = new Node(row_addr, keylen, hash32, val, buf);
+                                map[bucket] = new Node(row_addr, tail, val);
                                 break;
                             }
-                            if (node.keylen == keylen && node.hash == hash32 && node.contentEquals(buf)) {
-                                node.stat.add(val);
+                            if (node.contentEquals(row_addr, tail)) {
+                                node.add(val);
                                 break;
                             }
                             bucket++;
+                            if (SHOW_COLLISIONS)
+                                cls[thread_id]++;
                         }
                     }
                 })).start();
@@ -212,19 +230,26 @@ public static void main(String[] args) throws InterruptedException, IOException
             for (var thread : threads)
                 thread.join();
 
+            if (SHOW_COLLISIONS) {
+                for (int i = 0; i < cpu_cnt; i++) {
+                    System.out.println("thread-" + i + " collision = " + cls[i]);
+                }
+            }
+
             // collect results
-            TreeMap<String, Stat> ms = new TreeMap<>();
+            TreeMap<String, Node> ms = new TreeMap<>();
             for (var map : maps) {
                 for (var node : map) {
                     if (node == null)
                         continue;
-                    var stat = ms.putIfAbsent(node.key(), node.stat);
+                    var stat = ms.putIfAbsent(node.key(), node);
                     if (stat != null)
-                        stat.merge(node.stat);
+                        stat.merge(node);
                 }
             }
 
-            System.out.println(ms);
+            if (!SHOW_COLLISIONS)
+                System.out.println(ms);
         }
     }
 }
\ No newline at end of file

From dbdd89a84779761ca092e5aaeb6f6e92394a422d Mon Sep 17 00:00:00 2001
From: Jaromir Hamala <jaromir.hamala@gmail.com>
Date: Mon, 15 Jan 2024 18:55:22 +0100
Subject: [PATCH 019/268] jerrinot's initial submission (#424)

* initial version

let's exploit that superscalar beauty!

* give credits where credits is due

also: added ideas I don't want to forget
---
 calculate_average_jerrinot.sh                 |  21 +
 prepare_jerrinot.sh                           |  19 +
 .../onebrc/CalculateAverage_jerrinot.java     | 482 ++++++++++++++++++
 3 files changed, 522 insertions(+)
 create mode 100755 calculate_average_jerrinot.sh
 create mode 100755 prepare_jerrinot.sh
 create mode 100644 src/main/java/dev/morling/onebrc/CalculateAverage_jerrinot.java

diff --git a/calculate_average_jerrinot.sh b/calculate_average_jerrinot.sh
new file mode 100755
index 000000000..1bbf680fc
--- /dev/null
+++ b/calculate_average_jerrinot.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+# -XX:+UnlockDiagnosticVMOptions -XX:PrintAssemblyOptions=intel -XX:CompileCommand=print,*.CalculateAverage_mtopolnik::recordMeasurementAndAdvanceCursor"
+# -XX:InlineSmallCode=10000 -XX:-TieredCompilation -XX:CICompilerCount=2 -XX:CompileThreshold=1000\
+java --enable-preview \
+  --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_jerrinot
diff --git a/prepare_jerrinot.sh b/prepare_jerrinot.sh
new file mode 100755
index 000000000..f83a3ff69
--- /dev/null
+++ b/prepare_jerrinot.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+source "$HOME/.sdkman/bin/sdkman-init.sh"
+sdk use java 21.0.1-graal 1>&2
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_jerrinot.java b/src/main/java/dev/morling/onebrc/CalculateAverage_jerrinot.java
new file mode 100644
index 000000000..6fb89bb67
--- /dev/null
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_jerrinot.java
@@ -0,0 +1,482 @@
+/*
+ *  Copyright 2023 The original authors
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package dev.morling.onebrc;
+
+import sun.misc.Unsafe;
+
+import java.io.File;
+import java.io.RandomAccessFile;
+import java.lang.foreign.Arena;
+import java.lang.reflect.Field;
+import java.nio.channels.FileChannel.MapMode;
+import java.util.Map;
+import java.util.TreeMap;
+
+public class CalculateAverage_jerrinot {
+    private static final Unsafe UNSAFE = unsafe();
+    private static final String MEASUREMENTS_TXT = "measurements.txt";
+    // todo: with hyper-threading enabled we would be better off with availableProcessors / 2;
+    // todo: validate the testing env. params.
+    private static final int THREAD_COUNT = Runtime.getRuntime().availableProcessors();
+    private static final long SEPARATOR_PATTERN = 0x3B3B3B3B3B3B3B3BL;
+
+    private static Unsafe unsafe() {
+        try {
+            Field theUnsafe = Unsafe.class.getDeclaredField("theUnsafe");
+            theUnsafe.setAccessible(true);
+            return (Unsafe) theUnsafe.get(Unsafe.class);
+        }
+        catch (NoSuchFieldException | IllegalAccessException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    public static void main(String[] args) throws Exception {
+        calculate();
+    }
+
+    static void calculate() throws Exception {
+        final File file = new File(MEASUREMENTS_TXT);
+        final long length = file.length();
+        // final int chunkCount = Runtime.getRuntime().availableProcessors();
+        int chunkPerThread = 4;
+        final int chunkCount = THREAD_COUNT * chunkPerThread;
+        final var chunkStartOffsets = new long[chunkCount + 1];
+        try (var raf = new RandomAccessFile(file, "r")) {
+            // credit - chunking code: mtopolnik
+            final var inputBase = raf.getChannel().map(MapMode.READ_ONLY, 0, length, Arena.global()).address();
+            for (int i = 1; i < chunkStartOffsets.length - 1; i++) {
+                var start = length * i / (chunkStartOffsets.length - 1);
+                raf.seek(start);
+                while (raf.read() != (byte) '\n') {
+                }
+                start = raf.getFilePointer();
+                chunkStartOffsets[i] = start + inputBase;
+            }
+            chunkStartOffsets[0] = inputBase;
+            chunkStartOffsets[chunkCount] = inputBase + length;
+
+            Processor[] processors = new Processor[THREAD_COUNT];
+            Thread[] threads = new Thread[THREAD_COUNT];
+
+            for (int i = 0; i < THREAD_COUNT; i++) {
+                long startA = chunkStartOffsets[i * chunkPerThread];
+                long endA = chunkStartOffsets[i * chunkPerThread + 1];
+                long startB = chunkStartOffsets[i * chunkPerThread + 1];
+                long endB = chunkStartOffsets[i * chunkPerThread + 2];
+                long startC = chunkStartOffsets[i * chunkPerThread + 2];
+                long endC = chunkStartOffsets[i * chunkPerThread + 3];
+                long startD = chunkStartOffsets[i * chunkPerThread + 3];
+                long endD = chunkStartOffsets[i * chunkPerThread + 4];
+
+                Processor processor = new Processor(startA, endA, startB, endB, startC, endC, startD, endD);
+                processors[i] = processor;
+                Thread thread = new Thread(processor);
+                threads[i] = thread;
+                thread.start();
+            }
+
+            var accumulator = new TreeMap<String, Processor.StationStats>();
+            for (int i = 0; i < THREAD_COUNT; i++) {
+                Thread t = threads[i];
+                t.join();
+                processors[i].accumulateStatus(accumulator);
+            }
+
+            var sb = new StringBuilder();
+            boolean first = true;
+            for (Map.Entry<String, Processor.StationStats> statsEntry : accumulator.entrySet()) {
+                if (first) {
+                    sb.append("{");
+                    first = false;
+                }
+                else {
+                    sb.append(", ");
+                }
+                var value = statsEntry.getValue();
+                var name = statsEntry.getKey();
+                int min = value.min;
+                int max = value.max;
+                int count = value.count;
+                long sum2 = value.sum;
+                sb.append(String.format("%s=%.1f/%.1f/%.1f", name, min / 10.0, Math.round((double) sum2 / count) / 10.0, max / 10.0));
+            }
+            System.out.print(sb);
+            System.out.println('}');
+        }
+    }
+
+    public static int ceilPow2(int i) {
+        i--;
+        i |= i >> 1;
+        i |= i >> 2;
+        i |= i >> 4;
+        i |= i >> 8;
+        i |= i >> 16;
+        return i + 1;
+    }
+
+    private static class Processor implements Runnable {
+        private static final int MAP_SLOT_COUNT = ceilPow2(10000);
+        private static final int STATION_MAX_NAME_BYTES = 104;
+
+        private static final long COUNT_OFFSET = 0;
+        private static final long MIN_OFFSET = 4;
+        private static final long MAX_OFFSET = 8;
+        private static final long SUM_OFFSET = 12;
+        private static final long LEN_OFFSET = 20;
+        private static final long NAME_OFFSET = 24;
+
+        private static final int MAP_ENTRY_SIZE_BYTES = +Integer.BYTES // count // 0
+                + Integer.BYTES // min // +4
+                + Integer.BYTES // max // +8
+                + Long.BYTES // sum // +12
+                + Integer.BYTES // station name len // +20
+                + STATION_MAX_NAME_BYTES; // +24
+
+        private static final int MAP_SIZE_BYTES = MAP_SLOT_COUNT * MAP_ENTRY_SIZE_BYTES;
+        private static final long MAP_MASK = MAP_SLOT_COUNT - 1;
+
+        // todo: some fields could probably be converted to locals
+
+        private final long map;
+
+        private long cursorA;
+        private long endA;
+        private long cursorB;
+        private long endB;
+        private long cursorC;
+        private long endC;
+        private long cursorD;
+        private long endD;
+        private long maskA;
+        private long maskB;
+        private long maskC;
+        private long maskD;
+
+        // Parses the temperature at startCursor and folds it into the entry at baseEntryPtr. credit: merykitty
+        private long parseAndStoreTemperature(long startCursor, long baseEntryPtr) {
+            long word = UNSAFE.getLong(startCursor);
+            final long negateda = ~word;
+            final int dotPos = Long.numberOfTrailingZeros(negateda & 0x10101000);
+            final long signed = (negateda << 59) >> 63;
+            final long removeSignMask = ~(signed & 0xFF);
+            final long digits = ((word & removeSignMask) << (28 - dotPos)) & 0x0F000F0F00L;
+            final long absValue = ((digits * 0x640a0001) >>> 32) & 0x3FF;
+            final int temperature = (int) ((absValue ^ signed) - signed);
+
+            long countPtr = baseEntryPtr + COUNT_OFFSET;
+            long minPtr = baseEntryPtr + MIN_OFFSET;
+            long maxPtr = baseEntryPtr + MAX_OFFSET;
+            long sumPtr = baseEntryPtr + SUM_OFFSET;
+
+            int min = UNSAFE.getInt(minPtr);
+            int max = UNSAFE.getInt(maxPtr);
+            long sum = UNSAFE.getLong(sumPtr);
+            // todo: check whether the min/max intrinsics are paying off;
+            // maybe branching is better? the branch is becoming more predictable with
+            // each new sample.
+            max = Math.max(max, temperature);
+            min = Math.min(min, temperature);
+            sum += temperature;
+            UNSAFE.putInt(countPtr, UNSAFE.getInt(countPtr) + 1);
+            UNSAFE.putInt(minPtr, min);
+            UNSAFE.putInt(maxPtr, max);
+            UNSAFE.putLong(sumPtr, sum);
+            return startCursor + (dotPos / 8) + 3;
+        }
+
+        private static long getDelimiterMask(final long word) {
+            // credit royvanrijn
+            final long match = word ^ SEPARATOR_PATTERN;
+            return (match - 0x0101010101010101L) & (~match & 0x8080808080808080L);
+        }
+
+        // todo: immutability cost us in allocations, but that's probably peanuts in the grand scheme of things. still worth checking
+        // maybe JVM trusting Final in Records offsets it ..a test is needed
+        record StationStats(int min, int max, int count, long sum) {
+        }
+
+        void accumulateStatus(TreeMap<String, StationStats> accumulator) {
+            for (long baseAddress = map; baseAddress < map + MAP_SIZE_BYTES; baseAddress += MAP_ENTRY_SIZE_BYTES) {
+                long len = UNSAFE.getInt(baseAddress + LEN_OFFSET);
+                if (len == 0) {
+                    continue;
+                }
+                byte[] nameArr = new byte[(int) len];
+                long baseNameAddr = baseAddress + NAME_OFFSET;
+                // Bulk-copy the name bytes out of native memory and decode them explicitly as
+                // UTF-8, so the result does not depend on the JVM's default charset.
+                UNSAFE.copyMemory(null, baseNameAddr, nameArr, Unsafe.ARRAY_BYTE_BASE_OFFSET, len);
+                String name = new String(nameArr, java.nio.charset.StandardCharsets.UTF_8);
+                int min = UNSAFE.getInt(baseAddress + MIN_OFFSET);
+                int max = UNSAFE.getInt(baseAddress + MAX_OFFSET);
+                int count = UNSAFE.getInt(baseAddress + COUNT_OFFSET);
+                long sum = UNSAFE.getLong(baseAddress + SUM_OFFSET);
+
+                // todo: lambdas bootstrap probably cost us
+                accumulator.compute(name, (_, v) -> {
+                    if (v == null) {
+                        return new StationStats(min, max, count, sum);
+                    }
+                    return new StationStats(Math.min(v.min, min), Math.max(v.max, max), v.count + count, v.sum + sum);
+                });
+            }
+        }
+
+        Processor(long startA, long endA, long startB, long endB, long startC, long endC, long startD, long endD) {
+            this.cursorA = startA;
+            this.cursorB = startB;
+            this.cursorC = startC;
+            this.cursorD = startD;
+            this.endA = endA;
+            this.endB = endB;
+            this.endC = endC;
+            this.endD = endD;
+            this.map = UNSAFE.allocateMemory(MAP_SIZE_BYTES);
+
+            // allocateMemory returns uninitialized memory, so the whole table must be
+            // cleared before use: accumulateStatus() treats a zero name length as an
+            // empty slot. setMemory zeroes exactly MAP_SIZE_BYTES bytes in one call, so
+            // it stays in bounds even if the entry layout changes and the table size
+            // stops being a multiple of 8 (a stride-8 putLong loop would overrun the
+            // allocation in that case).
+            UNSAFE.setMemory(map, MAP_SIZE_BYTES, (byte) 0);
+        }
+
+        private void doTail() { // Drains whatever input is left in each of the four streams, one stream at a time; run() calls this once its interleaved loop stops.
+            // todo: we would probably be better off without all that code dup. ("compilers hates him!")
+            // Stream A: the same per-record steps as in run(), just not interleaved with the other streams.
+            while (cursorA < endA) {
+                long startA = cursorA; // address of the first byte of the station name
+                long delimiterWordA = UNSAFE.getLong(cursorA); // load the next 8 input bytes as one word
+                long hashA = 0;
+                maskA = getDelimiterMask(delimiterWordA); // stays zero while the loaded word holds no delimiter
+                while (maskA == 0) {
+                    hashA ^= delimiterWordA; // fold every full name word into the hash
+                    cursorA += 8;
+                    delimiterWordA = UNSAFE.getLong(cursorA);
+                    maskA = getDelimiterMask(delimiterWordA);
+                }
+                final int delimiterByteA = Long.numberOfTrailingZeros(maskA); // bit position of the lowest set mask bit
+                final long semicolonA = cursorA + (delimiterByteA >> 3); // bit position / 8 = byte offset of the delimiter inside the word
+                final long maskedWordA = delimiterWordA & ((maskA >>> 7) - 1); // keeps the bytes below the delimiter - assumes the mask is a single top bit (0x80) of that byte; confirm in getDelimiterMask
+                hashA ^= maskedWordA;
+                int intHashA = (int) (hashA ^ (hashA >> 32)); // fold the 64-bit hash down to 32 bits
+                intHashA = intHashA ^ (intHashA >> 17); // mix higher bits into the low bits that select the map slot
+
+                long baseEntryPtrA = getOrCreateEntryBaseOffset(semicolonA, startA, intHashA, maskedWordA);
+                cursorA = parseAndStoreTemperature(semicolonA + 1, baseEntryPtrA); // the returned address becomes the cursor for the next record
+            }
+            // Stream B: identical steps to the stream A loop.
+            while (cursorB < endB) {
+                long startB = cursorB;
+                long delimiterWordB = UNSAFE.getLong(cursorB);
+                long hashB = 0;
+                maskB = getDelimiterMask(delimiterWordB);
+                while (maskB == 0) {
+                    hashB ^= delimiterWordB;
+                    cursorB += 8;
+                    delimiterWordB = UNSAFE.getLong(cursorB);
+                    maskB = getDelimiterMask(delimiterWordB);
+                }
+                final int delimiterByteB = Long.numberOfTrailingZeros(maskB);
+                final long semicolonB = cursorB + (delimiterByteB >> 3);
+                final long maskedWordB = delimiterWordB & ((maskB >>> 7) - 1);
+                hashB ^= maskedWordB;
+                int intHashB = (int) (hashB ^ (hashB >> 32));
+                intHashB = intHashB ^ (intHashB >> 17);
+
+                long baseEntryPtrB = getOrCreateEntryBaseOffset(semicolonB, startB, intHashB, maskedWordB);
+                cursorB = parseAndStoreTemperature(semicolonB + 1, baseEntryPtrB);
+            }
+            // Stream C: identical steps to the stream A loop.
+            while (cursorC < endC) {
+                long startC = cursorC;
+                long delimiterWordC = UNSAFE.getLong(cursorC);
+                long hashC = 0;
+                maskC = getDelimiterMask(delimiterWordC);
+                while (maskC == 0) {
+                    hashC ^= delimiterWordC;
+                    cursorC += 8;
+                    delimiterWordC = UNSAFE.getLong(cursorC);
+                    maskC = getDelimiterMask(delimiterWordC);
+                }
+                final int delimiterByteC = Long.numberOfTrailingZeros(maskC);
+                final long semicolonC = cursorC + (delimiterByteC >> 3);
+                final long maskedWordC = delimiterWordC & ((maskC >>> 7) - 1);
+                hashC ^= maskedWordC;
+                int intHashC = (int) (hashC ^ (hashC >> 32));
+                intHashC = intHashC ^ (intHashC >> 17);
+
+                long baseEntryPtrC = getOrCreateEntryBaseOffset(semicolonC, startC, intHashC, maskedWordC);
+                cursorC = parseAndStoreTemperature(semicolonC + 1, baseEntryPtrC);
+            }
+            // Stream D: identical steps to the stream A loop.
+            while (cursorD < endD) {
+                long startD = cursorD;
+                long delimiterWordD = UNSAFE.getLong(cursorD);
+                long hashD = 0;
+                maskD = getDelimiterMask(delimiterWordD);
+                while (maskD == 0) {
+                    hashD ^= delimiterWordD;
+                    cursorD += 8;
+                    delimiterWordD = UNSAFE.getLong(cursorD);
+                    maskD = getDelimiterMask(delimiterWordD);
+                }
+                final int delimiterByteD = Long.numberOfTrailingZeros(maskD);
+                final long semicolonD = cursorD + (delimiterByteD >> 3);
+                final long maskedWordD = delimiterWordD & ((maskD >>> 7) - 1);
+                hashD ^= maskedWordD;
+                int intHashD = (int) (hashD ^ (hashD >> 32));
+                intHashD = intHashD ^ (intHashD >> 17);
+
+                long baseEntryPtrD = getOrCreateEntryBaseOffset(semicolonD, startD, intHashD, maskedWordD);
+                cursorD = parseAndStoreTemperature(semicolonD + 1, baseEntryPtrD);
+            }
+            // All four streams are fully consumed at this point.
+        }
+
+        @Override
+        public void run() { // Processes one record from each of the four streams per iteration, then hands the leftovers to doTail().
+            while (cursorA < endA && cursorB < endB && cursorC < endC && cursorD < endD) { // interleave only while all four streams still have input
+                // todo: experiment with different inter-leaving
+                long startA = cursorA; // remember where each station name begins
+                long startB = cursorB;
+                long startC = cursorC;
+                long startD = cursorD;
+
+                long delimiterWordA = UNSAFE.getLong(cursorA); // issue the four independent 8-byte loads back to back
+                long delimiterWordB = UNSAFE.getLong(cursorB);
+                long delimiterWordC = UNSAFE.getLong(cursorC);
+                long delimiterWordD = UNSAFE.getLong(cursorD);
+
+                long hashA = 0;
+                long hashB = 0;
+                long hashC = 0;
+                long hashD = 0;
+
+                // credits for the hashing idea: royvanrijn
+                maskA = getDelimiterMask(delimiterWordA); // stays zero while the loaded word holds no delimiter
+                while (maskA == 0) {
+                    hashA ^= delimiterWordA; // fold every full name word into the hash
+                    cursorA += 8;
+                    delimiterWordA = UNSAFE.getLong(cursorA);
+                    maskA = getDelimiterMask(delimiterWordA);
+                }
+                final int delimiterByteA = Long.numberOfTrailingZeros(maskA); // bit position of the lowest set mask bit
+                final long semicolonA = cursorA + (delimiterByteA >> 3); // bit position / 8 = byte offset of the delimiter inside the word
+                final long maskedWordA = delimiterWordA & ((maskA >>> 7) - 1); // keeps the bytes below the delimiter - assumes the mask is a single top bit (0x80) of that byte; confirm in getDelimiterMask
+                hashA ^= maskedWordA;
+                int intHashA = (int) (hashA ^ (hashA >> 32)); // fold the 64-bit hash down to 32 bits
+                intHashA = intHashA ^ (intHashA >> 17); // mix higher bits into the low bits that select the map slot
+
+                maskB = getDelimiterMask(delimiterWordB); // stream B: same steps as stream A
+                while (maskB == 0) {
+                    hashB ^= delimiterWordB;
+                    cursorB += 8;
+                    delimiterWordB = UNSAFE.getLong(cursorB);
+                    maskB = getDelimiterMask(delimiterWordB);
+                }
+                final int delimiterByteB = Long.numberOfTrailingZeros(maskB);
+                final long semicolonB = cursorB + (delimiterByteB >> 3);
+                final long maskedWordB = delimiterWordB & ((maskB >>> 7) - 1);
+                hashB ^= maskedWordB;
+                int intHashB = (int) (hashB ^ (hashB >> 32));
+                intHashB = intHashB ^ (intHashB >> 17);
+
+                maskC = getDelimiterMask(delimiterWordC); // stream C: same steps as stream A
+                while (maskC == 0) {
+                    hashC ^= delimiterWordC;
+                    cursorC += 8;
+                    delimiterWordC = UNSAFE.getLong(cursorC);
+                    maskC = getDelimiterMask(delimiterWordC);
+                }
+                final int delimiterByteC = Long.numberOfTrailingZeros(maskC);
+                final long semicolonC = cursorC + (delimiterByteC >> 3);
+                final long maskedWordC = delimiterWordC & ((maskC >>> 7) - 1);
+                hashC ^= maskedWordC;
+                int intHashC = (int) (hashC ^ (hashC >> 32));
+                intHashC = intHashC ^ (intHashC >> 17);
+
+                maskD = getDelimiterMask(delimiterWordD); // stream D: same steps as stream A
+                while (maskD == 0) {
+                    hashD ^= delimiterWordD;
+                    cursorD += 8;
+                    delimiterWordD = UNSAFE.getLong(cursorD);
+                    maskD = getDelimiterMask(delimiterWordD);
+                }
+                final int delimiterByteD = Long.numberOfTrailingZeros(maskD);
+                final long semicolonD = cursorD + (delimiterByteD >> 3);
+                final long maskedWordD = delimiterWordD & ((maskD >>> 7) - 1);
+                hashD ^= maskedWordD;
+                int intHashD = (int) (hashD ^ (hashD >> 32));
+                intHashD = intHashD ^ (intHashD >> 17);
+
+                long baseEntryPtrA = getOrCreateEntryBaseOffset(semicolonA, startA, intHashA, maskedWordA); // resolve all four map entries before any temperature is parsed
+                long baseEntryPtrB = getOrCreateEntryBaseOffset(semicolonB, startB, intHashB, maskedWordB);
+                long baseEntryPtrC = getOrCreateEntryBaseOffset(semicolonC, startC, intHashC, maskedWordC);
+                long baseEntryPtrD = getOrCreateEntryBaseOffset(semicolonD, startD, intHashD, maskedWordD);
+
+                cursorA = parseAndStoreTemperature(semicolonA + 1, baseEntryPtrA); // parsing starts right after the delimiter; the result is the next cursor
+                cursorB = parseAndStoreTemperature(semicolonB + 1, baseEntryPtrB);
+                cursorC = parseAndStoreTemperature(semicolonC + 1, baseEntryPtrC);
+                cursorD = parseAndStoreTemperature(semicolonD + 1, baseEntryPtrD);
+            }
+            doTail(); // at least one stream is exhausted here; finish the remaining ones sequentially
+        }
+
+        private long getOrCreateEntryBaseOffset(long semicolonA, long startA, int intHashA, long maskedWordA) { // Returns the base address of the map entry for the name in [startA, semicolonA), claiming an empty slot if the name is new.
+            int lenA = (int) (semicolonA - startA); // name length in bytes
+            long mapIndexA = intHashA & MAP_MASK; // initial slot chosen by the hash
+            for (;;) { // probe slot after slot; the only exits are the two returns, so this spins forever if no slot is empty and none matches
+                long basePtr = mapIndexA * MAP_ENTRY_SIZE_BYTES + map;
+                long lenPtr = basePtr + LEN_OFFSET;
+                int len = UNSAFE.getInt(lenPtr);
+                if (len == 0) {
+                    // todo: uncommon branch maybe?
+                    // empty slot: claim it by storing the name and its length, and seed min/max with sentinels
+                    UNSAFE.copyMemory(semicolonA - lenA, basePtr + NAME_OFFSET, lenA); // semicolonA - lenA is the same address as startA
+                    UNSAFE.putInt(lenPtr, lenA);
+                    UNSAFE.putInt(basePtr + MAX_OFFSET, Integer.MIN_VALUE);
+                    UNSAFE.putInt(basePtr + MIN_OFFSET, Integer.MAX_VALUE);
+                    return basePtr;
+                }
+                if (len == lenA) { // only slots holding a name of equal length can match
+                    boolean match = true;
+                    long namePtr = basePtr + NAME_OFFSET;
+                    int fullLen = (len >> 3) << 3; // length rounded down to a multiple of 8
+                    long offset;
+                    // todo: this is worth exploring further.
+                    // @mtopolnik has an interesting algo with 2 unconditioned long loads: this is sufficient
+                    // for majority of names. so we would be left with just a single branch which is almost never taken?
+                    for (offset = 0; offset < fullLen; offset += 8) { // compare all complete 8-byte words; &= keeps the loop free of early exits
+                        match &= (UNSAFE.getLong(startA + offset) == UNSAFE.getLong(namePtr + offset));
+                    }
+
+                    long maskedWordInMap = UNSAFE.getLong(namePtr + offset); // trailing word of the stored name - presumably zero past the name because the map is zero-filled before use; confirm
+                    match &= (maskedWordInMap == maskedWordA);
+
+                    if (match) {
+                        return basePtr;
+                    }
+                }
+                mapIndexA = ++mapIndexA & MAP_MASK; // advance to the next slot, wrapping around at the end of the map
+            }
+        }
+    }
+
+}

From eaa4050a1b479aaeac3ac2ea1caf8b5da1bbd42d Mon Sep 17 00:00:00 2001
From: Dr Ian Preston <ianopolous@protonmail.com>
Date: Mon, 15 Jan 2024 17:58:23 +0000
Subject: [PATCH 020/268] 12s (25%) faster on 4 core i7 (#421)

---
 .../onebrc/CalculateAverage_ianopolous.java   | 120 ++++++++++--------
 1 file changed, 64 insertions(+), 56 deletions(-)

diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_ianopolous.java b/src/main/java/dev/morling/onebrc/CalculateAverage_ianopolous.java
index 834de7460..4d82d8809 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_ianopolous.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_ianopolous.java
@@ -18,85 +18,88 @@
 import java.io.*;
 import java.nio.*;
 import java.nio.channels.*;
+import java.util.concurrent.*;
 import java.util.stream.*;
 import java.util.*;
 
-/* A simple implementation that memory maps the file, reads chunks in parallel and minimises allocation without any unsafe.
+/* A simple implementation aiming for readability.
+ * Features:
+ * * memory mapped file
+ * * read chunks in parallel
+ * * minimise allocation
+ * * no unsafe
  *
  * Timings on 4 core i7-7500U CPU @ 2.70GHz:
  * average_baseline: 4m48s
- * ianopolous:         48s
+ * ianopolous:         36s
 */
 public class CalculateAverage_ianopolous {
 
     public static final int MAX_LINE_LENGTH = 107;
-    public static final int MAX_STATIONS = 10000;
+    public static final int MAX_STATIONS = 10_000;
 
-    public static void main(String[] args) {
+    public static void main(String[] args) throws Exception {
         File input = new File("./measurements.txt");
         long filesize = input.length();
-        long chunkSize = 256 * 1024 * 1024;
+        // keep chunk size between 256 MB and 1G (1 chunk for files < 256MB)
+        long chunkSize = Math.min(Math.max(filesize / 32, 256 * 1024 * 1024), 1024 * 1024 * 1024L);
         int nChunks = (int) ((filesize + chunkSize - 1) / chunkSize);
-        List<HashMap<String, Stat>> allResults = IntStream.range(0, nChunks).mapToObj(i -> {
-            HashMap<String, Stat> results = new HashMap(512);
-            parseStats(i * chunkSize, Math.min((i + 1) * chunkSize, filesize), results);
-            return results;
-        }).parallel().toList();
-        HashMap<String, Stat> result = allResults.getFirst();
-        for (int i = 1; i < allResults.size(); ++i) {
-            for (Map.Entry<String, Stat> entry : allResults.get(i).entrySet()) {
-                Stat current = result.putIfAbsent(entry.getKey(), entry.getValue());
-                if (current != null) {
-                    current.merge(entry.getValue());
-                }
-            }
-        }
-
-        System.out.println(new TreeMap<>(result));
-    }
+        ExecutorService pool = Executors.newVirtualThreadPerTaskExecutor();
+        List<Future<List<List<Stat>>>> allResults = IntStream.range(0, nChunks)
+                .mapToObj(i -> pool.submit(() -> parseStats(i * chunkSize, Math.min((i + 1) * chunkSize, filesize))))
+                .toList();
 
-    public record Station(String name, ByteBuffer buf) {
+        TreeMap<String, Stat> merged = allResults.stream()
+                .parallel()
+                .flatMap(f -> {
+                    try {
+                        return f.get().stream().filter(Objects::nonNull).flatMap(Collection::stream);
+                    }
+                    catch (Exception e) {
+                        return Stream.empty();
+                    }
+                })
+                .collect(Collectors.toMap(s -> s.name(), s -> s, (a, b) -> a.merge(b), TreeMap::new));
+        System.out.println(merged);
     }
 
-    public static boolean matchingStationBytes(int start, int end, MappedByteBuffer buffer, Station existing) {
-        buffer.position(start);
+    public static boolean matchingStationBytes(int start, int end, MappedByteBuffer buffer, Stat existing) {
         for (int i = start; i < end; i++) {
-            if (existing.buf.get(i - start) != buffer.get(i))
+            if (existing.name[i - start] != buffer.get(i))
                 return false;
         }
         return true;
     }
 
-    public static Station parseStation(int start, int end, int hash, MappedByteBuffer buffer, List<List<Station>> stations) {
+    public static Stat parseStation(int start, int end, int hash, MappedByteBuffer buffer, List<List<Stat>> stations) {
         int index = Math.floorMod(hash, MAX_STATIONS);
-        List<Station> matches = stations.get(index);
+        List<Stat> matches = stations.get(index);
         if (matches == null) {
-            List<Station> value = new ArrayList<>();
+            List<Stat> value = new ArrayList<>();
             byte[] stationBuffer = new byte[end - start];
             buffer.position(start);
             buffer.get(stationBuffer);
-            String name = new String(stationBuffer);
-            Station res = new Station(name, ByteBuffer.wrap(stationBuffer));
+            Stat res = new Stat(stationBuffer);
             value.add(res);
             stations.set(index, value);
             return res;
         }
         else {
             for (int i = 0; i < matches.size(); i++) {
-                Station s = matches.get(i);
+                Stat s = matches.get(i);
                 if (matchingStationBytes(start, end, buffer, s))
                     return s;
             }
             byte[] stationBuffer = new byte[end - start];
             buffer.position(start);
             buffer.get(stationBuffer);
-            Station res = new Station(new String(stationBuffer), ByteBuffer.wrap(stationBuffer));
+            Stat res = new Stat(stationBuffer);
             matches.add(res);
             return res;
         }
     }
 
-    public static void parseStats(long startByte, long endByte, Map<String, Stat> results) {
+    public static List<List<Stat>> parseStats(long startByte, long endByte) {
         try {
             RandomAccessFile file = new RandomAccessFile("./measurements.txt", "r");
             long maxEnd = Math.min(file.length(), endByte + MAX_LINE_LENGTH);
@@ -117,30 +120,22 @@ public static void parseStats(long startByte, long endByte, Map<String, Stat> re
                 }
             }
 
-            List<List<Station>> stations = new ArrayList<>(MAX_STATIONS);
+            List<List<Stat>> stations = new ArrayList<>(MAX_STATIONS);
             for (int i = 0; i < MAX_STATIONS; i++)
                 stations.add(null);
             int lineStart = done;
             int lineSplit = 0;
-            long temperature = 0;
+            short temperature = 0;
             int hash = 1;
             boolean negative = false;
             while (done < maxDone) {
-                Station station = null;
+                Stat station = null;
                 for (int i = done; i < done + MAX_LINE_LENGTH && i < maxEnd; i++) {
                     byte b = buffer.get(i);
                     if (b == '\n') {
                         done = i + 1;
-                        Stat res = results.get(station.name);
-                        temperature = negative ? -temperature : temperature;
-                        if (res != null) {
-                            res.add(temperature);
-                        }
-                        else {
-                            res = new Stat();
-                            res.add(temperature);
-                            results.put(station.name, res);
-                        }
+                        temperature = negative ? (short) -temperature : temperature;
+                        station.add(temperature);
                         lineStart = done;
                         station = null;
                         hash = 1;
@@ -152,17 +147,18 @@ public static void parseStats(long startByte, long endByte, Map<String, Stat> re
                         temperature = 0;
                         negative = false;
                     }
-                    else if (b == '-' && station != null) {
-                        negative = true;
+                    else if (station == null) {
+                        hash = 31 * hash + b;
                     }
-                    else if (b != '.' && station != null) {
-                        temperature = temperature * 10 + (b - 0x30);
+                    else if (b == '-') {
+                        negative = true;
                     }
-                    else {
-                        hash = 31 * hash + b;
+                    else if (b != '.') {
+                        temperature = (short) (temperature * 10 + (b - 0x30));
                     }
                 }
             }
+            return stations;
         }
         catch (IOException e) {
             throw new RuntimeException(e);
@@ -170,9 +166,16 @@ else if (b != '.' && station != null) {
     }
 
     public static class Stat {
-        long min = Long.MAX_VALUE, max = Long.MIN_VALUE, total = 0, count = 0;
+        final byte[] name;
+        int count = 0;
+        short min = Short.MAX_VALUE, max = Short.MIN_VALUE;
+        long total = 0;
 
-        public void add(long value) {
+        public Stat(byte[] name) {
+            this.name = name;
+        }
+
+        public void add(short value) {
             if (value < min)
                 min = value;
             if (value > max)
@@ -181,19 +184,24 @@ public void add(long value) {
             count++;
         }
 
-        public void merge(Stat value) {
+        public Stat merge(Stat value) {
             if (value.min < min)
                 min = value.min;
             if (value.max > max)
                 max = value.max;
             total += value.total;
             count += value.count;
+            return this;
         }
 
         private static double round(double value) {
             return Math.round(value) / 10.0;
         }
 
+        public String name() {
+            return new String(name);
+        }
+
         public String toString() {
             return round((double) min) + "/" + round(((double) total) / count) + "/" + round((double) max);
         }

From 785e517c1445ed00ae6e00d8e7d95e1d903a1d47 Mon Sep 17 00:00:00 2001
From: eriklumme <29859656+eriklumme@users.noreply.github.com>
Date: Mon, 15 Jan 2024 20:03:51 +0200
Subject: [PATCH 021/268] CalculateAverage_eriklumme first submission (#221)

* Initial commit with custom implementation, 2:40

* Initial file-channel based version, 1:27

* Individual maps for executors, 0:54

* Use better-suited map: 0:34

* Verified correct, skip CharBuffer, :37

* Minor improvements and cleanup, 0:24

* String to byte[], 0:21

* Additional cleanup, use GraalVM, 0:17

* Faster number handling, 0:11

* Faster buffer reading, 0:08

* Prepare for environment with variable RAM and CPU, 0:08

* Fix bug causing issues with certain buffer sizes

* Larger overhead to not miss long station names that overlap buffers

* Reorder scripts and fix one-off bug
---
 calculate_average_eriklumme.sh                |  19 +
 prepare_eriklumme.sh                          |  19 +
 .../onebrc/CalculateAverage_eriklumme.java    | 373 ++++++++++++++++++
 3 files changed, 411 insertions(+)
 create mode 100755 calculate_average_eriklumme.sh
 create mode 100755 prepare_eriklumme.sh
 create mode 100644 src/main/java/dev/morling/onebrc/CalculateAverage_eriklumme.java

diff --git a/calculate_average_eriklumme.sh b/calculate_average_eriklumme.sh
new file mode 100755
index 000000000..793af9b09
--- /dev/null
+++ b/calculate_average_eriklumme.sh
@@ -0,0 +1,19 @@
+#!/bin/sh
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+JAVA_OPTS="-Xms6g -Xmx6g" # initial and maximum heap both set to 6 GB
+java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_eriklumme # $JAVA_OPTS is unquoted so that it word-splits into two separate JVM options
diff --git a/prepare_eriklumme.sh b/prepare_eriklumme.sh
new file mode 100755
index 000000000..f83a3ff69
--- /dev/null
+++ b/prepare_eriklumme.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+source "$HOME/.sdkman/bin/sdkman-init.sh" # load SDKMAN so the sdk command is available in this shell
+sdk use java 21.0.1-graal 1>&2 # switch this shell to the 21.0.1-graal JDK; sdk's stdout is redirected to stderr
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_eriklumme.java b/src/main/java/dev/morling/onebrc/CalculateAverage_eriklumme.java
new file mode 100644
index 000000000..768be4a25
--- /dev/null
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_eriklumme.java
@@ -0,0 +1,373 @@
+/*
+ *  Copyright 2023 The original authors
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package dev.morling.onebrc;
+
+import java.io.FileInputStream;
+import java.nio.MappedByteBuffer;
+import java.nio.channels.FileChannel;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.concurrent.Callable;
+import java.util.concurrent.CountDownLatch;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+
+public class CalculateAverage_eriklumme {
+
+    private static final String FILE = "./measurements.txt";
+    private static final int NUM_CPUS = Runtime.getRuntime().availableProcessors();
+    private static final int LINE_OVERHEAD = 208; // extra bytes mapped past a task's share so a line crossing the boundary can be finished
+    private static final int NUM_TASKS = NUM_CPUS * 6; // the file is cut into more tasks than there are worker threads
+
+    private final CountDownLatch countDownLatch = new CountDownLatch(NUM_TASKS); // counted down once per task, including tasks that are skipped
+
+    private final FileInputStream fileInputStream = new FileInputStream(FILE);
+    private final FileChannel fileChannel = fileInputStream.getChannel();
+    private final long fileSize = fileChannel.size();
+    private final int fileSizePerThread = (int) Math.max(Math.ceil(fileSize / (double) NUM_TASKS), 1000); // divide in double: float's 24-bit mantissa can round a multi-GB quotient down, leaving the tail of the file unassigned
+
+    private CalculateAverage_eriklumme() throws Exception { // Runs the whole job: fan out DataProcessor tasks, merge their maps, print the sorted result.
+        Map<ByteArrayWrapper, StationMeasurement> map = new HashMap<>(); // merged totals across all tasks
+
+        try (ExecutorService executorService = Executors.newFixedThreadPool(NUM_CPUS); fileInputStream; fileChannel) {
+            long sizeAccountedFor = 0;
+
+            List<Future<Map<ByteArrayWrapper, StationMeasurement>>> futures = new ArrayList<>(NUM_TASKS);
+            for (int i = 0; i < NUM_TASKS; i++) {
+                if (sizeAccountedFor >= fileSize) {
+                    // The file is so small that because of the minimum file size per thread, we've covered it in less
+                    // threads than expected
+                    countDownLatch.countDown(); // keep the latch balanced for the task that is not submitted
+                    continue;
+                }
+                futures.add(executorService.submit(new DataProcessor(i)));
+                sizeAccountedFor += fileSizePerThread;
+            }
+            countDownLatch.await();
+
+            for (Future<Map<ByteArrayWrapper, StationMeasurement>> future : futures) {
+                Map<ByteArrayWrapper, StationMeasurement> futureMap = future.get(); // also rethrows any failure of the task
+                futureMap.forEach((key, value) -> map.merge(key, value,
+                        (st1, st2) -> { // fold st2 into the measurement that is already in the merged map
+                            st1.sum += st2.sum;
+                            st1.count += st2.count;
+                            st1.min = Math.min(st1.min, st2.min);
+                            st1.max = Math.max(st1.max, st2.max);
+                            return st1;
+                        }));
+            }
+        }
+
+        StringBuilder result = new StringBuilder("{");
+        boolean first = true;
+        List<StationMeasurement> values = new ArrayList<>(map.values());
+        values.sort(Comparator.comparing(StationMeasurement::stringName)); // output is ordered by decoded station name
+
+        for (StationMeasurement stationMeasurement : values) {
+            if (!first) {
+                result.append(", ");
+            }
+            first = false;
+            result.append(stationMeasurement.stringName()).append("=");
+            result.append(DECIMAL_LOOKUP[stationMeasurement.min + 1000]); // lookup table index is the stored value shifted by +1000
+            result.append(String.format(Locale.ROOT, "/%.1f/", (stationMeasurement.sum / (stationMeasurement.count * 10.0)))); // Locale.ROOT keeps '.' as the decimal separator under any default locale
+            result.append(DECIMAL_LOOKUP[stationMeasurement.max + 1000]);
+        }
+        result.append("}");
+
+        System.out.println(result);
+    }
+
+    private static class StationMeasurement { // Mutable running aggregate for a single station.
+        private final ByteArrayWrapper stationName;
+        private int min = Integer.MAX_VALUE;
+        private int max = Integer.MIN_VALUE;
+        private long sum = 0;
+        private int count = 0;
+
+        private StationMeasurement(ByteArrayWrapper stationName) {
+            this.stationName = stationName;
+        }
+
+        /** Decodes the station name bytes as UTF-8; every call builds a new String. */
+        public String stringName() {
+            return new String(stationName.value(), StandardCharsets.UTF_8);
+        }
+    }
+
+    private enum Mode { // Parser state of a DataProcessor while it scans bytes.
+        UNINITIALIZED, // start state of every processor except index 0: bytes are ignored until the first newline
+        READ_STATION, // bytes are collected as station name until ';' is seen
+        READ_VALUE // bytes after ';' are collected as the measurement value
+    }
+
+    private record ByteArrayWrapper(byte[] value) {
+
+        /**
+         * Compares by array content rather than by array identity, so that two
+         * wrappers around equal byte sequences behave as the same map key.
+         */
+        @Override
+        public boolean equals(Object o) {
+            return this == o || (o instanceof ByteArrayWrapper other && Arrays.equals(value, other.value));
+        }
+
+        /** Content-based hash, consistent with {@link #equals(Object)}. */
+        @Override
+        public int hashCode() {
+            return Arrays.hashCode(value);
+        }
+    }
+
+    public class DataProcessor implements Callable<Map<ByteArrayWrapper, StationMeasurement>> {
+        // Parses the slice of the input file selected by processorIndex into a per-station map of aggregates.
+        private final int processorIndex;
+
+        public DataProcessor(int processorIndex) {
+            this.processorIndex = processorIndex;
+        }
+        // Returns the aggregates found in this slice; countDownLatch is counted down on every exit path, including failures.
+        @Override
+        public Map<ByteArrayWrapper, StationMeasurement> call() throws Exception {
+            Map<ByteArrayWrapper, StationMeasurement> map = new HashMap<>();
+            // Scratch buffers holding the station name and the value text of the line currently being parsed.
+            byte[] stationBuffer = new byte[200];
+            int stationIndex = 0;
+
+            byte[] valueBuffer = new byte[10];
+            int valueIndex = 0;
+            // Every slice except the first starts UNINITIALIZED and ignores bytes up to its first newline.
+            Mode mode = processorIndex == 0 ? Mode.READ_STATION : Mode.UNINITIALIZED;
+            byte b;
+            // Map this slice plus up to LINE_OVERHEAD extra bytes, capped at the end of the file.
+            long offset = ((long) fileSizePerThread) * processorIndex;
+            long sizeWithOverhead = Math.min(((long) fileSizePerThread) + LINE_OVERHEAD, fileSize - offset);
+
+            try {
+                MappedByteBuffer buffer = fileChannel.map(FileChannel.MapMode.READ_ONLY, offset, sizeWithOverhead);
+                // Read from buffer in chunks (each about a sixth of the mapping) for improved performance
+                byte[] bytes = new byte[(int) (sizeWithOverhead / 6) + 1];
+
+                while (buffer.hasRemaining()) {
+                    long bytesRemaining = sizeWithOverhead - buffer.position();
+                    int bytesOffset, bytesLength;
+                    if (bytesRemaining >= bytes.length) {
+                        bytesOffset = 0;
+                        bytesLength = bytes.length;
+                    }
+                    else {
+                        bytesOffset = (int) (bytes.length - bytesRemaining);
+                        bytesLength = (int) bytesRemaining;
+                    }
+                    buffer.get(bytes, bytesOffset, bytesLength);
+                    // A short final chunk was written to the tail of bytes, so the scan below always ends at bytes.length.
+                    for (int i = bytesOffset; i < bytes.length; i++) {
+                        b = bytes[i];
+                        if (b == '\n') {
+                            // End of a line: if a value was being read, fold the buffered station/value into the map
+                            if (mode == Mode.READ_VALUE) {
+                                storeStation(map, stationBuffer, stationIndex, valueBuffer, valueIndex);
+                                stationIndex = 0;
+                                valueIndex = 0;
+                            }
+                            mode = Mode.READ_STATION;
+                            // buffer.position() - bytes.length + i is the index of this byte within the mapping.
+                            // A newline at or beyond fileSizePerThread means this slice is finished.
+                            if (buffer.position() - bytes.length + i >= fileSizePerThread) {
+                                return map;
+                            }
+                        }
+                        else if (mode == Mode.UNINITIALIZED) {
+                            // Still before the first newline of this slice; ignore the byte
+                        }
+                        else if (b == ';') {
+                            mode = Mode.READ_VALUE;
+                        }
+                        else if (mode == Mode.READ_STATION) {
+                            stationBuffer[stationIndex++] = b;
+                        }
+                        else {
+                            valueBuffer[valueIndex++] = b;
+                        }
+                    }
+                }
+                if (mode == Mode.READ_VALUE && valueIndex > 0) {
+                    // The mapping ended without a trailing newline; store the record that is still buffered
+                    storeStation(map, stationBuffer, stationIndex, valueBuffer, valueIndex);
+                }
+            }
+            finally {
+                countDownLatch.countDown();
+            }
+            return map;
+        }
+        // Parses the buffered value as an int number of tenths ("-12.3" -> -123) and updates count/min/max/sum for the station.
+        private void storeStation(Map<ByteArrayWrapper, StationMeasurement> map, byte[] stationBuffer, int stationIndex, byte[] valueBuffer, int valueIndex) {
+            ByteArrayWrapper stationName = new ByteArrayWrapper(Arrays.copyOfRange(stationBuffer, 0, stationIndex));
+            // Walk the value text right to left: i == 0 is the fractional digit, i == 1 the '.', i == 2 units, i == 3 tens.
+            int value = 0;
+            for (int i = 0; i < valueIndex; i++) {
+                byte b = valueBuffer[valueIndex - i - 1];
+                if (i == 1) {
+                    // Skip the decimal point
+                }
+                else if (b == '-') {
+                    // The leading '-' is visited last, after every digit has been added, so negate the total
+                    value = (-value);
+                }
+                else {
+                    int valueAtIndex = b - 48;
+                    if (i == 0) {
+                        value += valueAtIndex;
+                    }
+                    else {
+                        value += valueAtIndex * (i == 2 ? 10 : 100);
+                    }
+                }
+            }
+            StationMeasurement stationMeasurement = map.computeIfAbsent(stationName, StationMeasurement::new);
+            stationMeasurement.count++;
+            stationMeasurement.min = Math.min(value, stationMeasurement.min);
+            stationMeasurement.max = Math.max(value, stationMeasurement.max);
+            stationMeasurement.sum += value;
+        }
+    }
+
+    public static void main(String[] args) throws Exception {
+        Locale.setDefault(Locale.US); // presumably so formatted numbers use '.' as the decimal separator - TODO confirm
+        new CalculateAverage_eriklumme(); // nothing else is invoked here; the constructor presumably drives the whole run (not visible here)
+    }
+
+    private static final String[] DECIMAL_LOOKUP = new String[]{ // "-100.0" .. "99.9" ascending in 0.1 steps; entry i is the text of (i - 1000) / 10
+            "-100.0", "-99.9", "-99.8", "-99.7", "-99.6", "-99.5", "-99.4", "-99.3", "-99.2", "-99.1", "-99.0", "-98.9", "-98.8", "-98.7", "-98.6", "-98.5", "-98.4",
+            "-98.3", "-98.2", "-98.1", "-98.0", "-97.9", "-97.8", "-97.7", "-97.6", "-97.5", "-97.4", "-97.3", "-97.2", "-97.1", "-97.0", "-96.9", "-96.8", "-96.7",
+            "-96.6", "-96.5", "-96.4", "-96.3", "-96.2", "-96.1", "-96.0", "-95.9", "-95.8", "-95.7", "-95.6", "-95.5", "-95.4", "-95.3", "-95.2", "-95.1", "-95.0",
+            "-94.9", "-94.8", "-94.7", "-94.6", "-94.5", "-94.4", "-94.3", "-94.2", "-94.1", "-94.0", "-93.9", "-93.8", "-93.7", "-93.6", "-93.5", "-93.4", "-93.3",
+            "-93.2", "-93.1", "-93.0", "-92.9", "-92.8", "-92.7", "-92.6", "-92.5", "-92.4", "-92.3", "-92.2", "-92.1", "-92.0", "-91.9", "-91.8", "-91.7", "-91.6",
+            "-91.5", "-91.4", "-91.3", "-91.2", "-91.1", "-91.0", "-90.9", "-90.8", "-90.7", "-90.6", "-90.5", "-90.4", "-90.3", "-90.2", "-90.1", "-90.0", "-89.9",
+            "-89.8", "-89.7", "-89.6", "-89.5", "-89.4", "-89.3", "-89.2", "-89.1", "-89.0", "-88.9", "-88.8", "-88.7", "-88.6", "-88.5", "-88.4", "-88.3", "-88.2",
+            "-88.1", "-88.0", "-87.9", "-87.8", "-87.7", "-87.6", "-87.5", "-87.4", "-87.3", "-87.2", "-87.1", "-87.0", "-86.9", "-86.8", "-86.7", "-86.6", "-86.5",
+            "-86.4", "-86.3", "-86.2", "-86.1", "-86.0", "-85.9", "-85.8", "-85.7", "-85.6", "-85.5", "-85.4", "-85.3", "-85.2", "-85.1", "-85.0", "-84.9", "-84.8",
+            "-84.7", "-84.6", "-84.5", "-84.4", "-84.3", "-84.2", "-84.1", "-84.0", "-83.9", "-83.8", "-83.7", "-83.6", "-83.5", "-83.4", "-83.3", "-83.2", "-83.1",
+            "-83.0", "-82.9", "-82.8", "-82.7", "-82.6", "-82.5", "-82.4", "-82.3", "-82.2", "-82.1", "-82.0", "-81.9", "-81.8", "-81.7", "-81.6", "-81.5", "-81.4",
+            "-81.3", "-81.2", "-81.1", "-81.0", "-80.9", "-80.8", "-80.7", "-80.6", "-80.5", "-80.4", "-80.3", "-80.2", "-80.1", "-80.0", "-79.9", "-79.8", "-79.7",
+            "-79.6", "-79.5", "-79.4", "-79.3", "-79.2", "-79.1", "-79.0", "-78.9", "-78.8", "-78.7", "-78.6", "-78.5", "-78.4", "-78.3", "-78.2", "-78.1", "-78.0",
+            "-77.9", "-77.8", "-77.7", "-77.6", "-77.5", "-77.4", "-77.3", "-77.2", "-77.1", "-77.0", "-76.9", "-76.8", "-76.7", "-76.6", "-76.5", "-76.4", "-76.3",
+            "-76.2", "-76.1", "-76.0", "-75.9", "-75.8", "-75.7", "-75.6", "-75.5", "-75.4", "-75.3", "-75.2", "-75.1", "-75.0", "-74.9", "-74.8", "-74.7", "-74.6",
+            "-74.5", "-74.4", "-74.3", "-74.2", "-74.1", "-74.0", "-73.9", "-73.8", "-73.7", "-73.6", "-73.5", "-73.4", "-73.3", "-73.2", "-73.1", "-73.0", "-72.9",
+            "-72.8", "-72.7", "-72.6", "-72.5", "-72.4", "-72.3", "-72.2", "-72.1", "-72.0", "-71.9", "-71.8", "-71.7", "-71.6", "-71.5", "-71.4", "-71.3", "-71.2",
+            "-71.1", "-71.0", "-70.9", "-70.8", "-70.7", "-70.6", "-70.5", "-70.4", "-70.3", "-70.2", "-70.1", "-70.0", "-69.9", "-69.8", "-69.7", "-69.6", "-69.5",
+            "-69.4", "-69.3", "-69.2", "-69.1", "-69.0", "-68.9", "-68.8", "-68.7", "-68.6", "-68.5", "-68.4", "-68.3", "-68.2", "-68.1", "-68.0", "-67.9", "-67.8",
+            "-67.7", "-67.6", "-67.5", "-67.4", "-67.3", "-67.2", "-67.1", "-67.0", "-66.9", "-66.8", "-66.7", "-66.6", "-66.5", "-66.4", "-66.3", "-66.2", "-66.1",
+            "-66.0", "-65.9", "-65.8", "-65.7", "-65.6", "-65.5", "-65.4", "-65.3", "-65.2", "-65.1", "-65.0", "-64.9", "-64.8", "-64.7", "-64.6", "-64.5", "-64.4",
+            "-64.3", "-64.2", "-64.1", "-64.0", "-63.9", "-63.8", "-63.7", "-63.6", "-63.5", "-63.4", "-63.3", "-63.2", "-63.1", "-63.0", "-62.9", "-62.8", "-62.7",
+            "-62.6", "-62.5", "-62.4", "-62.3", "-62.2", "-62.1", "-62.0", "-61.9", "-61.8", "-61.7", "-61.6", "-61.5", "-61.4", "-61.3", "-61.2", "-61.1", "-61.0",
+            "-60.9", "-60.8", "-60.7", "-60.6", "-60.5", "-60.4", "-60.3", "-60.2", "-60.1", "-60.0", "-59.9", "-59.8", "-59.7", "-59.6", "-59.5", "-59.4", "-59.3",
+            "-59.2", "-59.1", "-59.0", "-58.9", "-58.8", "-58.7", "-58.6", "-58.5", "-58.4", "-58.3", "-58.2", "-58.1", "-58.0", "-57.9", "-57.8", "-57.7", "-57.6",
+            "-57.5", "-57.4", "-57.3", "-57.2", "-57.1", "-57.0", "-56.9", "-56.8", "-56.7", "-56.6", "-56.5", "-56.4", "-56.3", "-56.2", "-56.1", "-56.0", "-55.9",
+            "-55.8", "-55.7", "-55.6", "-55.5", "-55.4", "-55.3", "-55.2", "-55.1", "-55.0", "-54.9", "-54.8", "-54.7", "-54.6", "-54.5", "-54.4", "-54.3", "-54.2",
+            "-54.1", "-54.0", "-53.9", "-53.8", "-53.7", "-53.6", "-53.5", "-53.4", "-53.3", "-53.2", "-53.1", "-53.0", "-52.9", "-52.8", "-52.7", "-52.6", "-52.5",
+            "-52.4", "-52.3", "-52.2", "-52.1", "-52.0", "-51.9", "-51.8", "-51.7", "-51.6", "-51.5", "-51.4", "-51.3", "-51.2", "-51.1", "-51.0", "-50.9", "-50.8",
+            "-50.7", "-50.6", "-50.5", "-50.4", "-50.3", "-50.2", "-50.1", "-50.0", "-49.9", "-49.8", "-49.7", "-49.6", "-49.5", "-49.4", "-49.3", "-49.2", "-49.1",
+            "-49.0", "-48.9", "-48.8", "-48.7", "-48.6", "-48.5", "-48.4", "-48.3", "-48.2", "-48.1", "-48.0", "-47.9", "-47.8", "-47.7", "-47.6", "-47.5", "-47.4",
+            "-47.3", "-47.2", "-47.1", "-47.0", "-46.9", "-46.8", "-46.7", "-46.6", "-46.5", "-46.4", "-46.3", "-46.2", "-46.1", "-46.0", "-45.9", "-45.8", "-45.7",
+            "-45.6", "-45.5", "-45.4", "-45.3", "-45.2", "-45.1", "-45.0", "-44.9", "-44.8", "-44.7", "-44.6", "-44.5", "-44.4", "-44.3", "-44.2", "-44.1", "-44.0",
+            "-43.9", "-43.8", "-43.7", "-43.6", "-43.5", "-43.4", "-43.3", "-43.2", "-43.1", "-43.0", "-42.9", "-42.8", "-42.7", "-42.6", "-42.5", "-42.4", "-42.3",
+            "-42.2", "-42.1", "-42.0", "-41.9", "-41.8", "-41.7", "-41.6", "-41.5", "-41.4", "-41.3", "-41.2", "-41.1", "-41.0", "-40.9", "-40.8", "-40.7", "-40.6",
+            "-40.5", "-40.4", "-40.3", "-40.2", "-40.1", "-40.0", "-39.9", "-39.8", "-39.7", "-39.6", "-39.5", "-39.4", "-39.3", "-39.2", "-39.1", "-39.0", "-38.9",
+            "-38.8", "-38.7", "-38.6", "-38.5", "-38.4", "-38.3", "-38.2", "-38.1", "-38.0", "-37.9", "-37.8", "-37.7", "-37.6", "-37.5", "-37.4", "-37.3", "-37.2",
+            "-37.1", "-37.0", "-36.9", "-36.8", "-36.7", "-36.6", "-36.5", "-36.4", "-36.3", "-36.2", "-36.1", "-36.0", "-35.9", "-35.8", "-35.7", "-35.6", "-35.5",
+            "-35.4", "-35.3", "-35.2", "-35.1", "-35.0", "-34.9", "-34.8", "-34.7", "-34.6", "-34.5", "-34.4", "-34.3", "-34.2", "-34.1", "-34.0", "-33.9", "-33.8",
+            "-33.7", "-33.6", "-33.5", "-33.4", "-33.3", "-33.2", "-33.1", "-33.0", "-32.9", "-32.8", "-32.7", "-32.6", "-32.5", "-32.4", "-32.3", "-32.2", "-32.1",
+            "-32.0", "-31.9", "-31.8", "-31.7", "-31.6", "-31.5", "-31.4", "-31.3", "-31.2", "-31.1", "-31.0", "-30.9", "-30.8", "-30.7", "-30.6", "-30.5", "-30.4",
+            "-30.3", "-30.2", "-30.1", "-30.0", "-29.9", "-29.8", "-29.7", "-29.6", "-29.5", "-29.4", "-29.3", "-29.2", "-29.1", "-29.0", "-28.9", "-28.8", "-28.7",
+            "-28.6", "-28.5", "-28.4", "-28.3", "-28.2", "-28.1", "-28.0", "-27.9", "-27.8", "-27.7", "-27.6", "-27.5", "-27.4", "-27.3", "-27.2", "-27.1", "-27.0",
+            "-26.9", "-26.8", "-26.7", "-26.6", "-26.5", "-26.4", "-26.3", "-26.2", "-26.1", "-26.0", "-25.9", "-25.8", "-25.7", "-25.6", "-25.5", "-25.4", "-25.3",
+            "-25.2", "-25.1", "-25.0", "-24.9", "-24.8", "-24.7", "-24.6", "-24.5", "-24.4", "-24.3", "-24.2", "-24.1", "-24.0", "-23.9", "-23.8", "-23.7", "-23.6",
+            "-23.5", "-23.4", "-23.3", "-23.2", "-23.1", "-23.0", "-22.9", "-22.8", "-22.7", "-22.6", "-22.5", "-22.4", "-22.3", "-22.2", "-22.1", "-22.0", "-21.9",
+            "-21.8", "-21.7", "-21.6", "-21.5", "-21.4", "-21.3", "-21.2", "-21.1", "-21.0", "-20.9", "-20.8", "-20.7", "-20.6", "-20.5", "-20.4", "-20.3", "-20.2",
+            "-20.1", "-20.0", "-19.9", "-19.8", "-19.7", "-19.6", "-19.5", "-19.4", "-19.3", "-19.2", "-19.1", "-19.0", "-18.9", "-18.8", "-18.7", "-18.6", "-18.5",
+            "-18.4", "-18.3", "-18.2", "-18.1", "-18.0", "-17.9", "-17.8", "-17.7", "-17.6", "-17.5", "-17.4", "-17.3", "-17.2", "-17.1", "-17.0", "-16.9", "-16.8",
+            "-16.7", "-16.6", "-16.5", "-16.4", "-16.3", "-16.2", "-16.1", "-16.0", "-15.9", "-15.8", "-15.7", "-15.6", "-15.5", "-15.4", "-15.3", "-15.2", "-15.1",
+            "-15.0", "-14.9", "-14.8", "-14.7", "-14.6", "-14.5", "-14.4", "-14.3", "-14.2", "-14.1", "-14.0", "-13.9", "-13.8", "-13.7", "-13.6", "-13.5", "-13.4",
+            "-13.3", "-13.2", "-13.1", "-13.0", "-12.9", "-12.8", "-12.7", "-12.6", "-12.5", "-12.4", "-12.3", "-12.2", "-12.1", "-12.0", "-11.9", "-11.8", "-11.7",
+            "-11.6", "-11.5", "-11.4", "-11.3", "-11.2", "-11.1", "-11.0", "-10.9", "-10.8", "-10.7", "-10.6", "-10.5", "-10.4", "-10.3", "-10.2", "-10.1", "-10.0",
+            "-9.9", "-9.8", "-9.7", "-9.6", "-9.5", "-9.4", "-9.3", "-9.2", "-9.1", "-9.0", "-8.9", "-8.8", "-8.7", "-8.6", "-8.5", "-8.4", "-8.3", "-8.2", "-8.1",
+            "-8.0", "-7.9", "-7.8", "-7.7", "-7.6", "-7.5", "-7.4", "-7.3", "-7.2", "-7.1", "-7.0", "-6.9", "-6.8", "-6.7", "-6.6", "-6.5", "-6.4", "-6.3", "-6.2",
+            "-6.1", "-6.0", "-5.9", "-5.8", "-5.7", "-5.6", "-5.5", "-5.4", "-5.3", "-5.2", "-5.1", "-5.0", "-4.9", "-4.8", "-4.7", "-4.6", "-4.5", "-4.4", "-4.3",
+            "-4.2", "-4.1", "-4.0", "-3.9", "-3.8", "-3.7", "-3.6", "-3.5", "-3.4", "-3.3", "-3.2", "-3.1", "-3.0", "-2.9", "-2.8", "-2.7", "-2.6", "-2.5", "-2.4",
+            "-2.3", "-2.2", "-2.1", "-2.0", "-1.9", "-1.8", "-1.7", "-1.6", "-1.5", "-1.4", "-1.3", "-1.2", "-1.1", "-1.0", "-0.9", "-0.8", "-0.7", "-0.6", "-0.5",
+            "-0.4", "-0.3", "-0.2", "-0.1", "0.0", "0.1", "0.2", "0.3", "0.4", "0.5", "0.6", "0.7", "0.8", "0.9", "1.0", "1.1", "1.2", "1.3", "1.4", "1.5", "1.6", "1.7",
+            "1.8", "1.9", "2.0", "2.1", "2.2", "2.3", "2.4", "2.5", "2.6", "2.7", "2.8", "2.9", "3.0", "3.1", "3.2", "3.3", "3.4", "3.5", "3.6", "3.7", "3.8", "3.9",
+            "4.0", "4.1", "4.2", "4.3", "4.4", "4.5", "4.6", "4.7", "4.8", "4.9", "5.0", "5.1", "5.2", "5.3", "5.4", "5.5", "5.6", "5.7", "5.8", "5.9", "6.0", "6.1",
+            "6.2", "6.3", "6.4", "6.5", "6.6", "6.7", "6.8", "6.9", "7.0", "7.1", "7.2", "7.3", "7.4", "7.5", "7.6", "7.7", "7.8", "7.9", "8.0", "8.1", "8.2", "8.3",
+            "8.4", "8.5", "8.6", "8.7", "8.8", "8.9", "9.0", "9.1", "9.2", "9.3", "9.4", "9.5", "9.6", "9.7", "9.8", "9.9", "10.0", "10.1", "10.2", "10.3", "10.4",
+            "10.5", "10.6", "10.7", "10.8", "10.9", "11.0", "11.1", "11.2", "11.3", "11.4", "11.5", "11.6", "11.7", "11.8", "11.9", "12.0", "12.1", "12.2", "12.3",
+            "12.4", "12.5", "12.6", "12.7", "12.8", "12.9", "13.0", "13.1", "13.2", "13.3", "13.4", "13.5", "13.6", "13.7", "13.8", "13.9", "14.0", "14.1", "14.2",
+            "14.3", "14.4", "14.5", "14.6", "14.7", "14.8", "14.9", "15.0", "15.1", "15.2", "15.3", "15.4", "15.5", "15.6", "15.7", "15.8", "15.9", "16.0", "16.1",
+            "16.2", "16.3", "16.4", "16.5", "16.6", "16.7", "16.8", "16.9", "17.0", "17.1", "17.2", "17.3", "17.4", "17.5", "17.6", "17.7", "17.8", "17.9", "18.0",
+            "18.1", "18.2", "18.3", "18.4", "18.5", "18.6", "18.7", "18.8", "18.9", "19.0", "19.1", "19.2", "19.3", "19.4", "19.5", "19.6", "19.7", "19.8", "19.9",
+            "20.0", "20.1", "20.2", "20.3", "20.4", "20.5", "20.6", "20.7", "20.8", "20.9", "21.0", "21.1", "21.2", "21.3", "21.4", "21.5", "21.6", "21.7", "21.8",
+            "21.9", "22.0", "22.1", "22.2", "22.3", "22.4", "22.5", "22.6", "22.7", "22.8", "22.9", "23.0", "23.1", "23.2", "23.3", "23.4", "23.5", "23.6", "23.7",
+            "23.8", "23.9", "24.0", "24.1", "24.2", "24.3", "24.4", "24.5", "24.6", "24.7", "24.8", "24.9", "25.0", "25.1", "25.2", "25.3", "25.4", "25.5", "25.6",
+            "25.7", "25.8", "25.9", "26.0", "26.1", "26.2", "26.3", "26.4", "26.5", "26.6", "26.7", "26.8", "26.9", "27.0", "27.1", "27.2", "27.3", "27.4", "27.5",
+            "27.6", "27.7", "27.8", "27.9", "28.0", "28.1", "28.2", "28.3", "28.4", "28.5", "28.6", "28.7", "28.8", "28.9", "29.0", "29.1", "29.2", "29.3", "29.4",
+            "29.5", "29.6", "29.7", "29.8", "29.9", "30.0", "30.1", "30.2", "30.3", "30.4", "30.5", "30.6", "30.7", "30.8", "30.9", "31.0", "31.1", "31.2", "31.3",
+            "31.4", "31.5", "31.6", "31.7", "31.8", "31.9", "32.0", "32.1", "32.2", "32.3", "32.4", "32.5", "32.6", "32.7", "32.8", "32.9", "33.0", "33.1", "33.2",
+            "33.3", "33.4", "33.5", "33.6", "33.7", "33.8", "33.9", "34.0", "34.1", "34.2", "34.3", "34.4", "34.5", "34.6", "34.7", "34.8", "34.9", "35.0", "35.1",
+            "35.2", "35.3", "35.4", "35.5", "35.6", "35.7", "35.8", "35.9", "36.0", "36.1", "36.2", "36.3", "36.4", "36.5", "36.6", "36.7", "36.8", "36.9", "37.0",
+            "37.1", "37.2", "37.3", "37.4", "37.5", "37.6", "37.7", "37.8", "37.9", "38.0", "38.1", "38.2", "38.3", "38.4", "38.5", "38.6", "38.7", "38.8", "38.9",
+            "39.0", "39.1", "39.2", "39.3", "39.4", "39.5", "39.6", "39.7", "39.8", "39.9", "40.0", "40.1", "40.2", "40.3", "40.4", "40.5", "40.6", "40.7", "40.8",
+            "40.9", "41.0", "41.1", "41.2", "41.3", "41.4", "41.5", "41.6", "41.7", "41.8", "41.9", "42.0", "42.1", "42.2", "42.3", "42.4", "42.5", "42.6", "42.7",
+            "42.8", "42.9", "43.0", "43.1", "43.2", "43.3", "43.4", "43.5", "43.6", "43.7", "43.8", "43.9", "44.0", "44.1", "44.2", "44.3", "44.4", "44.5", "44.6",
+            "44.7", "44.8", "44.9", "45.0", "45.1", "45.2", "45.3", "45.4", "45.5", "45.6", "45.7", "45.8", "45.9", "46.0", "46.1", "46.2", "46.3", "46.4", "46.5",
+            "46.6", "46.7", "46.8", "46.9", "47.0", "47.1", "47.2", "47.3", "47.4", "47.5", "47.6", "47.7", "47.8", "47.9", "48.0", "48.1", "48.2", "48.3", "48.4",
+            "48.5", "48.6", "48.7", "48.8", "48.9", "49.0", "49.1", "49.2", "49.3", "49.4", "49.5", "49.6", "49.7", "49.8", "49.9", "50.0", "50.1", "50.2", "50.3",
+            "50.4", "50.5", "50.6", "50.7", "50.8", "50.9", "51.0", "51.1", "51.2", "51.3", "51.4", "51.5", "51.6", "51.7", "51.8", "51.9", "52.0", "52.1", "52.2",
+            "52.3", "52.4", "52.5", "52.6", "52.7", "52.8", "52.9", "53.0", "53.1", "53.2", "53.3", "53.4", "53.5", "53.6", "53.7", "53.8", "53.9", "54.0", "54.1",
+            "54.2", "54.3", "54.4", "54.5", "54.6", "54.7", "54.8", "54.9", "55.0", "55.1", "55.2", "55.3", "55.4", "55.5", "55.6", "55.7", "55.8", "55.9", "56.0",
+            "56.1", "56.2", "56.3", "56.4", "56.5", "56.6", "56.7", "56.8", "56.9", "57.0", "57.1", "57.2", "57.3", "57.4", "57.5", "57.6", "57.7", "57.8", "57.9",
+            "58.0", "58.1", "58.2", "58.3", "58.4", "58.5", "58.6", "58.7", "58.8", "58.9", "59.0", "59.1", "59.2", "59.3", "59.4", "59.5", "59.6", "59.7", "59.8",
+            "59.9", "60.0", "60.1", "60.2", "60.3", "60.4", "60.5", "60.6", "60.7", "60.8", "60.9", "61.0", "61.1", "61.2", "61.3", "61.4", "61.5", "61.6", "61.7",
+            "61.8", "61.9", "62.0", "62.1", "62.2", "62.3", "62.4", "62.5", "62.6", "62.7", "62.8", "62.9", "63.0", "63.1", "63.2", "63.3", "63.4", "63.5", "63.6",
+            "63.7", "63.8", "63.9", "64.0", "64.1", "64.2", "64.3", "64.4", "64.5", "64.6", "64.7", "64.8", "64.9", "65.0", "65.1", "65.2", "65.3", "65.4", "65.5",
+            "65.6", "65.7", "65.8", "65.9", "66.0", "66.1", "66.2", "66.3", "66.4", "66.5", "66.6", "66.7", "66.8", "66.9", "67.0", "67.1", "67.2", "67.3", "67.4",
+            "67.5", "67.6", "67.7", "67.8", "67.9", "68.0", "68.1", "68.2", "68.3", "68.4", "68.5", "68.6", "68.7", "68.8", "68.9", "69.0", "69.1", "69.2", "69.3",
+            "69.4", "69.5", "69.6", "69.7", "69.8", "69.9", "70.0", "70.1", "70.2", "70.3", "70.4", "70.5", "70.6", "70.7", "70.8", "70.9", "71.0", "71.1", "71.2",
+            "71.3", "71.4", "71.5", "71.6", "71.7", "71.8", "71.9", "72.0", "72.1", "72.2", "72.3", "72.4", "72.5", "72.6", "72.7", "72.8", "72.9", "73.0", "73.1",
+            "73.2", "73.3", "73.4", "73.5", "73.6", "73.7", "73.8", "73.9", "74.0", "74.1", "74.2", "74.3", "74.4", "74.5", "74.6", "74.7", "74.8", "74.9", "75.0",
+            "75.1", "75.2", "75.3", "75.4", "75.5", "75.6", "75.7", "75.8", "75.9", "76.0", "76.1", "76.2", "76.3", "76.4", "76.5", "76.6", "76.7", "76.8", "76.9",
+            "77.0", "77.1", "77.2", "77.3", "77.4", "77.5", "77.6", "77.7", "77.8", "77.9", "78.0", "78.1", "78.2", "78.3", "78.4", "78.5", "78.6", "78.7", "78.8",
+            "78.9", "79.0", "79.1", "79.2", "79.3", "79.4", "79.5", "79.6", "79.7", "79.8", "79.9", "80.0", "80.1", "80.2", "80.3", "80.4", "80.5", "80.6", "80.7",
+            "80.8", "80.9", "81.0", "81.1", "81.2", "81.3", "81.4", "81.5", "81.6", "81.7", "81.8", "81.9", "82.0", "82.1", "82.2", "82.3", "82.4", "82.5", "82.6",
+            "82.7", "82.8", "82.9", "83.0", "83.1", "83.2", "83.3", "83.4", "83.5", "83.6", "83.7", "83.8", "83.9", "84.0", "84.1", "84.2", "84.3", "84.4", "84.5",
+            "84.6", "84.7", "84.8", "84.9", "85.0", "85.1", "85.2", "85.3", "85.4", "85.5", "85.6", "85.7", "85.8", "85.9", "86.0", "86.1", "86.2", "86.3", "86.4",
+            "86.5", "86.6", "86.7", "86.8", "86.9", "87.0", "87.1", "87.2", "87.3", "87.4", "87.5", "87.6", "87.7", "87.8", "87.9", "88.0", "88.1", "88.2", "88.3",
+            "88.4", "88.5", "88.6", "88.7", "88.8", "88.9", "89.0", "89.1", "89.2", "89.3", "89.4", "89.5", "89.6", "89.7", "89.8", "89.9", "90.0", "90.1", "90.2",
+            "90.3", "90.4", "90.5", "90.6", "90.7", "90.8", "90.9", "91.0", "91.1", "91.2", "91.3", "91.4", "91.5", "91.6", "91.7", "91.8", "91.9", "92.0", "92.1",
+            "92.2", "92.3", "92.4", "92.5", "92.6", "92.7", "92.8", "92.9", "93.0", "93.1", "93.2", "93.3", "93.4", "93.5", "93.6", "93.7", "93.8", "93.9", "94.0",
+            "94.1", "94.2", "94.3", "94.4", "94.5", "94.6", "94.7", "94.8", "94.9", "95.0", "95.1", "95.2", "95.3", "95.4", "95.5", "95.6", "95.7", "95.8", "95.9",
+            "96.0", "96.1", "96.2", "96.3", "96.4", "96.5", "96.6", "96.7", "96.8", "96.9", "97.0", "97.1", "97.2", "97.3", "97.4", "97.5", "97.6", "97.7", "97.8",
+            "97.9", "98.0", "98.1", "98.2", "98.3", "98.4", "98.5", "98.6", "98.7", "98.8", "98.9", "99.0", "99.1", "99.2", "99.3", "99.4", "99.5", "99.6", "99.7",
+            "99.8", "99.9" };
+}

From ca075b66f2c2fee099e5e84240eef67fc002d162 Mon Sep 17 00:00:00 2001
From: Gunnar Morling <gunnar.morling@googlemail.com>
Date: Mon, 15 Jan 2024 19:04:29 +0100
Subject: [PATCH 022/268] Leaderboard update

---
 README.md | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index d9f882328..6e765cf74 100644
--- a/README.md
+++ b/README.md
@@ -44,12 +44,13 @@ These are the results from running all entries into the challenge on eight cores
 | 1 | 00:02.575 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_merykittyunsafe.java)| 21.0.1-open | [Quan Anh Mai](https://github.com/merykitty) | Quan Anh Mai's implementation, using `Unsafe` |
 | 2 | 00:02.708 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_thomaswue.java)| 21.0.1-graal | [Thomas Wuerthinger](https://github.com/thomaswue), [Quan Anh Mai](https://github.com/merykitty), [Alfonso² Peterssen](https://github.com/mukel) | GraalVM native binary |
 | 3 | 00:02.855 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_royvanrijn.java)| 21.0.1-graal | [Roy van Rijn](https://github.com/royvanrijn) | GraalVM native binary |
+|   | 00:02.926 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java)| 21.0.1-open | [Van Phu DO](https://github.com/abeobk) |  |
 |   | 00:03.258 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_merykitty.java)| 21.0.1-open | [Quan Anh Mai](https://github.com/merykitty) |  |
 |   | 00:03.321 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java)| 21.0.1-graal | [Artsiom Korzun](https://github.com/artsiomkorzun) |  |
-|   | 00:03.539 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_mtopolnik.java)| 21.0.1-graal | [Marko Topolnik](https://github.com/mtopolnik) |  |
+|   | 00:03.376 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_mtopolnik.java)| 21.0.1-graal | [Marko Topolnik](https://github.com/mtopolnik) |  |
+|   | 00:03.409 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jerrinot.java)| 21.0.1-graal | [Jaromir Hamala](https://github.com/jerrinot) |  |
 |   | 00:03.714 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_hundredwatt.java)| 21.0.1-graal | [Jason Nochlin](https://github.com/hundredwatt) |  |
 |   | 00:04.066 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_JesseVanRooy.java)| 21.0.1-open | [JesseVanRooy](https://github.com/JesseVanRooy) |  |
-|   | 00:04.362 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java)| 21.0.1-open | [Van Phu DO](https://github.com/abeobk) |  |
 |   | 00:04.726 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ebarlas.java)| 21.0.1-graal | [Elliot Barlas](https://github.com/ebarlas) |  | 
 |   | 00:04.741 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_cliffclick.java)| 21.0.1-open | [Cliff Click](https://github.com/cliffclick) |  |
 |   | 00:04.823 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_JamalMulla.java)| 21.0.1-graal | [Jamal Mulla](https://github.com/JamalMulla) |  |
@@ -86,6 +87,7 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:09.117 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_kuduwa-keshavram.java)| 21.0.1-graal | [Keshavram Kuduwa](https://github.com/kuduwa-keshavram) |  |
 |   | 00:09.352 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_filiphr.java)| 21.0.1-graal | [Filip Hrisafov](https://github.com/filiphr) |  |
 |   | 00:09.867 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ricardopieper.java)| 21.0.1-graal | [Ricardo Pieper](https://github.com/ricardopieper) |  |
+|   | 00:10.092 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_phd3.java)| 21.0.1-graal | [Pratham](https://github.com/phd3) |  |
 |   | 00:10.127 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_artpar.java)| 21.0.1-open | [Parth Mudgal](https://github.com/artpar) |  |
 |   | 00:10.553 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_C5H12O5.java)| 21.0.1-graal | [Xylitol](https://github.com/C5H12O5) |  |
 |   | 00:10.473 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_raipc.java)| 21.0.1-open | [Anton Rybochkin](https://github.com/raipc) |  |
@@ -101,12 +103,13 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:12.565 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_japplis.java)| 21.0.1-open | [Anthony Goubard](https://github.com/japplis) |  |
 |   | 00:12.568 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_MeanderingProgrammer.java)| 21.0.1-graal | [Vlad](https://github.com/MeanderingProgrammer) |  |
 |   | 00:13.013 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_thanhtrinity.java)| 21.0.1-graal | [Thanh Duong](https://github.com/thanhtrinity) |  |
-|   | 00:13.623 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_phd3.java)| 21.0.1-open | [Pratham](https://github.com/phd3) |  |
+|   | 00:13.763 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ianopolous.java)| 21.0.1-open | [Dr Ian Preston](https://github.com/ianopolous) |  |
 |   | 00:13.817 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_entangled90.java)| 21.0.1-open | [Carlo](https://github.com/entangled90) |  |
+|   | 00:14.225 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_netrunnereve.java)| 21.0.1-open | [Eve](https://github.com/netrunnereve) |  |
+|   | 00:14.502 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_eriklumme.java)| 21.0.1-graal | [eriklumme](https://github.com/eriklumme) |  |
 |   | 00:14.772 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_kevinmcmurtrie.java)| 21.0.1-open | [Kevin McMurtrie](https://github.com/kevinmcmurtrie) |  |
 |   | 00:14.867 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_berry120.java)| 21.0.1-open | [Michael Berry](https://github.com/berry120) |  |
 |   | 00:15.662 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_semotpan.java)| 21.0.1-open | [Serghei Motpan](https://github.com/semotpan) |  |
-|   | 00:16.379 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ianopolous.java)| 21.0.1-open | [Dr Ian Preston](https://github.com/ianopolous) |  |
 |   | 00:17.490 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_kgeri.java)| 21.0.1-open | [Gergely Kiss](https://github.com/kgeri) |  |
 |   | 00:17.255 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_tkosachev.java)| 21.0.1-open | [tkosachev](https://github.com/tkosachev) |  |
 |   | 00:17.717 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_omarchenko4j.java)| 21.0.1-open | [Oleh Marchenko](https://github.com/omarchenko4j) |  |
@@ -142,7 +145,6 @@ These are the results from running all entries into the challenge on eight cores
 |   | 01:14.815 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_anandmattikopp.java)| 21.0.1-open | [twohardthings](https://github.com/anandmattikopp) |  |
 |   | 01:25.801 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ivanklaric.java)| 21.0.1-open | [ivanklaric](https://github.com/ivanklaric) |  |
 |   | 01:33.594 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_gnmathur.java)| 21.0.1-open | [Gaurav Mathur](https://github.com/gnmathur) |  |
-|   | 01:45.082 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_netrunnereve.java)| 21.0.1-open | [Eve](https://github.com/netrunnereve) |  |
 |   | 01:56.607 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_abfrmblr.java)| 21.0.1-open | [Abhilash](https://github.com/abfrmblr) |  |
 |   | 03:43.521 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_yehwankim23.java)| 21.0.1-open | [김예환 Ye-Hwan Kim (Sam)](https://github.com/yehwankim23) |  |
 |   | 03:59.760 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_fragmede.java)| 21.0.1-open | [Samson](https://github.com/fragmede) |  |

From 987da549061f03d33f3b161d85b4de815c256d8d Mon Sep 17 00:00:00 2001
From: Artsiom Korzun <72259616+artsiomkorzun@users.noreply.github.com>
Date: Mon, 15 Jan 2024 19:57:34 +0100
Subject: [PATCH 023/268] branchy version (#408)

---
 .../CalculateAverage_artsiomkorzun.java       | 283 +++++++++++-------
 1 file changed, 176 insertions(+), 107 deletions(-)

diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java b/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java
index 4f6c8fd10..f92f41422 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java
@@ -20,7 +20,6 @@
 import java.lang.foreign.Arena;
 import java.lang.foreign.MemorySegment;
 import java.lang.reflect.Field;
-import java.nio.ByteOrder;
 import java.nio.channels.FileChannel;
 import java.nio.file.Path;
 import java.nio.file.StandardOpenOption;
@@ -38,11 +37,10 @@ public class CalculateAverage_artsiomkorzun {
     private static final int SEGMENT_SIZE = 32 * 1024 * 1024;
     private static final int SEGMENT_COUNT = (int) ((MAPPED_FILE.byteSize() + SEGMENT_SIZE - 1) / SEGMENT_SIZE);
     private static final int SEGMENT_OVERLAP = 1024;
-    private static final long COMMA_PATTERN = pattern(';');
+    private static final long COMMA_PATTERN = 0x3B3B3B3B3B3B3B3BL;
     private static final long DOT_BITS = 0x10101000;
     private static final long MAGIC_MULTIPLIER = (100 * 0x1000000 + 10 * 0x10000 + 1);
 
-    private static final ByteOrder BYTE_ORDER = ByteOrder.nativeOrder();
     private static final Unsafe UNSAFE;
 
     static {
@@ -95,19 +93,15 @@ private static MemorySegment map(Path file) {
         }
     }
 
-    private static long pattern(char c) {
-        long b = c & 0xFFL;
-        return b | (b << 8) | (b << 16) | (b << 24) | (b << 32) | (b << 40) | (b << 48) | (b << 56);
-    }
-
-    private static long getLongLittleEndian(long address) {
-        long value = UNSAFE.getLong(address);
-
-        if (BYTE_ORDER == ByteOrder.BIG_ENDIAN) {
-            value = Long.reverseBytes(value);
-        }
-
-        return value;
+    private static long word(long address) {
+        return UNSAFE.getLong(address);
+        /*
+         * if (BYTE_ORDER == ByteOrder.BIG_ENDIAN) {
+         * value = Long.reverseBytes(value);
+         * }
+         *
+         * return value;
+         */
     }
 
     private static String text(Map<String, Aggregate> aggregates) {
@@ -140,7 +134,7 @@ private record Aggregate(int min, int max, long sum, int cnt) {
     private static class Aggregates {
 
         private static final int ENTRIES = 64 * 1024;
-        private static final int SIZE = 32 * ENTRIES;
+        private static final int SIZE = 128 * ENTRIES;
 
         private final long pointer;
 
@@ -150,62 +144,82 @@ public Aggregates() {
             UNSAFE.setMemory(pointer, SIZE, (byte) 0);
         }
 
-        public void add(long reference, int length, int hash, int value) {
+        public long find(long word, int hash) {
+            long address = pointer + offset(hash);
+            long w = word(address + 24);
+            return (w == word) ? address : 0;
+        }
+
+        public long find(long word1, long word2, int hash) {
+            long address = pointer + offset(hash);
+            long w1 = word(address + 24);
+            long w2 = word(address + 32);
+            return (word1 == w1) && (word2 == w2) ? address : 0;
+        }
+
+        public long put(long reference, long word, int length, int hash) {
             for (int offset = offset(hash);; offset = next(offset)) {
                 long address = pointer + offset;
-                long ref = UNSAFE.getLong(address);
-
-                if (ref == 0) {
-                    alloc(reference, length, hash, value, address);
-                    break;
+                if (equal(reference, word, address + 24, length)) {
+                    return address;
                 }
 
-                if (equal(ref, reference, length)) {
-                    long sum = UNSAFE.getLong(address + 16) + value;
-                    int cnt = UNSAFE.getInt(address + 24) + 1;
-                    short min = (short) Math.min(UNSAFE.getShort(address + 28), value);
-                    short max = (short) Math.max(UNSAFE.getShort(address + 30), value);
-
-                    UNSAFE.putLong(address + 16, sum);
-                    UNSAFE.putInt(address + 24, cnt);
-                    UNSAFE.putShort(address + 28, min);
-                    UNSAFE.putShort(address + 30, max);
-                    break;
+                int len = UNSAFE.getInt(address);
+                if (len == 0) {
+                    alloc(reference, length, hash, address);
+                    return address;
                 }
             }
         }
 
+        public static void update(long address, int value) {
+            long sum = UNSAFE.getLong(address + 8) + value;
+            int cnt = UNSAFE.getInt(address + 16) + 1;
+            short min = UNSAFE.getShort(address + 20);
+            short max = UNSAFE.getShort(address + 22);
+
+            UNSAFE.putLong(address + 8, sum);
+            UNSAFE.putInt(address + 16, cnt);
+
+            if (value < min) {
+                UNSAFE.putShort(address + 20, (short) value);
+            }
+
+            if (value > max) {
+                UNSAFE.putShort(address + 22, (short) value);
+            }
+        }
+
         public void merge(Aggregates rights) {
-            for (int rightOffset = 0; rightOffset < SIZE; rightOffset += 32) {
+            for (int rightOffset = 0; rightOffset < SIZE; rightOffset += 128) {
                 long rightAddress = rights.pointer + rightOffset;
-                long reference = UNSAFE.getLong(rightAddress);
+                int length = UNSAFE.getInt(rightAddress);
 
-                if (reference == 0) {
+                if (length == 0) {
                     continue;
                 }
 
-                int hash = UNSAFE.getInt(rightAddress + 8);
-                int length = UNSAFE.getInt(rightAddress + 12);
+                int hash = UNSAFE.getInt(rightAddress + 4);
 
                 for (int offset = offset(hash);; offset = next(offset)) {
                     long address = pointer + offset;
-                    long ref = UNSAFE.getLong(address);
+                    int len = UNSAFE.getInt(address);
 
-                    if (ref == 0) {
-                        UNSAFE.copyMemory(rightAddress, address, 32);
+                    if (len == 0) {
+                        UNSAFE.copyMemory(rightAddress, address, 24 + length);
                         break;
                     }
 
-                    if (equal(ref, reference, length)) {
-                        long sum = UNSAFE.getLong(address + 16) + UNSAFE.getLong(rightAddress + 16);
-                        int cnt = UNSAFE.getInt(address + 24) + UNSAFE.getInt(rightAddress + 24);
-                        short min = (short) Math.min(UNSAFE.getShort(address + 28), UNSAFE.getShort(rightAddress + 28));
-                        short max = (short) Math.max(UNSAFE.getShort(address + 30), UNSAFE.getShort(rightAddress + 30));
+                    if (len == length && equal(address + 24, rightAddress + 24, length)) {
+                        long sum = UNSAFE.getLong(address + 8) + UNSAFE.getLong(rightAddress + 8);
+                        int cnt = UNSAFE.getInt(address + 16) + UNSAFE.getInt(rightAddress + 16);
+                        short min = (short) Math.min(UNSAFE.getShort(address + 20), UNSAFE.getShort(rightAddress + 20));
+                        short max = (short) Math.max(UNSAFE.getShort(address + 22), UNSAFE.getShort(rightAddress + 22));
 
-                        UNSAFE.putLong(address + 16, sum);
-                        UNSAFE.putInt(address + 24, cnt);
-                        UNSAFE.putShort(address + 28, min);
-                        UNSAFE.putShort(address + 30, max);
+                        UNSAFE.putLong(address + 8, sum);
+                        UNSAFE.putInt(address + 16, cnt);
+                        UNSAFE.putShort(address + 20, min);
+                        UNSAFE.putShort(address + 22, max);
                         break;
                     }
                 }
@@ -215,20 +229,19 @@ public void merge(Aggregates rights) {
         public Map<String, Aggregate> aggregate() {
             TreeMap<String, Aggregate> set = new TreeMap<>();
 
-            for (int offset = 0; offset < SIZE; offset += 32) {
+            for (int offset = 0; offset < SIZE; offset += 128) {
                 long address = pointer + offset;
-                long ref = UNSAFE.getLong(address);
+                int length = UNSAFE.getInt(address);
 
-                if (ref != 0) {
-                    int length = UNSAFE.getInt(address + 12) - 1;
+                if (length != 0) {
                     byte[] array = new byte[length];
-                    UNSAFE.copyMemory(null, ref, array, Unsafe.ARRAY_BYTE_BASE_OFFSET, length);
+                    UNSAFE.copyMemory(null, address + 24, array, Unsafe.ARRAY_BYTE_BASE_OFFSET, length);
                     String key = new String(array);
 
-                    long sum = UNSAFE.getLong(address + 16);
-                    int cnt = UNSAFE.getInt(address + 24);
-                    short min = UNSAFE.getShort(address + 28);
-                    short max = UNSAFE.getShort(address + 30);
+                    long sum = UNSAFE.getLong(address + 8);
+                    int cnt = UNSAFE.getInt(address + 16);
+                    short min = UNSAFE.getShort(address + 20);
+                    short max = UNSAFE.getShort(address + 22);
 
                     Aggregate aggregate = new Aggregate(min, max, sum, cnt);
                     set.put(key, aggregate);
@@ -238,26 +251,24 @@ public Map<String, Aggregate> aggregate() {
             return set;
         }
 
-        private static void alloc(long reference, int length, int hash, int value, long address) {
-            UNSAFE.putLong(address, reference);
-            UNSAFE.putInt(address + 8, hash);
-            UNSAFE.putInt(address + 12, length);
-            UNSAFE.putLong(address + 16, value);
-            UNSAFE.putInt(address + 24, 1);
-            UNSAFE.putShort(address + 28, (short) value);
-            UNSAFE.putShort(address + 30, (short) value);
+        private static void alloc(long reference, int length, int hash, long address) {
+            UNSAFE.putInt(address, length);
+            UNSAFE.putInt(address + 4, hash);
+            UNSAFE.putShort(address + 20, Short.MAX_VALUE);
+            UNSAFE.putShort(address + 22, Short.MIN_VALUE);
+            UNSAFE.copyMemory(reference, address + 24, length);
         }
 
         private static int offset(int hash) {
-            return ((hash) & (ENTRIES - 1)) << 5;
+            return ((hash) & (ENTRIES - 1)) << 7;
         }
 
         private static int next(int prev) {
-            return (prev + 32) & (SIZE - 1);
+            return (prev + 128) & (SIZE - 1);
         }
 
-        private static boolean equal(long leftAddress, long rightAddress, int length) {
-            while (length > 8) {
+        private static boolean equal(long leftAddress, long leftWord, long rightAddress, int length) {
+            while (length >= 8) {
                 long left = UNSAFE.getLong(leftAddress);
                 long right = UNSAFE.getLong(rightAddress);
 
@@ -270,10 +281,24 @@ private static boolean equal(long leftAddress, long rightAddress, int length) {
                 length -= 8;
             }
 
-            int shift = (8 - length) << 3;
-            long left = getLongLittleEndian(leftAddress) << shift;
-            long right = getLongLittleEndian(rightAddress) << shift;
-            return (left == right);
+            return leftWord == word(rightAddress);
+        }
+
+        private static boolean equal(long leftAddress, long rightAddress, int length) {
+            do {
+                long left = UNSAFE.getLong(leftAddress);
+                long right = UNSAFE.getLong(rightAddress);
+
+                if (left != right) {
+                    return false;
+                }
+
+                leftAddress += 8;
+                rightAddress += 8;
+                length -= 8;
+            } while (length > 0);
+
+            return true;
         }
     }
 
@@ -320,45 +345,89 @@ private static void aggregate(Aggregates aggregates, long position, long limit)
             // as a result a read will be split across pages, where one of them is not mapped
             // but for some reason it works on my machine, leaving to investigate
 
-            for (long start = position, hash = 0; position <= limit;) {
-                int length; // idea: royvanrijn, explanation: https://richardstartin.github.io/posts/finding-bytes
-                {
-                    long word = getLongLittleEndian(position);
-                    long match = word ^ COMMA_PATTERN;
-                    long mask = (match - 0x0101010101010101L) & ~match & 0x8080808080808080L;
-
-                    if (mask == 0) {
-                        hash ^= word;
-                        position += 8;
-                        continue;
-                    }
+            while (position <= limit) { // branchy version, credit: thomaswue
+                int length;
+                int hash;
 
-                    int bit = Long.numberOfTrailingZeros(mask);
-                    position += (bit >>> 3) + 1; // +sep
-                    hash ^= (word << (69 - bit));
-                    length = (int) (position - start);
-                }
+                long ptr = 0;
+                long word = word(position);
+                long separator = separator(word);
 
-                int value; // idea: merykitty
-                {
-                    long word = getLongLittleEndian(position);
-                    long inverted = ~word;
-                    int dot = Long.numberOfTrailingZeros(inverted & DOT_BITS);
-                    long signed = (inverted << 59) >> 63;
-                    long mask = ~(signed & 0xFF);
-                    long digits = ((word & mask) << (28 - dot)) & 0x0F000F0F00L;
-                    long abs = ((digits * MAGIC_MULTIPLIER) >>> 32) & 0x3FF;
-                    value = (int) ((abs ^ signed) - signed);
-                    position += (dot >> 3) + 3;
+                if (separator != 0) {
+                    length = length(separator);
+                    word = mask(word, separator);
+                    hash = mix(word);
+                    ptr = aggregates.find(word, hash);
+                }
+                else {
+                    long word0 = word;
+                    word = word(position + 8);
+                    separator = separator(word);
+
+                    if (separator != 0) {
+                        length = length(separator) + 8;
+                        word = mask(word, separator);
+                        hash = mix(word ^ word0);
+                        ptr = aggregates.find(word0, word, hash);
+                    }
+                    else {
+                        length = 16;
+                        long h = word ^ word0;
+
+                        while (true) {
+                            word = word(position + length);
+                            separator = separator(word);
+
+                            if (separator == 0) {
+                                length += 8;
+                                h ^= word;
+                                continue;
+                            }
+
+                            length += length(separator);
+                            word = mask(word, separator);
+                            hash = mix(h ^ word);
+                            break;
+                        }
+                    }
                 }
 
-                aggregates.add(start, length, mix(hash), value);
+                if (ptr == 0) {
+                    ptr = aggregates.put(position, word, length, hash);
+                }
 
-                start = position;
-                hash = 0;
+                position = update(ptr, position + length + 1);
             }
         }
 
+        private static long update(long ptr, long position) {
+            // idea: merykitty
+            long word = word(position);
+            long inverted = ~word;
+            int dot = Long.numberOfTrailingZeros(inverted & DOT_BITS);
+            long signed = (inverted << 59) >> 63;
+            long mask = ~(signed & 0xFF);
+            long digits = ((word & mask) << (28 - dot)) & 0x0F000F0F00L;
+            long abs = ((digits * MAGIC_MULTIPLIER) >>> 32) & 0x3FF;
+            int value = (int) ((abs ^ signed) - signed);
+
+            Aggregates.update(ptr, value);
+            return position + (dot >> 3) + 3;
+        }
+
+        private static long separator(long word) {
+            long match = word ^ COMMA_PATTERN;
+            return (match - 0x0101010101010101L) & (~match & 0x8080808080808080L);
+        }
+
+        private static long mask(long word, long separator) {
+            return word & ((separator >>> 7) - 1) & 0x00FFFFFFFFFFFFFFL;
+        }
+
+        private static int length(long separator) {
+            return Long.numberOfTrailingZeros(separator) >>> 3;
+        }
+
         private static long next(long position) {
             while (UNSAFE.getByte(position++) != '\n') {
                 // continue

From 702d41df159c8f6acb17f17c99cbec52a466341e Mon Sep 17 00:00:00 2001
From: Arjen Wisse <arjenw@users.noreply.github.com>
Date: Mon, 15 Jan 2024 20:00:52 +0100
Subject: [PATCH 024/268] Small optimizations (#426)

---
 calculate_average_arjenw.sh                   |  2 +-
 .../onebrc/CalculateAverage_arjenw.java       | 47 ++++++++++---------
 2 files changed, 26 insertions(+), 23 deletions(-)

diff --git a/calculate_average_arjenw.sh b/calculate_average_arjenw.sh
index 73391a776..750ced9e0 100755
--- a/calculate_average_arjenw.sh
+++ b/calculate_average_arjenw.sh
@@ -17,4 +17,4 @@
 
 JAVA_OPTS="-Xms500m -Xmx500m --enable-preview -dsa -XX:+UnlockExperimentalVMOptions -XX:+UseEpsilonGC -XX:-AlwaysPreTouch"
 
-java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_arjenw
+java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_arjenw $@
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_arjenw.java b/src/main/java/dev/morling/onebrc/CalculateAverage_arjenw.java
index 0f5f3fe68..9355d4729 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_arjenw.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_arjenw.java
@@ -40,37 +40,40 @@
 // * memory-mapped file approach:           0m3.2s (also way simpler and neater code; inspired by spullara)
 // * smarter number parsing:                0m2.95s (inspired by iziamos)
 // * switching back to 21-tem vm            0m2.6s
+// * small optimizations                    0m2.5s (skip byte-array copy, optimal StationList array size avoiding collisions)
 
 public class CalculateAverage_arjenw {
-    private static final int TWO_BYTE_TO_INT = 480 + 48;
+    private static final int TWO_BYTE_TO_INT = 480 + 48; // 48 is the ASCII code for '0'
     private static final int THREE_BYTE_TO_INT = 4800 + 480 + 48;
     private static final String FILE = "./measurements.txt";
 
     public static void main(String[] args) {
-        var file = new File(FILE);
+        var file = new File(args.length > 0 ? args[0] : FILE);
         var fileSize = file.length();
         var numberOfProcessors = fileSize > 1_000_000 ? Runtime.getRuntime().availableProcessors() : 1;
-        var segmentSize = fileSize / numberOfProcessors;
-        var results = IntStream.range(0, numberOfProcessors)
+        var segmentSize = (int) Math.min(Integer.MAX_VALUE, fileSize / numberOfProcessors); // bytebuffer position is an int, so can be max Integer.MAX_VALUE
+        var segmentCount = (int) (fileSize / segmentSize);
+        var results = IntStream.range(0, segmentCount)
                 .mapToObj(segmentNr -> parseSegment(file, fileSize, segmentSize, segmentNr))
                 .parallel()
                 .reduce(StationList::merge)
                 .orElseGet(StationList::new)
                 .toStringArray();
-        Arrays.sort(results, Comparator.comparing(o -> take(o, '=')));
+        Arrays.sort(results, Comparator.comparing(o -> takeUntil(o, '=')));
         System.out.format("{%s}%n", String.join(", ", results));
     }
 
-    private static StationList parseSegment(File file, long fileSize, long segmentSize, int segmentNr) {
-        long segmentStart = segmentNr * segmentSize;
+    private static StationList parseSegment(File file, long fileSize, int segmentSize, int segmentNr) {
+        long segmentStart = segmentNr * (long) segmentSize;
         long segmentEnd = Math.min(fileSize, segmentStart + segmentSize + 100);
-        StationList stationList = new StationList();
         try (var fileChannel = (FileChannel) Files.newByteChannel(file.toPath(), StandardOpenOption.READ)) {
             var bb = fileChannel.map(FileChannel.MapMode.READ_ONLY, segmentStart, segmentEnd - segmentStart);
             if (segmentStart > 0) {
+                // noinspection StatementWithEmptyBody
                 while (bb.get() != '\n')
                     ; // skip to first new line
             }
+            StationList stationList = new StationList();
             var buffer = new byte[100];
             while (bb.position() < segmentSize) {
                 byte b;
@@ -103,8 +106,10 @@ else if (b1 == '-') { // value is -n.n
                     bb.get(); // new line
                 }
 
-                stationList.add(buffer, i, Math.abs(hash), value);
+                if (stationList.add(buffer, i, Math.abs(hash), value))
+                    buffer = new byte[100]; // station was new, create new buffer to contain the next station's name
             }
+
             return stationList;
         }
         catch (IOException e) {
@@ -155,31 +160,29 @@ public void merge(Station other) {
     }
 
     private static class StationList implements Iterable<Station> {
-        private final static int MAX_ENTRY = 32767; // choose a value that is binary all 1's.
-        private final Station[] array = new Station[MAX_ENTRY + 1];
+        private final static int MAX_ENTRY = 65375; // choose a value that _eliminates_ collisions on the test set.
+        private final Station[] array = new Station[MAX_ENTRY];
         private int size = 0;
 
-        private void add(int hash, Supplier<Station> create, Consumer<Station> update) {
-            var position = hash & MAX_ENTRY;
+        private boolean add(int hash, Supplier<Station> create, Consumer<Station> update) {
+            var position = hash % MAX_ENTRY;
             Station existing;
             while ((existing = array[position]) != null && existing.hash != hash) {
-                position = (position + 1) & MAX_ENTRY;
+                position = (position + 1) % MAX_ENTRY;
             }
             if (existing == null) {
                 array[position] = create.get();
                 size++;
+                return true;
             }
             else {
                 update.accept(existing);
+                return false;
             }
         }
 
-        public void add(byte[] data, int stationNameLength, int stationHash, int value) {
-            add(stationHash, () -> {
-                var stationName = new byte[stationNameLength];
-                System.arraycopy(data, 0, stationName, 0, stationNameLength);
-                return new Station(stationName, stationNameLength, stationHash, value);
-            }, existing -> existing.append(value));
+        public boolean add(byte[] data, int stationNameLength, int stationHash, int value) {
+            return add(stationHash, () -> new Station(data, stationNameLength, stationHash, value), existing -> existing.append(value));
         }
 
         public void add(Station station) {
@@ -210,7 +213,7 @@ public Iterator<Station> iterator() {
                 @Override
                 public boolean hasNext() {
                     Station station = null;
-                    while (index <= MAX_ENTRY && (station = array[index]) == null)
+                    while (index < MAX_ENTRY && (station = array[index]) == null)
                         index++;
                     return station != null;
                 }
@@ -226,7 +229,7 @@ public Station next() {
         }
     }
 
-    private static String take(String s, char c) {
+    private static String takeUntil(String s, char c) {
         var pos = s.indexOf(c);
         return pos > -1 ? s.substring(0, pos) : s;
     }

From 6fe395cbaed2ed51fbdc1a16dead896b90ca75ec Mon Sep 17 00:00:00 2001
From: Vemana <subramisc@gmail.com>
Date: Tue, 16 Jan 2024 00:40:50 +0530
Subject: [PATCH 025/268] Squashing a bunch of commits together. (#428)

Commit#2; Uplift of 7% using native byteorder from ByteBuffer.
Commit#1: Minor changes to formatting.

Co-authored-by: vemana <vemana.github@gmail.com>
---
 .../onebrc/CalculateAverage_vemana.java       | 204 +++++++++---------
 1 file changed, 105 insertions(+), 99 deletions(-)

diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_vemana.java b/src/main/java/dev/morling/onebrc/CalculateAverage_vemana.java
index 7673fb573..d4f0a2fb8 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_vemana.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_vemana.java
@@ -41,55 +41,54 @@
  * remain readable for a majority of SWEs. At a high level, the approach relies on a few principles
  * listed herein.
  *
- * <p>
- * [Exploit Parallelism] Distribute the work into Shards. Separate threads (one per core) process
+ * <p>[Exploit Parallelism] Distribute the work into Shards. Separate threads (one per core) process
  * Shards and follow it up by merging the results. parallelStream() is appealing but carries
  * potential run-time variance (i.e. std. deviation) penalties based on informal testing. Variance
  * is not ideal when trying to minimize the maximum worker latency.
  *
- * <p>
- * [Use ByteBuffers over MemorySegment] Each Shard is further divided in Chunks. This would've been
- * unnecessary except that Shards are too big to be backed by ByteBuffers. Besides, MemorySegment
- * appears slower than ByteBuffers. So, to use ByteBuffers, we have to use smaller chunks.
+ * <p>[Use ByteBuffers over MemorySegment] Each Shard is further divided in Chunks. This would've
+ * been unnecessary except that Shards are too big to be backed by ByteBuffers. Besides,
+ * MemorySegment appears slower than ByteBuffers. So, to use ByteBuffers, we have to use smaller
+ * chunks.
  *
- * <p>
- * [Straggler freedom] The optimization function here is to minimize the maximal worker thread
+ * <p>[Straggler freedom] The optimization function here is to minimize the maximal worker thread
  * completion. Law of large number averages means that all the threads will end up with similar
  * amounts of work and similar completion times; but, however ever so often there could be a bad
  * sharding and more importantly, Cores are not created equal; some will be throttled more than
  * others. So, we have a shared {@code LazyShardQueue} that aims to distribute work to minimize the
  * latest completion time.
  *
- * <p>
- * [Work Assignment with LazyShardQueue] The queue provides each thread with its next big-chunk
+ * <p>[Work Assignment with LazyShardQueue] The queue provides each thread with its next big-chunk
  * until X% of the work remains. Big-chunks belong to the thread and will not be provided to another
- * thread.  Then, it switches to providing small-chunk sizes. Small-chunks comprise the last X% of
+ * thread. Then, it switches to providing small-chunk sizes. Small-chunks comprise the last X% of
  * work and every thread can participate in completing the chunk. Even though the queue is shared
  * across threads, there's no communication across thread during the big-chunk phases. The queue is
  * effectively a per-thread queue while processing big-chunks. The small-chunk phase uses an
  * AtomicLong to coordinate chunk allocation across threads.
  *
- * <p>
- * [Chunk processing] Chunk processing is typical. Process line by line. Find a hash function
+ * <p>[Chunk processing] Chunk processing is typical. Process line by line. Find a hash function
  * (polynomial hash fns are slow, but will work fine), hash the city name, resolve conflicts using
  * linear probing and then accumulate the temperature into the appropriate hash slot. The key
  * element then is how fast can you identify the hash slot, read the temperature and update the new
  * temperature in the slot (i.e. min, max, count).
  *
- * <p>
- * [Cache friendliness] 7502P and my machine (7950X) offer 4MB L3 cache/core. This means we can hope
- * to fit all our datastructures in L3 cache. Since SMT is turned on, the Runtime's available
+ * <p>[Cache friendliness] 7502P and my machine (7950X) offer 4MB L3 cache/core. This means we can
+ * hope to fit all our datastructures in L3 cache. Since SMT is turned on, the Runtime's available
  * processors will show twice the number of actual cores and so we get 2MB L3 cache/thread. To be
  * safe, we try to stay within 1.8 MB/thread and size our hashtable appropriately.
  *
- * <p>
- * [Allocation] Since MemorySegment seemed slower than ByteBuffers, backing Chunks by bytebuffers
+ * <p>[Native ByteOrder is MUCH better] There was almost a 10% lift by reading ints from bytebuffers
+ * using native byteorder . It so happens that both the eval machine (7502P) and my machine 7950X
+ * use native LITTLE_ENDIAN order, which again apparently is because X86[-64] is little-endian. But,
+ * by default, ByteBuffers use BIG_ENDIAN order, which appears to be a somewhat strange default from
+ * Java.
+ *
+ * <p>[Allocation] Since MemorySegment seemed slower than ByteBuffers, backing Chunks by bytebuffers
  * was the logical option. Creating one ByteBuffer per chunk was no bueno because the system doesn't
  * like it (JVM runs out of mapped file handle quota). Other than that, allocation in the hot path
  * was avoided.
  *
- * <p>
- * [General approach to fast hashing and temperature reading] Here, it helps to understand the
+ * <p>[General approach to fast hashing and temperature reading] Here, it helps to understand the
  * various bottlenecks in execution. One particular thing that I kept coming back to was to
  * understand the relative costs of instructions: See
  * https://www.agner.org/optimize/instruction_tables.pdf It is helpful to think of hardware as a
@@ -102,24 +101,22 @@
  * endPos" in a tight loop by breaking it into two pieces: one piece where the check will not be
  * needed and a tail piece where it will be needed.
  *
- * <p>
- * [Understand What Cores like]. Cores like to go straight and loop back. Despite good branch
+ * <p>[Understand What Cores like]. Cores like to go straight and loop back. Despite good branch
  * prediction, performance sucks with mispredicted branches.
  *
- * <p>
- * [JIT] Java performance requires understanding the JIT. It is helpful to understand what the JIT
- * likes though it is still somewhat of a mystery to me. In general, it inlines small methods very
- * well and after constant folding, it can optimize quite well across a reasonably deep call chain.
- * My experience with the JIT was that everything I tried to tune it made it worse except for one
- * parameter. I have a new-found respect for JIT - it likes and understands typical Java idioms.
+ * <p>[JIT] Java performance requires understanding the JIT. It is helpful to understand what the
+ * JIT likes though it is still somewhat of a mystery to me. In general, it inlines small methods
+ * very well and after constant folding, it can optimize quite well across a reasonably deep call
+ * chain. My experience with the JIT was that everything I tried to tune it made it worse except for
+ * one parameter. I have a new-found respect for JIT - it likes and understands typical Java idioms.
  *
- * <p>[Tuning] Nothing was more insightful than actually playing with various tuning parameters.
- * I can have all the theories but the hardware and JIT are giant blackboxes. I used a bunch of
- * tools to optimize: (1) Command line parameters to tune big and small chunk sizes etc. This was
- * also very helpful in forming a mental model of the JIT. Sometimes, it would compile some methods
- * and sometimes it would just run them interpreted since the compilation threshold wouldn't be
- * reached for intermediate methods. (2) AsyncProfiler - this was the first line tool to understand
- * cache misses and cpu time to figure where to aim the next optimization effort. (3) JitWatch -
+ * <p>[Tuning] Nothing was more insightful than actually playing with various tuning parameters. I
+ * can have all the theories but the hardware and JIT are giant blackboxes. I used a bunch of tools
+ * to optimize: (1) Command line parameters to tune big and small chunk sizes etc. This was also
+ * very helpful in forming a mental model of the JIT. Sometimes, it would compile some methods and
+ * sometimes it would just run them interpreted since the compilation threshold wouldn't be reached
+ * for intermediate methods. (2) AsyncProfiler - this was the first line tool to understand cache
+ * misses and cpu time to figure where to aim the next optimization effort. (3) JitWatch -
  * invaluable for forming a mental model and attempting to tune the JIT.
  *
  * <p>[Things that didn't work]. This is a looong list and the hit rate is quite low. In general,
@@ -140,12 +137,6 @@
  */
 public class CalculateAverage_vemana {
 
-    public static void checkArg(boolean condition) {
-        if (!condition) {
-            throw new IllegalArgumentException();
-        }
-    }
-
     public static void main(String[] args) throws Exception {
         // First process in large chunks without coordination among threads
         // Use chunkSizeBits for the large-chunk size
@@ -184,18 +175,26 @@ public static void main(String[] args) throws Exception {
         // - hashtableSizeBits = \{hashtableSizeBits}
         // """);
 
-        System.out.println(new Runner(
-                Path.of("measurements.txt"),
-                chunkSizeBits,
-                commonChunkFraction,
-                commonChunkSizeBits,
-                hashtableSizeBits).getSummaryStatistics());
+        System.out.println(
+                new Runner(
+                        Path.of("measurements.txt"),
+                        chunkSizeBits,
+                        commonChunkFraction,
+                        commonChunkSizeBits,
+                        hashtableSizeBits)
+                                .getSummaryStatistics());
     }
 
-    public interface LazyShardQueue {
+  public record AggregateResult(Map<String, Stat> tempStats) {
 
-        ByteRange take(int shardIdx);
+    @Override
+    public String toString() {
+      return this.tempStats().entrySet().stream()
+          .sorted(Entry.comparingByKey())
+          .map(entry -> "%s=%s".formatted(entry.getKey(), entry.getValue()))
+          .collect(Collectors.joining(", ", "{", "}"));
     }
+  }
 
     // Mutable to avoid allocation
     public static class ByteRange {
@@ -267,11 +266,11 @@ public void setRange(long rangeStart, long rangeEnd) {
     @Override
     public String toString() {
       return STR."""
-          ByteRange {
-            startInBuf = \{startInBuf}
-            endInBuf = \{endInBuf}
-          }
-          """;
+        ByteRange {
+          startInBuf = \{startInBuf}
+          endInBuf = \{endInBuf}
+        }
+        """;
     }
 
         private long nextNewLine(long pos) {
@@ -285,6 +284,7 @@ private long nextNewLine(long pos) {
         private void setByteBufferToRange(long start, long end) {
             try {
                 byteBuffer = raf.getChannel().map(MapMode.READ_ONLY, start, end - start);
+                byteBuffer.order(ByteOrder.nativeOrder());
             }
             catch (IOException e) {
                 throw new RuntimeException(e);
@@ -292,18 +292,22 @@ private void setByteBufferToRange(long start, long end) {
         }
     }
 
-  public record Result(Map<String, Stat> tempStats) {
+    public static final class Checks {
 
-    @Override
-    public String toString() {
-      return this.tempStats()
-                 .entrySet()
-                 .stream()
-                 .sorted(Entry.comparingByKey())
-                 .map(entry -> "%s=%s".formatted(entry.getKey(), entry.getValue()))
-                 .collect(Collectors.joining(", ", "{", "}"));
+        public static void checkArg(boolean condition) {
+            if (!condition) {
+                throw new IllegalArgumentException();
+            }
+        }
+
+        private Checks() {
+        }
+    }
+
+    public interface LazyShardQueue {
+
+        ByteRange take(int shardIdx);
     }
-  }
 
     public static class Runner {
 
@@ -314,7 +318,10 @@ public static class Runner {
         private final int shardSizeBits;
 
         public Runner(
-                      Path inputFile, int chunkSizeBits, double commonChunkFraction, int commonChunkSizeBits,
+                      Path inputFile,
+                      int chunkSizeBits,
+                      double commonChunkFraction,
+                      int commonChunkSizeBits,
                       int hashtableSizeBits) {
             this.inputFile = inputFile;
             this.shardSizeBits = chunkSizeBits;
@@ -323,16 +330,12 @@ public Runner(
             this.hashtableSizeBits = hashtableSizeBits;
         }
 
-        Result getSummaryStatistics() throws Exception {
+        AggregateResult getSummaryStatistics() throws Exception {
             int processors = Runtime.getRuntime().availableProcessors();
             LazyShardQueue shardQueue = new SerialLazyShardQueue(
-                    1L << shardSizeBits,
-                    inputFile,
-                    processors,
-                    commonChunkFraction,
-                    commonChunkSizeBits);
+                    1L << shardSizeBits, inputFile, processors, commonChunkFraction, commonChunkSizeBits);
 
-            List<Future<Result>> results = new ArrayList<>();
+            List<Future<AggregateResult>> results = new ArrayList<>();
             ExecutorService executorService = Executors.newFixedThreadPool(
                     processors,
                     runnable -> {
@@ -345,8 +348,8 @@ Result getSummaryStatistics() throws Exception {
 
             for (int i = 0; i < processors; i++) {
                 final int I = i;
-                final Callable<Result> callable = () -> {
-                    Result result = new ShardProcessor(shardQueue, hashtableSizeBits, I).processShard();
+                final Callable<AggregateResult> callable = () -> {
+                    AggregateResult result = new ShardProcessor(shardQueue, hashtableSizeBits, I).processShard();
                     finishTimes[I] = System.nanoTime();
                     return result;
                 };
@@ -356,7 +359,7 @@ Result getSummaryStatistics() throws Exception {
             return executorService.submit(() -> merge(results)).get();
         }
 
-        private Result merge(List<Future<Result>> results)
+        private AggregateResult merge(List<Future<AggregateResult>> results)
                 throws ExecutionException, InterruptedException {
             Map<String, Stat> output = null;
             boolean[] isDone = new boolean[results.size()];
@@ -374,20 +377,20 @@ private Result merge(List<Future<Result>> results)
                             for (Entry<String, Stat> entry : results.get(i).get().tempStats().entrySet()) {
                                 output.compute(
                                         entry.getKey(),
-                                        (key, value) -> value == null ? entry.getValue()
-                                                : Stat.merge(value, entry.getValue()));
+                                        (key, value) -> value == null ? entry.getValue() : Stat.merge(value, entry.getValue()));
                             }
                         }
                     }
                 }
             }
-            return new Result(output);
+            return new AggregateResult(output);
         }
 
     private void printFinishTimes(long[] finishTimes) {
       Arrays.sort(finishTimes);
       int n = finishTimes.length;
-      System.err.println(STR."Finish Delta: \{(finishTimes[n - 1] - finishTimes[0]) / 1_000_000}ms");
+      System.err.println(
+          STR."Finish Delta: \{(finishTimes[n - 1] - finishTimes[0]) / 1_000_000}ms");
     }
     }
 
@@ -405,23 +408,29 @@ private static long roundToNearestHigherMultipleOf(long divisor, long value) {
         private final long[] nextStarts;
 
         public SerialLazyShardQueue(
-                                    long chunkSize, Path filePath, int shards, double commonChunkFraction,
+                                    long chunkSize,
+                                    Path filePath,
+                                    int shards,
+                                    double commonChunkFraction,
                                     int commonChunkSizeBits)
                 throws IOException {
-            checkArg(commonChunkFraction < 0.9 && commonChunkFraction >= 0);
+            Checks.checkArg(commonChunkFraction < 0.9 && commonChunkFraction >= 0);
             var raf = new RandomAccessFile(filePath.toFile(), "r");
             this.fileSize = raf.length();
 
             // Common pool
             long commonPoolStart = Math.min(
-                    roundToNearestHigherMultipleOf(chunkSize, (long) (fileSize * (1 - commonChunkFraction))),
+                    roundToNearestHigherMultipleOf(
+                            chunkSize, (long) (fileSize * (1 - commonChunkFraction))),
                     fileSize);
             this.commonPool = new AtomicLong(commonPoolStart);
             this.commonChunkSize = 1L << commonChunkSizeBits;
 
             // Distribute chunks to shards
             this.nextStarts = new long[shards << 4]; // thread idx -> 16*idx to avoid cache line conflict
-            for (long i = 0, currentStart = 0, remainingChunks = (commonPoolStart + chunkSize - 1) / chunkSize; i < shards; i++) {
+            for (long i = 0,
+                    currentStart = 0,
+                    remainingChunks = (commonPoolStart + chunkSize - 1) / chunkSize; i < shards; i++) {
                 long remainingShards = shards - i;
                 long currentChunks = (remainingChunks + remainingShards - 1) / remainingShards;
                 // Shard i handles: [currentStart, currentStart + currentChunks * chunkSize)
@@ -479,7 +488,7 @@ public ShardProcessor(LazyShardQueue shardQueue, int hashtableSizeBits, int thre
             this.state = new ShardProcessorState(hashtableSizeBits);
         }
 
-        public Result processShard() {
+        public AggregateResult processShard() {
             ByteRange range;
             while ((range = shardQueue.take(threadIdx)) != null) {
                 processRange(range);
@@ -497,7 +506,7 @@ private void processRange(ByteRange range) {
             }
         }
 
-        private Result result() {
+        private AggregateResult result() {
             return state.result();
         }
     }
@@ -527,30 +536,30 @@ public int processLine(MappedByteBuffer mmb, int nextPos) {
                     x = Integer.reverseBytes(x);
                 }
 
-                byte a = (byte) (x >>> 24);
+                byte a = (byte) (x >>> 0);
                 if (a == ';') {
                     nextPos += 1;
                     break;
                 }
 
-                byte b = (byte) (x >>> 16);
+                byte b = (byte) (x >>> 8);
                 if (b == ';') {
                     nextPos += 2;
-                    hash = hash * 31 + ((0xFF000000 & x));
+                    hash = hash * 31 + ((0xFF & x));
                     break;
                 }
 
-                byte c = (byte) (x >>> 8);
+                byte c = (byte) (x >>> 16);
                 if (c == ';') {
                     nextPos += 3;
-                    hash = hash * 31 + ((0xFFFF0000 & x));
+                    hash = hash * 31 + ((0xFFFF & x));
                     break;
                 }
 
-                byte d = (byte) (x >>> 0);
+                byte d = (byte) (x >>> 24);
                 if (d == ';') {
                     nextPos += 4;
-                    hash = hash * 31 + ((0xFFFFFF00 & x));
+                    hash = hash * 31 + ((0xFFFFFF & x));
                     break;
                 }
 
@@ -582,16 +591,12 @@ public int processLine(MappedByteBuffer mmb, int nextPos) {
             }
 
             linearProbe(
-                    cityLen,
-                    hash & slotsMask,
-                    negative ? -temperature : temperature,
-                    mmb,
-                    originalPos);
+                    cityLen, hash & slotsMask, negative ? -temperature : temperature, mmb, originalPos);
 
             return nextPos;
         }
 
-        public Result result() {
+        public AggregateResult result() {
             int N = stats.length;
             TreeMap<String, Stat> map = new TreeMap<>();
             for (int i = 0; i < N; i++) {
@@ -599,7 +604,7 @@ public Result result() {
                     map.put(new String(cityNames[i]), stats[i]);
                 }
             }
-            return new Result(map);
+            return new AggregateResult(map);
         }
 
         private byte[] copyFrom(MappedByteBuffer mmb, int offsetInMmb, int len) {
@@ -642,6 +647,7 @@ private void linearProbe(int len, int hash, int temp, MappedByteBuffer mmb, int
         }
     }
 
+    /** Represents aggregate stats. */
     public static class Stat {
 
         public static Stat firstReading(int temp) {

From dd9a3dde7e692198cdd58570fc2bd1822d2ca237 Mon Sep 17 00:00:00 2001
From: Gunnar Morling <gunnar.morling@googlemail.com>
Date: Mon, 15 Jan 2024 20:12:31 +0100
Subject: [PATCH 026/268] Update pull_request_template.md

---
 .github/pull_request_template.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md
index 6e4a38a52..6f71c4517 100644
--- a/.github/pull_request_template.md
+++ b/.github/pull_request_template.md
@@ -3,6 +3,8 @@
 - [ ] All formatting changes by the build are committed
 - [ ] Your launch script is named `calculate_average_<username>.sh` (make sure to match casing of your GH user name) and is executable
 - [ ] Output matches that of `calculate_average_baseline.sh`
+- [ ] For new entries, or after substantial changes: When implementing custom hash structures, please point to where you deal with hash collisions (line number)
+
 * Execution time:
 * Execution time of reference implementation:
 

From d18b10708b632e42822af41342271f16eff7073a Mon Sep 17 00:00:00 2001
From: zerninv <zerninvasilii@yandex.ru>
Date: Mon, 15 Jan 2024 19:25:52 +0000
Subject: [PATCH 027/268] Sixth attempt CalculateAverage_zerninv.java (#407)

* rethink chunking

* fix typo
---
 .../onebrc/CalculateAverage_zerninv.java      | 206 ++++++++++--------
 1 file changed, 118 insertions(+), 88 deletions(-)

diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_zerninv.java b/src/main/java/dev/morling/onebrc/CalculateAverage_zerninv.java
index 789db7398..2e7ea4c1e 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_zerninv.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_zerninv.java
@@ -25,14 +25,15 @@
 import java.nio.charset.StandardCharsets;
 import java.nio.file.Path;
 import java.nio.file.StandardOpenOption;
-import java.util.*;
-import java.util.concurrent.ExecutionException;
-import java.util.concurrent.Executors;
-import java.util.concurrent.Future;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.TreeMap;
 
 public class CalculateAverage_zerninv {
     private static final String FILE = "./measurements.txt";
-    private static final int MIN_FILE_SIZE = 1024 * 1024 * 16;
+    private static final int L3_CACHE_SIZE = 128 * 1024 * 1024;
+    private static final int CORES = Runtime.getRuntime().availableProcessors();
+    private static final int CHUNK_SIZE = (L3_CACHE_SIZE - MeasurementContainer.SIZE * MeasurementContainer.ENTRY_SIZE * CORES) / CORES - 1024 * CORES;
 
     // #.##
     private static final int THREE_DIGITS_MASK = 0x2e0000;
@@ -48,47 +49,48 @@ public class CalculateAverage_zerninv {
 
     private static final Unsafe UNSAFE = initUnsafe();
 
-    public static void main(String[] args) throws IOException {
-        var results = new HashMap<String, MeasurementAggregation>();
+    public static void main(String[] args) throws IOException, InterruptedException {
         try (var channel = FileChannel.open(Path.of(FILE), StandardOpenOption.READ)) {
             var fileSize = channel.size();
+            var minChunkSize = Math.min(fileSize, CHUNK_SIZE);
+
+            var tasks = new TaskThread[CORES];
+            for (int i = 0; i < tasks.length; i++) {
+                tasks[i] = new TaskThread(new MeasurementContainer(), (int) (fileSize / minChunkSize / CORES + 1));
+            }
+
             var memorySegment = channel.map(FileChannel.MapMode.READ_ONLY, 0, fileSize, Arena.global());
-            long address = memorySegment.address();
-            var cores = Runtime.getRuntime().availableProcessors();
-            var minChunkSize = fileSize < MIN_FILE_SIZE ? fileSize : fileSize / cores;
+            var address = memorySegment.address();
             var chunks = splitByChunks(address, address + fileSize, minChunkSize);
+            for (int i = 0; i < chunks.size() - 1; i++) {
+                var task = tasks[i % CORES];
+                task.addChunk(chunks.get(i), chunks.get(i + 1));
+            }
 
-            var executor = Executors.newFixedThreadPool(cores);
-            List<Future<Map<String, MeasurementAggregation>>> fResults = new ArrayList<>();
-            for (int i = 1; i < chunks.size(); i++) {
-                final long prev = chunks.get(i - 1);
-                final long curr = chunks.get(i);
-                fResults.add(executor.submit(() -> calcForChunk(prev, curr)));
+            for (var task : tasks) {
+                task.start();
             }
 
-            fResults.forEach(f -> {
-                try {
-                    f.get().forEach((key, value) -> {
-                        var result = results.get(key);
-                        if (result != null) {
-                            result.merge(value);
-                        }
-                        else {
-                            results.put(key, value);
-                        }
-                    });
-                }
-                catch (InterruptedException | ExecutionException e) {
-                    e.printStackTrace();
-                }
-            });
-            executor.shutdown();
-        }
+            var results = new TreeMap<String, TemperatureAggregation>();
+            for (var task : tasks) {
+                task.join();
+                task.measurements()
+                        .forEach(measurement -> {
+                            var aggr = results.get(measurement.station());
+                            if (aggr == null) {
+                                results.put(measurement.station(), measurement.aggregation());
+                            }
+                            else {
+                                aggr.merge(measurement.aggregation());
+                            }
+                        });
+            }
 
-        var bos = new BufferedOutputStream(System.out);
-        bos.write(new TreeMap<>(results).toString().getBytes(StandardCharsets.UTF_8));
-        bos.write('\n');
-        bos.flush();
+            var bos = new BufferedOutputStream(System.out);
+            bos.write(new TreeMap<>(results).toString().getBytes(StandardCharsets.UTF_8));
+            bos.write('\n');
+            bos.flush();
+        }
     }
 
     private static Unsafe initUnsafe() {
@@ -103,7 +105,7 @@ private static Unsafe initUnsafe() {
     }
 
     private static List<Long> splitByChunks(long address, long end, long minChunkSize) {
-        List<Long> result = new ArrayList<>();
+        List<Long> result = new ArrayList<>((int) ((end - address) / minChunkSize + 1));
         result.add(address);
         while (address < end) {
             address += Math.min(end - address, minChunkSize);
@@ -114,60 +116,20 @@ private static List<Long> splitByChunks(long address, long end, long minChunkSiz
         return result;
     }
 
-    private static Map<String, MeasurementAggregation> calcForChunk(long offset, long end) {
-        var results = new MeasurementContainer();
-
-        long cityOffset;
-        int hashCode, temperature, word;
-        byte cityNameSize, b;
-
-        while (offset < end) {
-            cityOffset = offset;
-            hashCode = 0;
-            while ((b = UNSAFE.getByte(offset++)) != DELIMITER) {
-                hashCode = hashCode * 31 + b;
-            }
-            cityNameSize = (byte) (offset - cityOffset - 1);
-
-            word = UNSAFE.getInt(offset);
-            offset += 4;
-
-            if ((word & TWO_NEGATIVE_DIGITS_MASK) == TWO_NEGATIVE_DIGITS_MASK) {
-                word >>>= 8;
-                temperature = ZERO * 11 - ((word & BYTE_MASK) * 10 + ((word >>> 16) & BYTE_MASK));
-            }
-            else if ((word & THREE_DIGITS_MASK) == THREE_DIGITS_MASK) {
-                temperature = (word & BYTE_MASK) * 100 + ((word >>> 8) & BYTE_MASK) * 10 + ((word >>> 24) & BYTE_MASK) - ZERO * 111;
-            }
-            else if ((word & TWO_DIGITS_MASK) == TWO_DIGITS_MASK) {
-                temperature = (word & BYTE_MASK) * 10 + ((word >>> 16) & BYTE_MASK) - ZERO * 11;
-                offset--;
-            }
-            else {
-                // #.##-
-                word = (word >>> 8) | (UNSAFE.getByte(offset++) << 24);
-                temperature = ZERO * 111 - ((word & BYTE_MASK) * 100 + ((word >>> 8) & BYTE_MASK) * 10 + ((word >>> 24) & BYTE_MASK));
-            }
-            offset++;
-            results.put(cityOffset, cityNameSize, hashCode, (short) temperature);
-        }
-        return results.toStringMap();
-    }
-
-    private static final class MeasurementAggregation {
+    private static final class TemperatureAggregation {
         private long sum;
         private int count;
         private short min;
         private short max;
 
-        public MeasurementAggregation(long sum, int count, short min, short max) {
+        public TemperatureAggregation(long sum, int count, short min, short max) {
             this.sum = sum;
             this.count = count;
             this.min = min;
             this.max = max;
         }
 
-        public void merge(MeasurementAggregation o) {
+        public void merge(TemperatureAggregation o) {
             if (o == null) {
                 return;
             }
@@ -183,6 +145,9 @@ public String toString() {
         }
     }
 
+    private record Measurement(String station, TemperatureAggregation aggregation) { // one occupied container slot: decoded station name plus its aggregate, as built by MeasurementContainer.measurements()
+    }
+
     private static final class MeasurementContainer {
         private static final int SIZE = 1024 * 16;
 
@@ -235,26 +200,26 @@ && isEqual(UNSAFE.getLong(ptr + ADDRESS_OFFSET), address, size)) {
             }
         }
 
-        public Map<String, MeasurementAggregation> toStringMap() {
-            var result = new HashMap<String, MeasurementAggregation>();
+        public List<Measurement> measurements() {
+            var result = new ArrayList<Measurement>(1000);
             int count;
             for (int i = 0; i < SIZE; i++) {
                 long ptr = this.address + i * ENTRY_SIZE;
                 count = UNSAFE.getInt(ptr + COUNT_OFFSET);
                 if (count != 0) {
-                    var measurements = new MeasurementAggregation(
+                    var measurements = new TemperatureAggregation(
                             UNSAFE.getLong(ptr + SUM_OFFSET),
                             count,
                             UNSAFE.getShort(ptr + MIN_OFFSET),
                             UNSAFE.getShort(ptr + MAX_OFFSET));
                     var key = createString(UNSAFE.getLong(ptr + ADDRESS_OFFSET), UNSAFE.getByte(ptr + SIZE_OFFSET));
-                    result.put(key, measurements);
+                    result.add(new Measurement(key, measurements));
                 }
             }
             return result;
         }
 
-        private boolean isEqual(long address, long address2, byte size) {
+        private static boolean isEqual(long address, long address2, byte size) {
             for (int i = 0; i < size; i++) {
                 if (UNSAFE.getByte(address + i) != UNSAFE.getByte(address2 + i)) {
                     return false;
@@ -271,4 +236,69 @@ private String createString(long address, byte size) {
             return new String(arr);
         }
     }
-}
\ No newline at end of file
+
+    private static class TaskThread extends Thread { // worker thread: runs calcForChunk over each queued address range, feeding one MeasurementContainer
+        private final MeasurementContainer container; // receives every parsed record via put(cityOffset, cityNameSize, hashCode, temperature)
+        private final List<Long> begins; // begins.get(i) and ends.get(i) delimit the i-th queued chunk
+        private final List<Long> ends;
+        // 'chunks' is used only as the initial capacity of the two lists.
+        private TaskThread(MeasurementContainer container, int chunks) {
+            this.container = container;
+            this.begins = new ArrayList<>(chunks);
+            this.ends = new ArrayList<>(chunks);
+        }
+        // Queues the address range [begin, end) for run(); end is exclusive because the parse loop runs while offset < end.
+        public void addChunk(long begin, long end) {
+            begins.add(begin);
+            ends.add(end);
+        }
+        // Processes the queued ranges in the order they were added.
+        @Override
+        public void run() {
+            for (int i = 0; i < begins.size(); i++) {
+                calcForChunk(begins.get(i), ends.get(i));
+            }
+        }
+        // Delegates to the container's measurements().
+        public List<Measurement> measurements() {
+            return container.measurements();
+        }
+        // Parses records from raw addresses: name bytes up to DELIMITER, then a temperature decoded from a 4-byte word, then container.put().
+        private void calcForChunk(long offset, long end) {
+            long cityOffset;
+            int hashCode, temperature, word;
+            byte cityNameSize, b;
+            // NOTE(review): reads are not bounds-checked here; presumably every chunk ends on a record boundary — confirm in splitByChunks.
+            while (offset < end) {
+                cityOffset = offset;
+                hashCode = 0;
+                while ((b = UNSAFE.getByte(offset++)) != DELIMITER) {
+                    hashCode = hashCode * 31 + b; // 31-multiplier rolling hash; b is a signed byte, so it is sign-extended
+                }
+                cityNameSize = (byte) (offset - cityOffset - 1); // name length without the delimiter; NOTE(review): the byte cast wraps for lengths > 127
+                // Read 4 bytes after the delimiter; the branches below rewind by one or read one more byte depending on the value's length.
+                word = UNSAFE.getInt(offset);
+                offset += 4;
+
+                if ((word & TWO_NEGATIVE_DIGITS_MASK) == TWO_NEGATIVE_DIGITS_MASK) { // text "-#.#" (byte positions assume a little-endian getInt — confirm)
+                    word >>>= 8;
+                    temperature = ZERO * 11 - ((word & BYTE_MASK) * 10 + ((word >>> 16) & BYTE_MASK));
+                }
+                else if ((word & THREE_DIGITS_MASK) == THREE_DIGITS_MASK) { // text "##.#"
+                    temperature = (word & BYTE_MASK) * 100 + ((word >>> 8) & BYTE_MASK) * 10 + ((word >>> 24) & BYTE_MASK) - ZERO * 111;
+                }
+                else if ((word & TWO_DIGITS_MASK) == TWO_DIGITS_MASK) { // text "#.#"
+                    temperature = (word & BYTE_MASK) * 10 + ((word >>> 16) & BYTE_MASK) - ZERO * 11;
+                    offset--; // the value is only 3 bytes long, so give back the 4th byte that was read
+                }
+                else {
+                    // #.##- i.e. text "-##.#": drop the leading sign byte and pull a fifth byte into the top of the word
+                    word = (word >>> 8) | (UNSAFE.getByte(offset++) << 24);
+                    temperature = ZERO * 111 - ((word & BYTE_MASK) * 100 + ((word >>> 8) & BYTE_MASK) * 10 + ((word >>> 24) & BYTE_MASK));
+                }
+                offset++; // skip the single byte that follows the value (presumably the line terminator)
+                container.put(cityOffset, cityNameSize, hashCode, (short) temperature);
+            }
+        }
+    }
+}

From c926aab44455fe669aa192facb801862bb7d2d55 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bart=C5=82omiej=20Pietrzyk?=
 <plbpietrz@users.noreply.github.com>
Date: Mon, 15 Jan 2024 20:30:04 +0100
Subject: [PATCH 028/268] Initial 1brc version by plbpietrz (#219)

* Initial version

* Small result merge optimisation

* Switched from reading bytes to longs

* Reading into internal buffer, test fixes

* Licence and minor string creation optimisation

* Hash collision fix
---
 calculate_average_plbpietrz.sh                |  20 ++
 .../onebrc/CalculateAverage_plbpietrz.java    | 273 ++++++++++++++++++
 2 files changed, 293 insertions(+)
 create mode 100755 calculate_average_plbpietrz.sh
 create mode 100644 src/main/java/dev/morling/onebrc/CalculateAverage_plbpietrz.java

diff --git a/calculate_average_plbpietrz.sh b/calculate_average_plbpietrz.sh
new file mode 100755
index 000000000..bcd76ad61
--- /dev/null
+++ b/calculate_average_plbpietrz.sh
@@ -0,0 +1,20 @@
+#!/bin/sh
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+
+JAVA_OPTS="" # no extra JVM options by default; the heap cap (-Xmx99m) is passed directly on the java command line below
+java $JAVA_OPTS -Xmx99m --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_plbpietrz
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_plbpietrz.java b/src/main/java/dev/morling/onebrc/CalculateAverage_plbpietrz.java
new file mode 100644
index 000000000..9fb382582
--- /dev/null
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_plbpietrz.java
@@ -0,0 +1,273 @@
+/*
+ *  Copyright 2023 The original authors
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package dev.morling.onebrc;
+
+import java.io.BufferedOutputStream;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.io.RandomAccessFile;
+import java.io.UncheckedIOException;
+import java.nio.MappedByteBuffer;
+import java.nio.channels.FileChannel;
+import java.nio.charset.Charset;
+import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
+
+public class CalculateAverage_plbpietrz {
+
+    private static final String FILE = "./measurements.txt";
+    private static final int READ_SIZE = 1024;
+    private static final int CPU_COUNT = Runtime.getRuntime().availableProcessors();
+
+    /** Running minimum, maximum, sum and count of the temperatures recorded for one station. */
+    private static class TemperatureStats {
+        double min = 999, max = -999d; // sentinels: any reading inside (-999, 999) replaces them on the first update
+        double accumulated;
+        int count;
+        public void update(double temp) {
+            count++;
+            accumulated += temp;
+            max = Math.max(max, temp);
+            min = Math.min(min, temp);
+        }
+    }
+
+    private record FilePart(long pos, long size) { // one slice of the input file: start offset and length in bytes
+    }
+
+    /** Station name kept as raw bytes plus an incrementally built hash; a scratch instance is copied before it is stored as a map key. */
+    private static class WeatherStation {
+        private int length;
+        private int nameHash;
+        private byte[] nameBytes;
+        private String string; // lazily decoded name, reset whenever the bytes change
+
+        public WeatherStation() {
+            nameBytes = new byte[128];
+        }
+
+        public WeatherStation(WeatherStation station) {
+            this.nameBytes = Arrays.copyOf(station.nameBytes, station.length);
+            this.length = station.length;
+            this.nameHash = station.nameHash;
+        }
+
+        @Override
+        public int hashCode() {
+            return nameHash;
+        }
+
+        @Override
+        public boolean equals(Object o) {
+            if (this == o)
+                return true;
+            return o instanceof WeatherStation s
+                    && this.nameHash == s.nameHash
+                    && Arrays.equals(this.nameBytes, 0, this.length, s.nameBytes, 0, s.length);
+        }
+
+        @Override
+        public String toString() {
+            // Decode as UTF-8 explicitly; Charset.defaultCharset() follows -Dfile.encoding and may differ.
+            if (string == null)
+                string = new String(nameBytes, 0, length, java.nio.charset.StandardCharsets.UTF_8);
+            return string;
+        }
+
+        public void appendByte(byte b) {
+            string = null;
+            nameBytes[length++] = b;
+            nameHash = nameHash * 31 + b;
+        }
+
+        public void clear() {
+            this.length = 0;
+            this.nameHash = 0;
+            this.string = null;
+        }
+    }
+
+    /** Partitions the input file, parses the parts on a parallel stream, merges the per-part maps and prints the result. */
+    public static void main(String[] args) throws IOException {
+        try (RandomAccessFile inputFile = new RandomAccessFile(Path.of(FILE).toFile(), "r")) {
+            Map<WeatherStation, TemperatureStats> results = partitionInput(inputFile)
+                    .parallelStream()
+                    .map(part -> getMappedByteBuffer(part, inputFile))
+                    .map(CalculateAverage_plbpietrz::parseBuffer)
+                    .flatMap(perPart -> perPart.entrySet().stream())
+                    // group the per-part entries by station and fold every group into one statistics object
+                    .collect(
+                            Collectors.groupingBy(
+                                    Map.Entry::getKey,
+                                    Collectors.reducing(
+                                            new TemperatureStats(),
+                                            Map.Entry::getValue,
+                                            CalculateAverage_plbpietrz::mergeTemperatureStats)));
+
+            try (PrintWriter pw = new PrintWriter(new BufferedOutputStream(System.out))) {
+                formatResults(pw, results);
+            }
+        }
+    }
+
+    /** Cuts the file into newline-aligned parts, each small enough to be memory-mapped as a single buffer. */
+    private static List<FilePart> partitionInput(RandomAccessFile inputFile) throws IOException {
+        List<FilePart> fileParts = new ArrayList<>();
+        long fileLength = inputFile.length();
+        // FileChannel.map rejects sizes above Integer.MAX_VALUE; the slack covers the run-on to the next newline.
+        long blockSize = Math.min(Math.min(fileLength, Integer.MAX_VALUE - 4L * READ_SIZE), Math.max(READ_SIZE, fileLength / CPU_COUNT));
+        for (long start = 0, end; start < fileLength; start = end) {
+            end = findMinBlockOffset(inputFile, start, blockSize);
+            fileParts.add(new FilePart(start, end - start));
+        }
+        return fileParts;
+    }
+
+    /** Returns the offset just past the first newline at or after {@code startPosition + minBlockSize}, or the file length if there is none. */
+    private static long findMinBlockOffset(RandomAccessFile file, long startPosition, long minBlockSize) throws IOException {
+        long length = file.length();
+        if (startPosition + minBlockSize >= length)
+            return length;
+        file.seek(startPosition + minBlockSize);
+        int b; // read() reports end of file as -1, whereas readByte() throws EOFException on an unterminated last line
+        do {
+            b = file.read();
+        } while (b >= 0 && b != '\n');
+        return b < 0 ? length : file.getFilePointer();
+    }
+
+    private static MappedByteBuffer getMappedByteBuffer(FilePart fp, RandomAccessFile inputFile) { // read-only mapping of exactly this part
+        try {
+            return inputFile.getChannel().map(FileChannel.MapMode.READ_ONLY, fp.pos, fp.size);
+        }
+        catch (IOException e) {
+            throw new UncheckedIOException(e); // unchecked, so callers such as stream lambdas need no try/catch
+        }
+    }
+
+    /**
+     * Parses the {@code name;temperature} lines of one mapped file part into per-station statistics.
+     * Newlines met while a name is being read are skipped; a last line without a terminating newline
+     * is still recorded.
+     * @param buffer mapped part, consumed from its current position to its limit
+     * @return statistics of this part, keyed by station
+     */
+    private static Map<WeatherStation, TemperatureStats> parseBuffer(MappedByteBuffer buffer) {
+        byte[] chunk = new byte[READ_SIZE];
+        byte[] temperature = new byte[32];
+        int temperatureLength = 0;
+        boolean readingName = true;
+        Map<WeatherStation, TemperatureStats> temperatures = new HashMap<>();
+        WeatherStation station = new WeatherStation(); // scratch key, refilled for every line
+
+        while (buffer.hasRemaining()) {
+            // Copy the next chunk to the heap in one bulk read, then scan it byte by byte.
+            int bytesToRead = Math.min(READ_SIZE, buffer.remaining());
+            buffer.get(chunk, 0, bytesToRead);
+            for (int i = 0; i < bytesToRead; ++i) {
+                byte aChar = chunk[i];
+                if (aChar == '\n') {
+                    if (!readingName) {
+                        addReading(temperatures, station, parseTemperature(temperature, temperatureLength));
+                        station.clear();
+                        temperatureLength = 0;
+                        readingName = true;
+                    }
+                }
+                else if (!readingName) {
+                    temperature[temperatureLength++] = aChar;
+                }
+                else if (aChar == ';') {
+                    readingName = false;
+                }
+                else {
+                    station.appendByte(aChar);
+                }
+            }
+        }
+        // The final line of the file may lack its newline; without this it would be silently dropped.
+        if (!readingName && temperatureLength > 0) {
+            addReading(temperatures, station, parseTemperature(temperature, temperatureLength));
+        }
+        return temperatures;
+    }
+
+    /** Adds one reading for {@code station}; the scratch key is copied only when the station is first seen. */
+    private static void addReading(Map<WeatherStation, TemperatureStats> temperatures, WeatherStation station, double temp) {
+        TemperatureStats stats = temperatures.get(station);
+        if (stats == null) {
+            stats = new TemperatureStats();
+            temperatures.put(new WeatherStation(station), stats);
+        }
+        stats.update(temp);
+    }
+
+    /**
+     * Parses a decimal such as {@code -12.3} from ASCII bytes: digits are collected as a whole number and divided once by a power of ten.
+     */
+    private static double parseTemperature(byte[] temperature, int temperatureSize) {
+        double sign = 1;
+        double mantissa = 0;
+        double divisor = 1; // one division is correctly rounded; multiplying by 0.1 turned "0.3" into 0.30000000000000004
+        for (int i = 0; i < temperatureSize; ++i) {
+            byte c = temperature[i];
+            switch (c) {
+                case '-' -> sign = -1;
+                case '.' -> {
+                    for (int j = i + 1; j < temperatureSize; ++j)
+                        divisor *= 10; // one factor of ten per byte after the decimal point
+                }
+                default -> mantissa = mantissa * 10 + (c - '0');
+            }
+        }
+        return sign * mantissa / divisor;
+    }
+
+    private static TemperatureStats mergeTemperatureStats(TemperatureStats v1, TemperatureStats v2) {
+        TemperatureStats merged = new TemperatureStats(); // fresh object: neither argument is modified
+        merged.count = v1.count + v2.count;
+        merged.accumulated = v1.accumulated + v2.accumulated;
+        merged.max = Math.max(v1.max, v2.max);
+        merged.min = Math.min(v1.min, v2.min);
+        return merged;
+    }
+
+    /**
+     * Prints {@code {name=min/mean/max, ...}} sorted by station name, always with '.' as the decimal separator.
+     */
+    private static void formatResults(PrintWriter pw, Map<WeatherStation, TemperatureStats> resultsMap) {
+        var results = new ArrayList<>(resultsMap.entrySet());
+        results.sort(Comparator.comparing(e -> e.getKey().toString()));
+        pw.print('{');
+        String separator = "";
+        for (var entry : results) {
+            TemperatureStats stats = entry.getValue();
+            pw.print(separator);
+            // Locale.ROOT: the default locale may format decimals with a comma (e.g. "12,3").
+            pw.printf(java.util.Locale.ROOT, "%s=%.1f/%.1f/%.1f",
+                    entry.getKey(), stats.min, stats.accumulated / stats.count, stats.max);
+            separator = ", ";
+        }
+        pw.println('}');
+    }
+
+}

From be179dcf07833c1163e9c6f887dabb7084e2d016 Mon Sep 17 00:00:00 2001
From: Thomas Wuerthinger <thomas.wuerthinger@oracle.com>
Date: Mon, 15 Jan 2024 20:43:12 +0100
Subject: [PATCH 029/268] Improve scheduling for thomaswue (#358)

* Improve scheduling for another 6%.

* Tune hash function and collision handling.
---
 .../onebrc/CalculateAverage_thomaswue.java    | 51 +++++++++++++------
 1 file changed, 35 insertions(+), 16 deletions(-)

diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_thomaswue.java b/src/main/java/dev/morling/onebrc/CalculateAverage_thomaswue.java
index 10e92fc6d..041c17ca9 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_thomaswue.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_thomaswue.java
@@ -32,10 +32,10 @@
  * Simple solution that memory maps the input file, then splits it into one segment per available core and uses
  * sun.misc.Unsafe to directly access the mapped memory. Uses a long at a time when checking for collision.
  * <p>
- * Runs in 0.66s on my Intel i9-13900K
+ * Runs in 0.60s on my Intel i9-13900K
  * Perf stats:
- *     35,935,262,091      cpu_core/cycles/
- *     47,305,591,173      cpu_atom/cycles/
+ *     34,716,719,245      cpu_core/cycles/
+ *     40,776,530,892      cpu_atom/cycles/
  */
 public class CalculateAverage_thomaswue {
     private static final String FILE = "./measurements.txt";
@@ -112,25 +112,33 @@ private static TreeMap<String, Result> accumulateResults(List<List<Result>> allR
 
     // Main parse loop.
     private static Result[] parseLoop(long chunkStart, long chunkEnd) {
-        Result[] results = new Result[1 << 18];
+        Result[] results = new Result[1 << 17];
         Scanner scanner = new Scanner(chunkStart, chunkEnd);
+        long word = scanner.getLong();
+        int pos = findDelimiter(word);
         while (scanner.hasNext()) {
             long nameAddress = scanner.pos();
             long hash = 0;
 
             // Search for ';', one long at a time.
-            long word = scanner.getLong();
-            int pos = findDelimiter(word);
             if (pos != 8) {
                 scanner.add(pos);
                 word = mask(word, pos);
-                hash ^= word;
+                hash = word;
+
+                int number = scanNumber(scanner);
+                long nextWord = scanner.getLong();
+                int nextPos = findDelimiter(nextWord);
 
                 Result existingResult = results[hashToIndex(hash, results)];
                 if (existingResult != null && existingResult.lastNameLong == word) {
-                    scanAndRecord(scanner, existingResult);
+                    word = nextWord;
+                    pos = nextPos;
+                    record(existingResult, number);
                     continue;
                 }
+
+                scanner.setPos(nameAddress + pos);
             }
             else {
                 scanner.add(8);
@@ -142,9 +150,13 @@ private static Result[] parseLoop(long chunkStart, long chunkEnd) {
                     scanner.add(pos);
                     word = mask(word, pos);
                     hash ^= word;
+
                     Result existingResult = results[hashToIndex(hash, results)];
                     if (existingResult != null && existingResult.lastNameLong == word && existingResult.secondLastNameLong == prevWord) {
-                        scanAndRecord(scanner, existingResult);
+                        int number = scanNumber(scanner);
+                        word = scanner.getLong();
+                        pos = findDelimiter(word);
+                        record(existingResult, number);
                         continue;
                     }
                 }
@@ -188,7 +200,7 @@ private static Result[] parseLoop(long chunkStart, long chunkEnd) {
                 int i = 0;
                 for (; i < nameLength + 1 - 8; i += 8) {
                     if (scanner.getLongAt(existingResult.nameAddress + i) != scanner.getLongAt(nameAddress + i)) {
-                        tableIndex = (tableIndex + 1) & (results.length - 1);
+                        tableIndex = (tableIndex + 31) & (results.length - 1);
                         continue outer;
                     }
                 }
@@ -198,20 +210,23 @@ private static Result[] parseLoop(long chunkStart, long chunkEnd) {
                 }
                 else {
                     // Collision error, try next.
-                    tableIndex = (tableIndex + 1) & (results.length - 1);
+                    tableIndex = (tableIndex + 31) & (results.length - 1);
                 }
             }
+
+            word = scanner.getLong();
+            pos = findDelimiter(word);
         }
         return results;
     }
 
-    private static void scanAndRecord(Scanner scanPtr, Result existingResult) {
+    private static int scanNumber(Scanner scanPtr) {
         scanPtr.add(1);
         long numberWord = scanPtr.getLong();
         int decimalSepPos = Long.numberOfTrailingZeros(~numberWord & 0x10101000);
         int number = convertIntoNumber(decimalSepPos, numberWord);
         scanPtr.add((decimalSepPos >>> 3) + 3);
-        record(existingResult, number);
+        return number;
     }
 
     private static void record(Result existingResult, int number) {
@@ -222,8 +237,8 @@ private static void record(Result existingResult, int number) {
     }
 
     private static int hashToIndex(long hash, Result[] results) {
-        int hashAsInt = (int) (hash ^ (hash >>> 32));
-        int finalHash = (hashAsInt ^ (hashAsInt >>> 18));
+        int hashAsInt = (int) (hash ^ (hash >>> 28));
+        int finalHash = (hashAsInt ^ (hashAsInt >>> 15));
         return (finalHash & (results.length - 1));
     }
 
@@ -344,5 +359,9 @@ public String getString(int nameLength) {
             UNSAFE.copyMemory(null, pos, bytes, Unsafe.ARRAY_BYTE_BASE_OFFSET, nameLength);
             return new String(bytes, StandardCharsets.UTF_8);
         }
+
+        public void setPos(long l) { // moves the scanner to the given position, replacing the current one
+            this.pos = l;
+        }
     }
-}
+}
\ No newline at end of file

From 6df2863cfcfb262b4abb3c56a84ba847f016d477 Mon Sep 17 00:00:00 2001
From: Gunnar Morling <gunnar.morling@googlemail.com>
Date: Mon, 15 Jan 2024 20:43:50 +0100
Subject: [PATCH 030/268] Leaderboard update

---
 README.md | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 6e765cf74..88c356e5e 100644
--- a/README.md
+++ b/README.md
@@ -41,12 +41,12 @@ These are the results from running all entries into the challenge on eight cores
 
 | # | Result (m:s.ms) | Implementation     | JDK | Submitter     | Notes     |
 |---|-----------------|--------------------|-----|---------------|-----------|
+| 1 | 00:02.552 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_thomaswue.java)| 21.0.1-graal | [Thomas Wuerthinger](https://github.com/thomaswue), [Quan Anh Mai](https://github.com/merykitty), [Alfonso² Peterssen](https://github.com/mukel) | GraalVM native binary |
 | 1 | 00:02.575 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_merykittyunsafe.java)| 21.0.1-open | [Quan Anh Mai](https://github.com/merykitty) | Quan Anh Mai's implementation, using `Unsafe` |
-| 2 | 00:02.708 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_thomaswue.java)| 21.0.1-graal | [Thomas Wuerthinger](https://github.com/thomaswue), [Quan Anh Mai](https://github.com/merykitty), [Alfonso² Peterssen](https://github.com/mukel) | GraalVM native binary |
 | 3 | 00:02.855 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_royvanrijn.java)| 21.0.1-graal | [Roy van Rijn](https://github.com/royvanrijn) | GraalVM native binary |
+|   | 00:02.871 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java)| 21.0.1-graal | [Artsiom Korzun](https://github.com/artsiomkorzun) |  |
 |   | 00:02.926 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java)| 21.0.1-open | [Van Phu DO](https://github.com/abeobk) |  |
 |   | 00:03.258 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_merykitty.java)| 21.0.1-open | [Quan Anh Mai](https://github.com/merykitty) |  |
-|   | 00:03.321 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java)| 21.0.1-graal | [Artsiom Korzun](https://github.com/artsiomkorzun) |  |
 |   | 00:03.376 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_mtopolnik.java)| 21.0.1-graal | [Marko Topolnik](https://github.com/mtopolnik) |  |
 |   | 00:03.409 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jerrinot.java)| 21.0.1-graal | [Jaromir Hamala](https://github.com/jerrinot) |  |
 |   | 00:03.714 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_hundredwatt.java)| 21.0.1-graal | [Jason Nochlin](https://github.com/hundredwatt) |  |
@@ -55,12 +55,12 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:04.741 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_cliffclick.java)| 21.0.1-open | [Cliff Click](https://github.com/cliffclick) |  |
 |   | 00:04.823 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_JamalMulla.java)| 21.0.1-graal | [Jamal Mulla](https://github.com/JamalMulla) |  |
 |   | 00:04.959 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_yavuztas.java)| 21.0.1-graal | [Yavuz Tas](https://github.com/yavuztas) |  |
+|   | 00:05.142 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_arjenw.java)| 21.0.1-open | [Arjen Wisse](https://github.com/arjenw) |  |
+|   | 00:05.181 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_vemana.java)| 21.0.1-graal | [Subrahmanyam](https://github.com/vemana) |  |
 |   | 00:05.218 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_iziamos.java)| 21.0.1-open | [John Ziamos](https://github.com/iziamos) |  |
 |   | 00:05.235 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_unbounded.java)| 21.0.1-open | [unbounded](https://github.com/unbounded) |  |
-|   | 00:05.339 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_arjenw.java)| 21.0.1-open | [Arjen Wisse](https://github.com/arjenw) |  |
+|   | 00:05.297 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_zerninv.java)| 21.0.1-open | [zerninv](https://github.com/zerninv) |  |
 |   | 00:05.478 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_obourgain.java)| 21.0.1-open | [Olivier Bourgain](https://github.com/obourgain) |  |
-|   | 00:05.530 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_vemana.java)| 21.0.1-graal | [Subrahmanyam](https://github.com/vemana) |  |
-|   | 00:05.351 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_zerninv.java)| 21.0.1-open | [zerninv](https://github.com/zerninv) |  |
 |   | 00:05.887 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_charlibot.java)| 21.0.1-graal | [Charlie Evans](https://github.com/charlibot) |  |
 |   | 00:05.960 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_vaidhy.java)| 21.0.1-graal | [Vaidhy Mayilrangam](https://github.com/vaidhy) |  |
 |   | 00:05.979 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_spullara.java)| 21.0.1-graal | [Sam Pullara](https://github.com/spullara) |  |
@@ -114,6 +114,7 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:17.255 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_tkosachev.java)| 21.0.1-open | [tkosachev](https://github.com/tkosachev) |  |
 |   | 00:17.717 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_omarchenko4j.java)| 21.0.1-open | [Oleh Marchenko](https://github.com/omarchenko4j) |  |
 |   | 00:17.815 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_hallvard.java)| 21.0.1-open | [Hallvard Trætteberg](https://github.com/hallvard) |  |
+|   | 00:17.932 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_plbpietrz.java)| 21.0.1-open | [Bartłomiej Pietrzyk](https://github.com/plbpietrz) |  |
 |   | 00:18.251 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_seijikun.java)| 21.0.1-graal | [Markus Ebner](https://github.com/seijikun) |  |
 |   | 00:18.313 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jgrateron.java)| 21.0.1-open | [Jairo Graterón](https://github.com/jgrateron) |  |
 |   | 00:18.448 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_moysesb.java)| 21.0.1-open | [Moysés Borges Furtado](https://github.com/moysesb) |  |

From b7c24f95cdd8fb0f003c70dd6e5ebc3c87462273 Mon Sep 17 00:00:00 2001
From: Jin Cong Ho <jincongho@gmail.com>
Date: Mon, 15 Jan 2024 19:48:32 +0000
Subject: [PATCH 031/268] Submission #2: jincongho (#416)

---
 calculate_average_jincongho.sh                |   3 +-
 .../onebrc/CalculateAverage_jincongho.java    | 366 ++++++++++++++----
 2 files changed, 297 insertions(+), 72 deletions(-)

diff --git a/calculate_average_jincongho.sh b/calculate_average_jincongho.sh
index ec1ca426b..8edda54dd 100755
--- a/calculate_average_jincongho.sh
+++ b/calculate_average_jincongho.sh
@@ -15,6 +15,7 @@
 #  limitations under the License.
 #
 
-JAVA_OPTS="--enable-preview --enable-native-access=ALL-UNNAMED"
+JAVA_OPTS="--enable-preview --add-modules=jdk.incubator.vector --enable-native-access=ALL-UNNAMED"
 JAVA_OPTS="$JAVA_OPTS -XX:-TieredCompilation -XX:InlineSmallCode=10000 -XX:FreqInlineSize=10000"
+JAVA_OPTS="$JAVA_OPTS -Djdk.incubator.vector.VECTOR_ACCESS_OOB_CHECK=0"
 java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_jincongho
\ No newline at end of file
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_jincongho.java b/src/main/java/dev/morling/onebrc/CalculateAverage_jincongho.java
index 01220ffbc..d2a7e6609 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_jincongho.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_jincongho.java
@@ -15,12 +15,16 @@
  */
 package dev.morling.onebrc;
 
+import jdk.incubator.vector.ByteVector;
+import jdk.incubator.vector.VectorOperators;
+import jdk.incubator.vector.VectorSpecies;
 import sun.misc.Unsafe;
 
 import java.io.IOException;
 import java.lang.foreign.Arena;
 import java.lang.foreign.MemorySegment;
 import java.lang.reflect.Field;
+import java.nio.ByteOrder;
 import java.nio.channels.FileChannel;
 import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
@@ -36,6 +40,7 @@
  * Parse key as byte vs string      30000 ms
  * Parse temp as fixed vs double    15000 ms
  * HashMap optimization             10000 ms
+ * Simd + reduce memory copy         8000 ms
  *
  */
 public class CalculateAverage_jincongho {
@@ -55,6 +60,115 @@ private static Unsafe initUnsafe() {
         }
     }
 
+    /**
+     * Vectorization utilities with 1BRC-specific optimizations
+     */
+    protected static class VectorUtils {
+
+        // key length is usually less than 32 bytes, having more is just expensive
+        public static final VectorSpecies<Byte> BYTE_SPECIES = ByteVector.SPECIES_256;
+
+        /** Vectorized field delimiter search **/
+
+        public static int findDelimiter(MemorySegment data, long offset) {
+            // firstTrue() yields the lane index of the first ';', or the lane count when the loaded window holds none
+            ByteVector window = ByteVector.fromMemorySegment(BYTE_SPECIES, data, offset, ByteOrder.nativeOrder());
+            return window.compare(VectorOperators.EQ, ';').firstTrue();
+        }
+
+        /** Vectorized Hashing (explicit vectorization seems slower, overkill?) **/
+
+        // private static int[] HASH_ARRAY = initHashArray();
+        // private static final IntVector HASH_VECTOR = IntVector.fromArray(IntVector.SPECIES_256, HASH_ARRAY, 0);
+        // private static final int HASH_ACCUM = HASH_ARRAY[0] * 31;
+        //
+        // private static int[] initHashArray() {
+        // int[] x = new int[IntVector.SPECIES_256.length()];
+        // x[x.length - 1] = 1;
+        // for (int i = x.length - 2; i >= 0; i--)
+        // x[i] = x[i + 1] * 31;
+        //
+        // return x;
+        // }
+
+        /**
+         * Ref: https://github.com/PaulSandoz/vector-api-dev-live-10-2021/blob/main/src/main/java/jmh/BytesHashcode.java
+         *
+         * Essentially we are doing this calculation:
+         * h = h * 31 * 31 * 31 * 31 * 31 * 31 * 31 * 31 +
+         *         a[i + 0] * 31 * 31 * 31 * 31 * 31 * 31 * 31 +
+         *         a[i + 1] * 31 * 31 * 31 * 31 * 31 * 31 +
+         *         a[i + 2] * 31 * 31 * 31 * 31 * 31 +
+         *         a[i + 3] * 31 * 31 * 31 * 31 +
+         *         a[i + 4] * 31 * 31 * 31 +
+         *         a[i + 5] * 31 * 31 +
+         *         a[i + 6] * 31 +
+         *         a[i + 7];
+         */
+        // public static int hashCode(MemorySegment array, long offset, short length) {
+        // int h = 1;
+        // long i = offset, loopBound = offset + ByteVector.SPECIES_64.loopBound(length), tailBound = offset + length;
+        // for (; i < loopBound; i += ByteVector.SPECIES_64.length()) {
+        // // load 8 bytes, into a 64-bit vector
+        // ByteVector b = ByteVector.fromMemorySegment(ByteVector.SPECIES_64, array, i, ByteOrder.nativeOrder());
+        // // convert 8 bytes into 8 ints (hashing calculation needs int!)
+        // IntVector x = (IntVector) b.castShape(IntVector.SPECIES_256, 0);
+        // h = h * HASH_ACCUM + x.mul(HASH_VECTOR).reduceLanes(VectorOperators.ADD);
+        // }
+        //
+        // for (; i < tailBound; i++) {
+        // h = 31 * h + array.get(ValueLayout.JAVA_BYTE, i);
+        // }
+        // return h;
+        // }
+
+        // scalar hash: h = 31 * h + byte, seeded with 1; bytes are read via Unsafe at array.address() + i (assumes a native segment - TODO confirm)
+        public static int hashCode(final MemorySegment array, final long offset, final short length) {
+            final long base = array.address();
+            int h = 1;
+            for (long i = offset, limit = offset + length; i < limit; i++) {
+                h = 31 * h + UNSAFE.getByte(base + i);
+            }
+            return h;
+        }
+
+        /**
+         * Vectorized key comparison: reports whether the {@code length} bytes at {@code aOffset} in {@code a}
+         * differ from those at {@code bOffset} in {@code b}.
+         */
+        private static boolean notEquals(MemorySegment a, long aOffset, MemorySegment b, long bOffset, short length, VectorSpecies<Byte> BYTE_SPECIES) {
+            final long aLimit = aOffset + length, bLimit = bOffset + length;
+
+            // main loop: whole vectors, loaded without a mask
+            long loopBound = bOffset + BYTE_SPECIES.loopBound(length);
+            for (; bOffset < loopBound; aOffset += BYTE_SPECIES.length(), bOffset += BYTE_SPECIES.length()) {
+                ByteVector av = ByteVector.fromMemorySegment(BYTE_SPECIES, a, aOffset, ByteOrder.nativeOrder());
+                ByteVector bv = ByteVector.fromMemorySegment(BYTE_SPECIES, b, bOffset, ByteOrder.nativeOrder());
+                if (av.compare(VectorOperators.NE, bv).anyTrue())
+                    return true;
+            }
+
+            // tail cleanup - load last N bytes with mask
+            if (bOffset < bLimit) {
+                ByteVector av = ByteVector.fromMemorySegment(BYTE_SPECIES, a, aOffset, ByteOrder.nativeOrder(), BYTE_SPECIES.indexInRange(aOffset, aLimit));
+                ByteVector bv = ByteVector.fromMemorySegment(BYTE_SPECIES, b, bOffset, ByteOrder.nativeOrder(), BYTE_SPECIES.indexInRange(bOffset, bLimit));
+                if (av.compare(VectorOperators.NE, bv).anyTrue())
+                    return true;
+            }
+
+            return false;
+        }
+
+        // scalar implementation
+        // private static boolean equals(byte[] a, int aOffset, byte[] b, int bOffset, int len) {
+        // while (bOffset < len)
+        // if (a[aOffset++] != b[bOffset++])
+        // return false;
+        // return true;
+        // }
+
+    }
+
     /**
      * Measurement Hash Table (for each partition)
      * Uses contiguous byte array to optimize for cache-line (hopefully)
@@ -70,26 +184,27 @@ protected static class PartitionAggr {
         private static int KEY_MASK = (MAP_SIZE - 1);
         private static int VALUE_SIZE = 16; // min (2 bytes) + max ( 2 bytes) + count (4 bytes) + sum (8 bytes)
 
-        private byte[] KEYS = new byte[MAP_SIZE * KEY_SIZE];
-        private byte[] VALUES = new byte[MAP_SIZE * VALUE_SIZE];
+        private MemorySegment KEYS = Arena.ofShared().allocate(MAP_SIZE * KEY_SIZE, 64);
+        private MemorySegment VALUES = Arena.ofShared().allocate(MAP_SIZE * VALUE_SIZE, 16);
 
         public PartitionAggr() {
             // init min and max
-            for (int offset = UNSAFE.ARRAY_BYTE_BASE_OFFSET; offset < UNSAFE.ARRAY_BYTE_BASE_OFFSET + (MAP_SIZE * VALUE_SIZE); offset += VALUE_SIZE) {
-                UNSAFE.putShort(VALUES, offset, Short.MAX_VALUE);
-                UNSAFE.putShort(VALUES, offset + 2, Short.MIN_VALUE);
+            final long limit = VALUES.address() + (MAP_SIZE * VALUE_SIZE);
+            for (long offset = VALUES.address(); offset < limit; offset += VALUE_SIZE) {
+                UNSAFE.putShort(offset, Short.MAX_VALUE);
+                UNSAFE.putShort(offset + 2, Short.MIN_VALUE);
             }
         }
 
-        public void update(byte[] key, int hash, short keyLength, short value) {
-            int index = hash & KEY_MASK;
-            int keyOffset = UNSAFE.ARRAY_BYTE_BASE_OFFSET + (index * KEY_SIZE);
-            while (((UNSAFE.getShort(KEYS, keyOffset) != keyLength) ||
-                    !equals(KEYS, ((index * KEY_SIZE) + 2), key, 0, keyLength))) {
-                if (UNSAFE.getShort(KEYS, keyOffset) == 0) {
+        public void update(MemorySegment key, long keyStart, short keyLength, int keyHash, short value) {
+            int index = keyHash & KEY_MASK;
+            long keyOffset = KEYS.address() + (index * KEY_SIZE);
+            while (((UNSAFE.getShort(keyOffset) != keyLength) ||
+                    VectorUtils.notEquals(KEYS, ((index * KEY_SIZE) + 2), key, keyStart, keyLength, VectorUtils.BYTE_SPECIES))) {
+                if (UNSAFE.getShort(keyOffset) == 0) {
                     // put key
-                    UNSAFE.putShort(KEYS, keyOffset, keyLength);
-                    UNSAFE.copyMemory(key, UNSAFE.ARRAY_BYTE_BASE_OFFSET, KEYS, keyOffset + 2, keyLength);
+                    UNSAFE.putShort(keyOffset, keyLength);
+                    MemorySegment.copy(key, keyStart, KEYS, (index * KEY_SIZE) + 2, keyLength);
                     break;
                 }
                 else {
@@ -98,21 +213,14 @@ public void update(byte[] key, int hash, short keyLength, short value) {
                 }
             }
 
-            long valueOffset = UNSAFE.ARRAY_BYTE_BASE_OFFSET + (index * VALUE_SIZE);
-            UNSAFE.putShort(VALUES, valueOffset, (short) Math.min(UNSAFE.getShort(VALUES, valueOffset), value));
+            long valueOffset = VALUES.address() + (index * VALUE_SIZE);
+            UNSAFE.putShort(valueOffset, (short) Math.min(UNSAFE.getShort(valueOffset), value));
             valueOffset += 2;
-            UNSAFE.putShort(VALUES, valueOffset, (short) Math.max(UNSAFE.getShort(VALUES, valueOffset), value));
+            UNSAFE.putShort(valueOffset, (short) Math.max(UNSAFE.getShort(valueOffset), value));
             valueOffset += 2;
-            UNSAFE.putInt(VALUES, valueOffset, UNSAFE.getInt(VALUES, valueOffset) + 1);
+            UNSAFE.putInt(valueOffset, UNSAFE.getInt(valueOffset) + 1);
             valueOffset += 4;
-            UNSAFE.putLong(VALUES, valueOffset, UNSAFE.getLong(VALUES, valueOffset) + value);
-        }
-
-        private boolean equals(byte[] a, int aOffset, byte[] b, int bOffset, int len) {
-            while (bOffset < len)
-                if (a[aOffset++] != b[bOffset++])
-                    return false;
-            return true;
+            UNSAFE.putLong(valueOffset, UNSAFE.getLong(valueOffset) + value);
         }
 
         public void mergeTo(ResultAggr result) {
@@ -120,24 +228,22 @@ public void mergeTo(ResultAggr result) {
             short keyLength;
             for (int i = 0; i < MAP_SIZE; i++) {
                 // extract key
-                keyOffset = UNSAFE.ARRAY_BYTE_BASE_OFFSET + (i * KEY_SIZE);
-                if ((keyLength = UNSAFE.getShort(KEYS, keyOffset)) == 0)
+                keyOffset = KEYS.address() + (i * KEY_SIZE);
+                if ((keyLength = UNSAFE.getShort(keyOffset)) == 0)
                     continue;
 
                 // extract values (if key is not null)
-                final long valueOffset = UNSAFE.ARRAY_BYTE_BASE_OFFSET + (i * VALUE_SIZE);
-                result.compute(new String(KEYS, (i * KEY_SIZE) + 2, keyLength, StandardCharsets.UTF_8), (k, v) -> {
-                    short min = UNSAFE.getShort(VALUES, valueOffset);
-                    short max = UNSAFE.getShort(VALUES, valueOffset + 2);
-                    int count = UNSAFE.getInt(VALUES, valueOffset + 4);
-                    long sum = UNSAFE.getLong(VALUES, valueOffset + 8);
-
+                final long valueOffset = VALUES.address() + (i * VALUE_SIZE);
+                result.compute(new ResultAggr.ByteKey(KEYS, (i * KEY_SIZE) + 2, keyLength), (k, v) -> {
                     if (v == null) {
-                        return new ResultAggr.Measurement(min, max, count, sum);
-                    }
-                    else {
-                        return v.update(min, max, count, sum);
+                        v = new ResultAggr.Measurement();
                     }
+                    v.min = (short) Math.min(UNSAFE.getShort(valueOffset), v.min);
+                    v.max = (short) Math.max(UNSAFE.getShort(valueOffset + 2), v.max);
+                    v.count += UNSAFE.getInt(valueOffset + 4);
+                    v.sum += UNSAFE.getLong(valueOffset + 8);
+
+                    return v;
                 });
             }
         }
@@ -148,30 +254,56 @@ public void mergeTo(ResultAggr result) {
      * Measurement Aggregation (for all partitions)
      * Simple Concurrent Hash Table so all partitions can merge concurrently
      */
-    protected static class ResultAggr extends ConcurrentHashMap<String, ResultAggr.Measurement> {
+    protected static class ResultAggr extends ConcurrentHashMap<ResultAggr.ByteKey, ResultAggr.Measurement> {
+
+        public static class ByteKey implements Comparable<ByteKey> {
+            private final MemorySegment data;
+            private final long offset;
+            private final short length;
+            private String str;
+
+            public ByteKey(MemorySegment data, long offset, short length) {
+                this.data = data;
+                this.offset = offset;
+                this.length = length;
+            }
 
-        protected static class Measurement {
-            public short min;
-            public short max;
-            public int count;
-            public long sum;
-
-            public Measurement(short min, short max, int count, long sum) {
-                this.min = min;
-                this.max = max;
-                this.count = count;
-                this.sum = sum;
+            @Override
+            public boolean equals(Object other) {
+                if (length != ((ByteKey) other).length)
+                    return false;
+
+                return !VectorUtils.notEquals(data, offset, ((ByteKey) other).data, ((ByteKey) other).offset, length, VectorUtils.BYTE_SPECIES);
             }
 
-            public ResultAggr.Measurement update(short min, short max, int count, long sum) {
-                this.min = (short) Math.min(min, this.min);
-                this.max = (short) Math.max(max, this.max);
-                this.count += count;
-                this.sum += sum;
+            @Override
+            public int hashCode() {
+                return VectorUtils.hashCode(data, offset, length);
+            }
 
-                return this;
+            @Override
+            public String toString() {
+                if (str == null) {
+                    // finally has to do a copy!
+                    byte[] copy = new byte[length];
+                    MemorySegment.copy(data, offset, MemorySegment.ofArray(copy), 0, length);
+                    str = new String(copy, StandardCharsets.UTF_8);
+                }
+                return str;
             }
 
+            @Override
+            public int compareTo(ByteKey o) {
+                return toString().compareTo(o.toString());
+            }
+        }
+
+        protected static class Measurement {
+            public short min = Short.MAX_VALUE;
+            public short max = Short.MIN_VALUE;
+            public int count = 0;
+            public long sum = 0;
+
             @Override
             public String toString() {
                 return ((double) min / 10) + "/" + (Math.round((1.0 * sum) / count) / 10.0) + "/" + ((double) max / 10);
@@ -179,6 +311,10 @@ public String toString() {
 
         }
 
+        public ResultAggr(int initialCapacity, float loadFactor, int concurrencyLevel) {
+            super(initialCapacity, loadFactor, concurrencyLevel);
+        }
+
         public Map toSorted() {
             return new TreeMap(this);
         }
@@ -194,8 +330,8 @@ protected static class Partition implements Runnable {
 
         public Partition(MemorySegment data, long offset, long limit, ResultAggr result) {
             this.data = data;
-            this.offset = data.address() + offset;
-            this.limit = data.address() + limit;
+            this.offset = offset;
+            this.limit = limit;
             this.result = result;
         }
 
@@ -203,25 +339,57 @@ public Partition(MemorySegment data, long offset, long limit, ResultAggr result)
         public void run() {
             // measurement parsing
             PartitionAggr aggr = new PartitionAggr();
-            byte[] stationName = new byte[128];
-            short stationLength;
-            int hash;
-            byte tempBuffer;
+
+            // main loop (vectorized)
+            final long loopLimit = limit - (VectorUtils.BYTE_SPECIES.length() * Math.ceilDiv(100, VectorUtils.BYTE_SPECIES.length()) + Long.BYTES);
+            while (offset < loopLimit) {
+                long offsetStart = offset;
+
+                // find station name upto ";"
+                int found;
+                do {
+                    found = VectorUtils.findDelimiter(data, offset);
+                    offset += found;
+                } while (found == VectorUtils.BYTE_SPECIES.length());
+                short stationLength = (short) (offset - offsetStart);
+                int stationHash = VectorUtils.hashCode(data, offsetStart, stationLength);
+
+                // find measurement upto "\n" (credit: merykitty)
+                long numberBits = UNSAFE.getLong(data.address() + ++offset);
+                final long invNumberBits = ~numberBits;
+                final int decimalSepPos = Long.numberOfTrailingZeros(invNumberBits & 0x10101000);
+
+                int shift = 28 - decimalSepPos;
+                long signed = (invNumberBits << 59) >> 63;
+                long designMask = ~(signed & 0xFF);
+                long digits = ((numberBits & designMask) << shift) & 0x0F000F0F00L;
+                long absValue = ((digits * 0x640a0001) >>> 32) & 0x3FF;
+
+                short fixed = (short) ((absValue ^ signed) - signed);
+                offset += (decimalSepPos >>> 3) + 3;
+
+                // update measurement
+                aggr.update(data, offsetStart, stationLength, stationHash, fixed);
+            }
+
+            // tail loop (simple)
             while (offset < limit) {
+                long offsetStart = offset;
+
                 // find station name upto ";"
-                hash = 1;
-                stationLength = 0;
-                while ((stationName[stationLength] = UNSAFE.getByte(offset++)) != ';')
-                    hash = hash * 31 + stationName[stationLength++];
+                short stationLength = 0;
+                while (UNSAFE.getByte(data.address() + offset++) != ';')
+                    stationLength++;
+                int stationHash = VectorUtils.hashCode(data, offsetStart, stationLength);
 
                 // find measurement upto "\n"
-                tempBuffer = UNSAFE.getByte(offset++);
+                byte tempBuffer = UNSAFE.getByte(data.address() + offset++);
                 boolean isNegative = (tempBuffer == '-');
                 short fixed = (short) (isNegative ? 0 : (tempBuffer - '0'));
                 while (true) {
-                    tempBuffer = UNSAFE.getByte(offset++);
+                    tempBuffer = UNSAFE.getByte(data.address() + offset++);
                     if (tempBuffer == '.') {
-                        fixed = (short) (fixed * 10 + (UNSAFE.getByte(offset) - '0'));
+                        fixed = (short) (fixed * 10 + (UNSAFE.getByte(data.address() + offset) - '0'));
                         offset += 2;
                         break;
                     }
@@ -230,7 +398,7 @@ public void run() {
                 fixed = isNegative ? (short) -fixed : fixed;
 
                 // update measurement
-                aggr.update(stationName, hash, stationLength, fixed);
+                aggr.update(data, offsetStart, stationLength, stationHash, fixed);
             }
 
             // measurement result collection
@@ -259,13 +427,15 @@ public static void main(String[] args) throws IOException, InterruptedException
                     partition[i + 1] = data.byteSize();
                     break;
                 }
+
+                // note: vectorize this made performance worse :(
                 while (UNSAFE.getByte(data.address() + partition[i + 1]++) != '\n')
                     ;
             }
 
             // partition aggregation
             var threadList = new Thread[processors];
-            ResultAggr result = new ResultAggr();
+            ResultAggr result = new ResultAggr(1 << 14, 1, processors);
             for (int i = 0; i < processors; i++) {
                 threadList[i] = new Thread(new Partition(data, partition[i], partition[i + 1], result));
                 threadList[i].start();
@@ -282,4 +452,58 @@ public static void main(String[] args) throws IOException, InterruptedException
 
     }
 
+    /** Unit Tests **/
+
+    public static void testMain(String[] args) {
+        testHashCode();
+        testNotEquals();
+    }
+
+    private static void testHashCode() {
+        // test key length from 1 to 100
+        for (int i = 1; i <= 100; i++) {
+            byte[] array = new byte[i];
+            for (int j = 0; j < i; j++)
+                array[j] = (byte) j;
+
+            // compare with java default implementation
+            assertTrue(VectorUtils.hashCode(MemorySegment.ofArray(array), 0, (short) i) == Arrays.hashCode(array));
+        }
+    }
+
+    private static void testNotEquals() {
+        byte[] a = new byte[128];
+        byte[] b = new byte[128];
+
+        // all equals
+        for (int i = 1; i < 100; i++) {
+            a[(i + 2) - 1] = 0;
+            b[i - 1] = 0;
+            a[(i + 2)] = 10;
+            b[i] = 10;
+            assertTrue(!VectorUtils.notEquals(MemorySegment.ofArray(a), 2, MemorySegment.ofArray(b), 0, (short) 100, ByteVector.SPECIES_64));
+            assertTrue(!VectorUtils.notEquals(MemorySegment.ofArray(a), 2, MemorySegment.ofArray(b), 0, (short) 100, ByteVector.SPECIES_128));
+            assertTrue(!VectorUtils.notEquals(MemorySegment.ofArray(a), 2, MemorySegment.ofArray(b), 0, (short) 100, ByteVector.SPECIES_256));
+            assertTrue(!VectorUtils.notEquals(MemorySegment.ofArray(a), 2, MemorySegment.ofArray(b), 0, (short) 100, ByteVector.SPECIES_512));
+        }
+
+        // one el not equals
+        for (int i = 1; i < 100; i++) {
+            a[(i + 2) - 1] = 0;
+            b[i - 1] = 0;
+            a[(i + 2)] = 20;
+            b[i] = 10;
+            assertTrue(VectorUtils.notEquals(MemorySegment.ofArray(a), 2, MemorySegment.ofArray(b), 0, (short) 100, ByteVector.SPECIES_64));
+            assertTrue(VectorUtils.notEquals(MemorySegment.ofArray(a), 2, MemorySegment.ofArray(b), 0, (short) 100, ByteVector.SPECIES_128));
+            assertTrue(VectorUtils.notEquals(MemorySegment.ofArray(a), 2, MemorySegment.ofArray(b), 0, (short) 100, ByteVector.SPECIES_256));
+            assertTrue(VectorUtils.notEquals(MemorySegment.ofArray(a), 2, MemorySegment.ofArray(b), 0, (short) 100, ByteVector.SPECIES_512));
+        }
+    }
+
+    private static void assertTrue(boolean condition) {
+        if (!condition) {
+            throw new RuntimeException("Failed test");
+        }
+    }
+
 }

From 07ac6a53c365cbbb6ea7ce4d2fc2b12b771b604e Mon Sep 17 00:00:00 2001
From: Farid <faridmammadov@outlook.com>
Date: Tue, 16 Jan 2024 00:01:16 +0400
Subject: [PATCH 032/268] CalculateAverage_faridtmammadov (#406)

* create calculate average frd

* rename to match github username

* add license header

* make script executable

---------

Co-authored-by: Farid Mammadov <farid.mammadov@simbrella.com>
---
 calculate_average_faridtmammadov.sh           |  21 ++
 .../CalculateAverage_faridtmammadov.java      | 203 ++++++++++++++++++
 2 files changed, 224 insertions(+)
 create mode 100755 calculate_average_faridtmammadov.sh
 create mode 100644 src/main/java/dev/morling/onebrc/CalculateAverage_faridtmammadov.java

diff --git a/calculate_average_faridtmammadov.sh b/calculate_average_faridtmammadov.sh
new file mode 100755
index 000000000..c521e9a1b
--- /dev/null
+++ b/calculate_average_faridtmammadov.sh
@@ -0,0 +1,21 @@
+#!/bin/sh
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+
+JAVA_OPTS="--enable-preview"
+java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_faridtmammadov
+
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_faridtmammadov.java b/src/main/java/dev/morling/onebrc/CalculateAverage_faridtmammadov.java
new file mode 100644
index 000000000..f4b920bb0
--- /dev/null
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_faridtmammadov.java
@@ -0,0 +1,203 @@
+/*
+ *  Copyright 2023 The original authors
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package dev.morling.onebrc;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.lang.foreign.Arena;
+import java.lang.foreign.MemorySegment;
+import java.lang.foreign.ValueLayout;
+import java.nio.channels.FileChannel;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Path;
+import java.nio.file.StandardOpenOption;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.TreeMap;
+import java.util.stream.Collectors;
+
+public class CalculateAverage_faridtmammadov {
+    private static final String FILE = "./measurements.txt";
+
+    public static void main(String[] args) throws IOException {
+        int availableProcessors = Runtime.getRuntime().availableProcessors();
+
+        var map = getSegments(availableProcessors).stream()
+                .map(CalculateAverage_faridtmammadov::aggregate).parallel()
+                .flatMap(f -> f.entrySet().stream())
+                .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue, Aggregate::update, TreeMap::new));
+
+        printFormatted(map);
+    }
+
+    private static List<MemorySegment> getSegments(int numberOfChunks) throws IOException {
+        try (var fileChannel = FileChannel.open(Path.of(FILE), StandardOpenOption.READ)) {
+            var fileSize = fileChannel.size();
+            var segmentSize = fileSize / numberOfChunks;
+            var segment = fileChannel.map(FileChannel.MapMode.READ_ONLY, 0, fileSize, Arena.global());
+            var baseAddress = segment.address();
+            var endAddress = baseAddress + fileSize;
+            var segments = new ArrayList<MemorySegment>();
+            var startAddress = baseAddress;
+
+            for (var i = 0; i < numberOfChunks; i++) {
+                var pointer = startAddress + segmentSize;
+                while (pointer < endAddress) {
+                    long offset = pointer - baseAddress;
+                    byte b = segment.get(ValueLayout.JAVA_BYTE, offset);
+                    if (b == '\n') {
+                        break;
+                    }
+                    pointer++;
+                }
+                if (pointer >= endAddress) {
+                    var offsetStart = startAddress - baseAddress;
+                    var offsetEnd = endAddress - baseAddress - offsetStart;
+                    segments.add(segment.asSlice(offsetStart, offsetEnd));
+                    break;
+                }
+                var offsetStart = startAddress - baseAddress;
+                var offsetEnd = pointer - baseAddress - offsetStart;
+                segments.add(segment.asSlice(offsetStart, offsetEnd));
+                startAddress = pointer + 1;
+            }
+
+            return segments;
+        }
+    }
+
+    private static Map<String, Aggregate> aggregate(MemorySegment segment) {
+        var map = new HashMap<String, Aggregate>();
+        var iterator = new MemorySegmentIterator(segment);
+
+        while (iterator.hasNext()) {
+            String city = parseCity(iterator);
+            long temperature = parseTemperature(iterator);
+
+            map.compute(city, (key, value) -> {
+                if (value == null) {
+                    return new Aggregate(temperature);
+                }
+                else {
+                    return value.update(temperature);
+                }
+            });
+        }
+
+        return map;
+    }
+
+    private static String parseCity(MemorySegmentIterator iterator) {
+        var byteStream = new ByteArrayOutputStream();
+        while (iterator.hasNext()) {
+            var b = iterator.getNextByte();
+            if (b == ';') {
+                return byteStream.toString(StandardCharsets.UTF_8);
+            }
+            byteStream.write(b);
+        }
+
+        return null;
+    }
+
+    public static long parseTemperature(MemorySegmentIterator iterator) {
+        long value = 0L;
+        int sign = 1;
+        while (iterator.hasNext()) {
+            byte b = iterator.getNextByte();
+            if (b >= '0' && b <= '9') {
+                value = value * 10 + b - '0';
+            }
+            else if (b == '\n') {
+                return value * sign;
+            }
+            else if (b == '-') {
+                sign = -1;
+            }
+        }
+
+        return value * sign;
+    }
+
+    private static void printFormatted(Map<String, Aggregate> map) {
+        var iterator = map.entrySet().iterator();
+        var length = map.entrySet().size();
+        System.out.print("{");
+        for (int i = 0; i < length - 1; i++) {
+            var entry = iterator.next();
+            System.out.printf("%s=%s, ", entry.getKey(), entry.getValue().toString());
+        }
+        var lastEntry = iterator.next();
+        System.out.printf("%s=%s}\n", lastEntry.getKey(), lastEntry.getValue().toString());
+    }
+
+    static class Aggregate {
+        long min;
+        long max;
+        long sum;
+        int count;
+
+        public Aggregate(long temperature) {
+            min = temperature;
+            max = temperature;
+            sum = temperature;
+            count = 1;
+        }
+
+        public Aggregate update(long temp) {
+            min = Math.min(min, temp);
+            max = Math.max(max, temp);
+            sum += temp;
+            count++;
+            return this;
+        }
+
+        public Aggregate update(Aggregate agg) {
+            min = Math.min(min, agg.min);
+            max = Math.max(max, agg.max);
+            sum += agg.sum;
+            count += agg.count;
+            return this;
+        }
+
+        public String toString() {
+            return String.format("%s/%s/%s", min / 10.0f, Math.round(sum * 1.0f / count) / 10.0f, max / 10.0f);
+        }
+    }
+
+    static class MemorySegmentIterator {
+        private long offset;
+        private final MemorySegment segment;
+        private final long segmentSize;
+
+        public MemorySegmentIterator(MemorySegment segment) {
+            this.segment = segment;
+            this.segmentSize = segment.byteSize();
+        }
+
+        public boolean hasNext() {
+            return offset < segmentSize;
+        }
+
+        public byte getNextByte() {
+            var b = segment.get(ValueLayout.JAVA_BYTE, offset);
+            offset++;
+            return b;
+        }
+    }
+}
\ No newline at end of file

From 116f623f3e0d6a252c4f4a5673ba4f8ae4180f1a Mon Sep 17 00:00:00 2001
From: Gunnar Morling <gunnar.morling@googlemail.com>
Date: Mon, 15 Jan 2024 21:07:30 +0100
Subject: [PATCH 033/268] Leaderboard update

---
 README.md | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 88c356e5e..a9e8a605e 100644
--- a/README.md
+++ b/README.md
@@ -41,9 +41,10 @@ These are the results from running all entries into the challenge on eight cores
 
 | # | Result (m:s.ms) | Implementation     | JDK | Submitter     | Notes     |
 |---|-----------------|--------------------|-----|---------------|-----------|
-| 1 | 00:02.552 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_thomaswue.java)| 21.0.1-graal | [Thomas Wuerthinger](https://github.com/thomaswue), [Quan Anh Mai](https://github.com/merykitty), [Alfonso² Peterssen](https://github.com/mukel) | GraalVM native binary |
-| 1 | 00:02.575 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_merykittyunsafe.java)| 21.0.1-open | [Quan Anh Mai](https://github.com/merykitty) | Quan Anh Mai's implementation, using `Unsafe` |
-| 3 | 00:02.855 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_royvanrijn.java)| 21.0.1-graal | [Roy van Rijn](https://github.com/royvanrijn) | GraalVM native binary |
+| 1* | 00:02.552 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_thomaswue.java)| 21.0.1-graal | [Thomas Wuerthinger](https://github.com/thomaswue), [Quan Anh Mai](https://github.com/merykitty), [Alfonso² Peterssen](https://github.com/mukel) | GraalVM native binary |
+| 1* | 00:02.575 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_merykittyunsafe.java)| 21.0.1-open | [Quan Anh Mai](https://github.com/merykitty) | Quan Anh Mai's implementation, using `Unsafe` |
+| 3 | 00:02.621 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java)| 21.0.1-graal | [Artsiom Korzun](https://github.com/artsiomkorzun) | GraalVM native binary |
+|   | 00:02.855 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_royvanrijn.java)| 21.0.1-graal | [Roy van Rijn](https://github.com/royvanrijn) | GraalVM native binary |
 |   | 00:02.871 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java)| 21.0.1-graal | [Artsiom Korzun](https://github.com/artsiomkorzun) |  |
 |   | 00:02.926 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java)| 21.0.1-open | [Van Phu DO](https://github.com/abeobk) |  |
 |   | 00:03.258 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_merykitty.java)| 21.0.1-open | [Quan Anh Mai](https://github.com/merykitty) |  |
@@ -69,7 +70,7 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:06.654 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jbachorik.java)| 21.0.1-graal | [Jaroslav Bachorik](https://github.com/jbachorik) |  |
 |   | 00:06.576 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_as-com.java)| 21.0.1-open | [Andrew Sun](https://github.com/as-com) |  |
 |   | 00:06.715 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_algirdasrascius.java)| 21.0.1-open | [Algirdas Raščius](https://github.com/algirdasrascius) |  |
-|   | 00:06.946 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jincongho.java)| 21.0.1-open | [Jin Cong Ho](https://github.com/jincongho) |  |
+|   | 00:06.911 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jincongho.java)| 21.0.1-open | [Jin Cong Ho](https://github.com/jincongho) |  |
 |   | 00:07.730 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jotschi.java)| 21.0.1-open | [Johannes Schüth](https://github.com/jotschi) |  |
 |   | 00:07.809 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_roman-r-m.java)| 21.0.1-graal | [Roman Musin](https://github.com/roman-r-m) |  |
 |   | 00:07.925 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ricardopieper.java)| 21.0.1-graal | [Ricardo Pieper](https://github.com/ricardopieper) |  |
@@ -112,6 +113,7 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:15.662 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_semotpan.java)| 21.0.1-open | [Serghei Motpan](https://github.com/semotpan) |  |
 |   | 00:17.490 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_kgeri.java)| 21.0.1-open | [Gergely Kiss](https://github.com/kgeri) |  |
 |   | 00:17.255 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_tkosachev.java)| 21.0.1-open | [tkosachev](https://github.com/tkosachev) |  |
+|   | 00:17.520 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_faridtmammadov.java)| 21.0.1-open | [Farid](https://github.com/faridtmammadov) |  |
 |   | 00:17.717 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_omarchenko4j.java)| 21.0.1-open | [Oleh Marchenko](https://github.com/omarchenko4j) |  |
 |   | 00:17.815 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_hallvard.java)| 21.0.1-open | [Hallvard Trætteberg](https://github.com/hallvard) |  |
 |   | 00:17.932 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_plbpietrz.java)| 21.0.1-open | [Bartłomiej Pietrzyk](https://github.com/plbpietrz) |  |
@@ -152,6 +154,8 @@ These are the results from running all entries into the challenge on eight cores
 |   | ---       | | | | |
 |   | 04:49.679 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_baseline.java) (Baseline) | 21.0.1-open | [Gunnar Morling](https://github.com/gunnarmorling) |  |
 
+\* These two entries have such a similar runtime (below the error margin I can reliably measure), that they share position #1 in the leaderboard.
+
 Note that I am not super-scientific in the way I'm running the contenders
 (see [Evaluating Results](#evaluating-results) for the details).
 This is not a high-fidelity micro-benchmark and there can be variations of ~ +-5% between runs.

From 073d3aecdfee7c11bc40e189defb7e2cd5d498cb Mon Sep 17 00:00:00 2001
From: Artsiom Korzun <72259616+artsiomkorzun@users.noreply.github.com>
Date: Mon, 15 Jan 2024 21:07:53 +0100
Subject: [PATCH 034/268] native version (#434)

---
 calculate_average_artsiomkorzun.sh            | 10 ++++--
 prepare_artsiomkorzun.sh                      |  5 +++
 .../CalculateAverage_artsiomkorzun.java       | 36 ++++++++++++-------
 3 files changed, 36 insertions(+), 15 deletions(-)

diff --git a/calculate_average_artsiomkorzun.sh b/calculate_average_artsiomkorzun.sh
index 96e3467d2..d9c18284e 100755
--- a/calculate_average_artsiomkorzun.sh
+++ b/calculate_average_artsiomkorzun.sh
@@ -15,5 +15,11 @@
 #  limitations under the License.
 #
 
-JAVA_OPTS="--enable-preview -Xmx128m -XX:+UseSerialGC -XX:-TieredCompilation"
-java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_artsiomkorzun
+if [ -f target/CalculateAverage_artsiomkorzun_image ]; then
+    echo "Picking up existing native image 'target/CalculateAverage_artsiomkorzun_image', delete the file to select JVM mode." 1>&2
+    target/CalculateAverage_artsiomkorzun_image
+else
+    JAVA_OPTS="--enable-preview -Xmx128m -XX:+UseSerialGC -XX:-TieredCompilation"
+    echo "Chosing to run the app in JVM mode as no native image was found, use prepare_artsiomkorzun.sh to generate." 1>&2
+    java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_artsiomkorzun
+fi
\ No newline at end of file
diff --git a/prepare_artsiomkorzun.sh b/prepare_artsiomkorzun.sh
index f83a3ff69..9ae693a79 100755
--- a/prepare_artsiomkorzun.sh
+++ b/prepare_artsiomkorzun.sh
@@ -17,3 +17,8 @@
 
 source "$HOME/.sdkman/bin/sdkman-init.sh"
 sdk use java 21.0.1-graal 1>&2
+
+if [ ! -f target/CalculateAverage_artsiomkorzun_image ]; then
+    NATIVE_IMAGE_OPTS="--gc=epsilon -O3 -march=native -R:MaxHeapSize=64m --enable-preview --initialize-at-build-time=dev.morling.onebrc.CalculateAverage_artsiomkorzun"
+    native-image $NATIVE_IMAGE_OPTS -cp target/average-1.0.0-SNAPSHOT.jar -o target/CalculateAverage_artsiomkorzun_image dev.morling.onebrc.CalculateAverage_artsiomkorzun
+fi
\ No newline at end of file
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java b/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java
index f92f41422..4ea9d5833 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java
@@ -31,12 +31,8 @@
 public class CalculateAverage_artsiomkorzun {
 
     private static final Path FILE = Path.of("./measurements.txt");
-    private static final MemorySegment MAPPED_FILE = map(FILE);
-
-    private static final int PARALLELISM = Runtime.getRuntime().availableProcessors();
-    private static final int SEGMENT_SIZE = 32 * 1024 * 1024;
-    private static final int SEGMENT_COUNT = (int) ((MAPPED_FILE.byteSize() + SEGMENT_SIZE - 1) / SEGMENT_SIZE);
-    private static final int SEGMENT_OVERLAP = 1024;
+    private static final long SEGMENT_SIZE = 32 * 1024 * 1024;
+    private static final long SEGMENT_OVERLAP = 1024;
     private static final long COMMA_PATTERN = 0x3B3B3B3B3B3B3B3BL;
     private static final long DOT_BITS = 0x10101000;
     private static final long MAGIC_MULTIPLIER = (100 * 0x1000000 + 10 * 0x10000 + 1);
@@ -66,12 +62,19 @@ public static void main(String[] args) throws Exception {
     }
 
     private static void execute() throws Exception {
+        MemorySegment fileMemory = map(FILE);
+        long fileAddress = fileMemory.address();
+        long fileSize = fileMemory.byteSize();
+        int segmentCount = (int) ((fileSize + SEGMENT_SIZE - 1) / SEGMENT_SIZE);
+
         AtomicInteger counter = new AtomicInteger();
         AtomicReference<Aggregates> result = new AtomicReference<>();
-        Aggregator[] aggregators = new Aggregator[PARALLELISM];
+
+        int parallelism = Runtime.getRuntime().availableProcessors();
+        Aggregator[] aggregators = new Aggregator[parallelism];
 
         for (int i = 0; i < aggregators.length; i++) {
-            aggregators[i] = new Aggregator(counter, result);
+            aggregators[i] = new Aggregator(counter, result, fileAddress, fileSize, segmentCount);
             aggregators[i].start();
         }
 
@@ -306,21 +309,28 @@ private static class Aggregator extends Thread {
 
         private final AtomicInteger counter;
         private final AtomicReference<Aggregates> result;
+        private final long fileAddress;
+        private final long fileSize;
+        private final int segmentCount;
 
-        public Aggregator(AtomicInteger counter, AtomicReference<Aggregates> result) {
+        public Aggregator(AtomicInteger counter, AtomicReference<Aggregates> result,
+                          long fileAddress, long fileSize, int segmentCount) {
             super("aggregator");
             this.counter = counter;
             this.result = result;
+            this.fileAddress = fileAddress;
+            this.fileSize = fileSize;
+            this.segmentCount = segmentCount;
         }
 
         @Override
         public void run() {
             Aggregates aggregates = new Aggregates();
 
-            for (int segment; (segment = counter.getAndIncrement()) < SEGMENT_COUNT;) {
-                long position = (long) SEGMENT_SIZE * segment;
-                int size = (int) Math.min(SEGMENT_SIZE + SEGMENT_OVERLAP, MAPPED_FILE.byteSize() - position);
-                long address = MAPPED_FILE.address() + position;
+            for (int segment; (segment = counter.getAndIncrement()) < segmentCount;) {
+                long position = SEGMENT_SIZE * segment;
+                long size = Math.min(SEGMENT_SIZE + SEGMENT_OVERLAP, fileSize - position);
+                long address = fileAddress + position;
                 long limit = address + Math.min(SEGMENT_SIZE, size - 1);
 
                 if (segment > 0) {

From f8874c3886a16a2403cc20ea717f4972debea0ec Mon Sep 17 00:00:00 2001
From: Gunnar Morling <gunnar.morling@googlemail.com>
Date: Mon, 15 Jan 2024 21:21:17 +0100
Subject: [PATCH 035/268] Leaderboard update

---
 README.md | 1 -
 1 file changed, 1 deletion(-)

diff --git a/README.md b/README.md
index a9e8a605e..17a98fb88 100644
--- a/README.md
+++ b/README.md
@@ -45,7 +45,6 @@ These are the results from running all entries into the challenge on eight cores
 | 1* | 00:02.575 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_merykittyunsafe.java)| 21.0.1-open | [Quan Anh Mai](https://github.com/merykitty) | Quan Anh Mai's implementation, using `Unsafe` |
 | 3 | 00:02.621 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java)| 21.0.1-graal | [Artsiom Korzun](https://github.com/artsiomkorzun) | GraalVM native binary |
 |   | 00:02.855 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_royvanrijn.java)| 21.0.1-graal | [Roy van Rijn](https://github.com/royvanrijn) | GraalVM native binary |
-|   | 00:02.871 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java)| 21.0.1-graal | [Artsiom Korzun](https://github.com/artsiomkorzun) |  |
 |   | 00:02.926 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java)| 21.0.1-open | [Van Phu DO](https://github.com/abeobk) |  |
 |   | 00:03.258 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_merykitty.java)| 21.0.1-open | [Quan Anh Mai](https://github.com/merykitty) |  |
 |   | 00:03.376 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_mtopolnik.java)| 21.0.1-graal | [Marko Topolnik](https://github.com/mtopolnik) |  |

From c080143ca8daee3d71ef098e6961eaf034c379d6 Mon Sep 17 00:00:00 2001
From: Artsiom Korzun <72259616+artsiomkorzun@users.noreply.github.com>
Date: Tue, 16 Jan 2024 21:54:42 +0100
Subject: [PATCH 036/268] fix masking (#442)

fix masking

fix masking
---
 .../dev/morling/onebrc/CalculateAverage_artsiomkorzun.java     | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java b/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java
index 4ea9d5833..c3c39ab3f 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java
@@ -431,7 +431,8 @@ private static long separator(long word) {
         }
 
         private static long mask(long word, long separator) {
-            return word & ((separator >>> 7) - 1) & 0x00FFFFFFFFFFFFFFL;
+            long mask = ((separator - 1) ^ separator) >>> 8;
+            return word & mask;
         }
 
         private static int length(long separator) {

From b1e6a120a47cb55e0cdadcae82e62e53a4bad232 Mon Sep 17 00:00:00 2001
From: Keshavram Kuduwa <131107576+kuduwa-keshavram@users.noreply.github.com>
Date: Wed, 17 Jan 2024 02:32:26 +0530
Subject: [PATCH 037/268] Optimised code to iterate over non-null measurements
 (#444)

Co-authored-by: Keshavram Kuduwa <keshavram.kuduwa@apptware.com>
---
 calculate_average_kuduwa-keshavram.sh         |   2 +-
 .../CalculateAverage_kuduwa_keshavram.java    | 215 ++++++++++--------
 2 files changed, 122 insertions(+), 95 deletions(-)

diff --git a/calculate_average_kuduwa-keshavram.sh b/calculate_average_kuduwa-keshavram.sh
index 904c8db88..33941d335 100755
--- a/calculate_average_kuduwa-keshavram.sh
+++ b/calculate_average_kuduwa-keshavram.sh
@@ -16,5 +16,5 @@
 #
 
 
-JAVA_OPTS=""
+JAVA_OPTS="--enable-preview"
 java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_kuduwa_keshavram
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_kuduwa_keshavram.java b/src/main/java/dev/morling/onebrc/CalculateAverage_kuduwa_keshavram.java
index c61116656..68ace02cf 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_kuduwa_keshavram.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_kuduwa_keshavram.java
@@ -17,75 +17,77 @@
 
 import java.io.File;
 import java.io.IOException;
-import java.io.RandomAccessFile;
-import java.nio.ByteOrder;
-import java.nio.MappedByteBuffer;
+import java.lang.foreign.Arena;
+import java.lang.reflect.Field;
 import java.nio.channels.FileChannel;
 import java.nio.channels.FileChannel.MapMode;
-import java.nio.file.Files;
-import java.nio.file.Path;
 import java.nio.file.StandardOpenOption;
-import java.util.Arrays;
-import java.util.List;
-import java.util.Objects;
+import java.util.Iterator;
+import java.util.Spliterator;
+import java.util.Spliterators;
 import java.util.TreeMap;
 import java.util.function.Function;
 import java.util.stream.Collectors;
 import java.util.stream.IntStream;
+import java.util.stream.Stream;
+import java.util.stream.StreamSupport;
+import sun.misc.Unsafe;
 
 public class CalculateAverage_kuduwa_keshavram {
 
     private static final String FILE = "./measurements.txt";
+    private static final Unsafe UNSAFE = initUnsafe();
+
+    private static Unsafe initUnsafe() {
+        try {
+            final Field theUnsafe = Unsafe.class.getDeclaredField("theUnsafe");
+            theUnsafe.setAccessible(true);
+            return (Unsafe) theUnsafe.get(Unsafe.class);
+        }
+        catch (NoSuchFieldException | IllegalAccessException e) {
+            throw new RuntimeException(e);
+        }
+    }
 
     public static void main(String[] args) throws IOException, InterruptedException {
-        TreeMap<String, Measurement> resultMap = getFileSegments(new File(FILE)).stream()
-                .parallel()
-                .map(
+        TreeMap<String, Measurement> resultMap = getFileSegments(new File(FILE))
+                .flatMap(
                         segment -> {
-                            final Measurement[][] measurements = new Measurement[1024 * 128][3];
-                            try (FileChannel fileChannel = (FileChannel) Files.newByteChannel(Path.of(FILE), StandardOpenOption.READ)) {
-                                MappedByteBuffer byteBuffer = fileChannel.map(
-                                        MapMode.READ_ONLY, segment.start, segment.end - segment.start);
-                                byteBuffer.order(ByteOrder.nativeOrder());
-                                while (byteBuffer.hasRemaining()) {
-                                    byte[] city = new byte[100];
-                                    byte b;
-                                    int hash = 0;
-                                    int i = 0;
-                                    while ((b = byteBuffer.get()) != 59) {
-                                        hash = 31 * hash + b;
-                                        city[i++] = b;
-                                    }
+                            Result result = new Result();
+                            while (segment.start < segment.end) {
+                                byte[] city = new byte[100];
+                                byte b;
+                                int hash = 0;
+                                int i = 0;
+                                while ((b = UNSAFE.getByte(segment.start++)) != 59) {
+                                    hash = 31 * hash + b;
+                                    city[i++] = b;
+                                }
 
-                                    byte[] newCity = new byte[i];
-                                    System.arraycopy(city, 0, newCity, 0, i);
-                                    int measurement = 0;
-                                    boolean negative = false;
-                                    while ((b = byteBuffer.get()) != 10) {
-                                        if (b == 45) {
-                                            negative = true;
-                                        }
-                                        else if (b == 46) {
-                                            // skip
-                                        }
-                                        else {
-                                            final int n = b - '0';
-                                            measurement = measurement * 10 + n;
-                                        }
+                                byte[] newCity = new byte[i];
+                                System.arraycopy(city, 0, newCity, 0, i);
+                                int measurement = 0;
+                                boolean negative = false;
+                                while ((b = UNSAFE.getByte(segment.start++)) != 10) {
+                                    if (b == 45) {
+                                        negative = true;
+                                    }
+                                    else if (b == 46) {
+                                        // skip
+                                    }
+                                    else {
+                                        final int n = b - '0';
+                                        measurement = measurement * 10 + n;
                                     }
-                                    putOrMerge(
-                                            measurements,
-                                            new Measurement(
-                                                    hash, newCity, negative ? measurement * -1 : measurement));
                                 }
+                                putOrMerge(
+                                        result,
+                                        new Measurement(hash, newCity, negative ? measurement * -1 : measurement));
                             }
-                            catch (IOException e) {
-                                throw new RuntimeException(e);
-                            }
-                            return measurements;
+                            Iterator<Measurement> iterator = getMeasurementIterator(result);
+                            return StreamSupport.stream(
+                                    Spliterators.spliteratorUnknownSize(iterator, Spliterator.NONNULL), true);
                         })
-                .flatMap(measurements -> Arrays.stream(measurements).flatMap(Arrays::stream))
-                .filter(Objects::nonNull)
                 .collect(
                         Collectors.toMap(
                                 measurement -> new String(measurement.city),
@@ -99,13 +101,48 @@ else if (b == 46) {
         System.out.println(resultMap);
     }
 
-    private static void putOrMerge(Measurement[][] measurements, Measurement measurement) {
-        int index = measurement.hash & (measurements.length - 1);
-        Measurement[] existing = measurements[index];
+    private static Iterator<Measurement> getMeasurementIterator(Result result) {
+        return new Iterator<>() {
+            final int uniqueIndex = result.uniqueIndex;
+            final int[] indexArray = result.indexArray;
+            final Measurement[][] measurements = result.measurements;
+
+            int i = 0;
+            int j = 0;
+
+            @Override
+            public boolean hasNext() {
+                return i < uniqueIndex;
+            }
+
+            @Override
+            public Measurement next() {
+                Measurement measurement = measurements[indexArray[i]][j++];
+                if (measurements[indexArray[i]][j] == null) {
+                    i++;
+                    j = 0;
+                }
+                return measurement;
+            }
+        };
+    }
+
+    static class Result {
+        final Measurement[][] measurements = new Measurement[1024 * 128][3];
+        final int[] indexArray = new int[10_000];
+        int uniqueIndex = 0;
+    }
+
+    private static void putOrMerge(Result result, Measurement measurement) {
+        int index = measurement.hash & (result.measurements.length - 1);
+        Measurement[] existing = result.measurements[index];
         for (int i = 0; i < existing.length; i++) {
             Measurement existingMeasurement = existing[i];
             if (existingMeasurement == null) {
-                measurements[index][i] = measurement;
+                result.measurements[index][i] = measurement;
+                if (i == 0) {
+                    result.indexArray[result.uniqueIndex++] = index;
+                }
                 return;
             }
             if (equals(existingMeasurement.city, measurement.city)) {
@@ -124,13 +161,20 @@ private static boolean equals(byte[] city1, byte[] city2) {
         return true;
     }
 
-    private record FileSegment(long start, long end) {
+    private static final class FileSegment {
+        long start;
+        long end;
+
+        private FileSegment(long start, long end) {
+            this.start = start;
+            this.end = end;
+        }
     }
 
     private static final class Measurement {
 
-        private int hash;
-        private byte[] city;
+        private final int hash;
+        private final byte[] city;
 
         int min;
         int max;
@@ -158,45 +202,28 @@ public String toString() {
         }
     }
 
-    private static List<FileSegment> getFileSegments(final File file) throws IOException {
+    private static Stream<FileSegment> getFileSegments(final File file) throws IOException {
         final int numberOfSegments = Runtime.getRuntime().availableProcessors() * 4;
-        final long fileSize = file.length();
-        final long segmentSize = fileSize / numberOfSegments;
-        if (segmentSize < 1000) {
-            return List.of(new FileSegment(0, fileSize));
-        }
-
-        try (RandomAccessFile randomAccessFile = new RandomAccessFile(file, "r")) {
-            int lastSegment = numberOfSegments - 1;
-            return IntStream.range(0, numberOfSegments)
-                    .mapToObj(
-                            i -> {
-                                long segStart = i * segmentSize;
-                                long segEnd = (i == lastSegment) ? fileSize : segStart + segmentSize;
-                                try {
-                                    segStart = findSegment(i, 0, randomAccessFile, segStart, segEnd);
-                                    segEnd = findSegment(i, lastSegment, randomAccessFile, segEnd, fileSize);
-                                }
-                                catch (IOException e) {
-                                    throw new RuntimeException(e);
-                                }
-                                return new FileSegment(segStart, segEnd);
-                            })
-                    .toList();
-        }
-    }
-
-    private static long findSegment(
-                                    final int i, final int skipSegment, RandomAccessFile raf, long location, final long fileSize)
-            throws IOException {
-        if (i != skipSegment) {
-            raf.seek(location);
-            while (location < fileSize) {
-                location++;
-                if (raf.read() == '\n')
-                    return location;
+        final long[] chunks = new long[numberOfSegments + 1];
+        try (var fileChannel = FileChannel.open(file.toPath(), StandardOpenOption.READ)) {
+            final long fileSize = fileChannel.size();
+            final long segmentSize = (fileSize + numberOfSegments - 1) / numberOfSegments;
+            final long mappedAddress = fileChannel.map(MapMode.READ_ONLY, 0, fileSize, Arena.global()).address();
+            chunks[0] = mappedAddress;
+            final long endAddress = mappedAddress + fileSize;
+            for (int i = 1; i < numberOfSegments; ++i) {
+                long chunkAddress = mappedAddress + i * segmentSize;
+                // Align to first row start.
+                while (chunkAddress < endAddress && UNSAFE.getByte(chunkAddress++) != '\n') {
+                    // nop
+                }
+                chunks[i] = Math.min(chunkAddress, endAddress);
             }
+            chunks[numberOfSegments] = endAddress;
         }
-        return location;
+        return IntStream.range(0, chunks.length - 1)
+                .mapToObj(chunkIndex -> new FileSegment(chunks[chunkIndex], chunks[chunkIndex + 1]))
+                .parallel();
     }
+
 }

From 7bd2df7c590773a497e1c67f86b8f7c91173e657 Mon Sep 17 00:00:00 2001
From: Arman Sharif <armandino@gmail.com>
Date: Tue, 16 Jan 2024 13:04:37 -0800
Subject: [PATCH 038/268] armandino: second attempt (#445)

---
 calculate_average_armandino.sh                |   2 +-
 .../onebrc/CalculateAverage_armandino.java    | 377 +++++++++++-------
 2 files changed, 225 insertions(+), 154 deletions(-)

diff --git a/calculate_average_armandino.sh b/calculate_average_armandino.sh
index 719953d52..6ac5c1654 100755
--- a/calculate_average_armandino.sh
+++ b/calculate_average_armandino.sh
@@ -16,5 +16,5 @@
 #
 
 
-JAVA_OPTS=""
+JAVA_OPTS="--enable-preview -da -dsa -Xms128m -Xmx128m -XX:+UnlockExperimentalVMOptions -XX:+UseEpsilonGC -XX:+AlwaysPreTouch"
 java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_armandino
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_armandino.java b/src/main/java/dev/morling/onebrc/CalculateAverage_armandino.java
index 21abbb193..dce3a3302 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_armandino.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_armandino.java
@@ -15,188 +15,143 @@
  */
 package dev.morling.onebrc;
 
+import sun.misc.Unsafe;
+
 import java.io.IOException;
 import java.io.PrintStream;
-import java.nio.ByteBuffer;
+import java.lang.foreign.Arena;
+import java.lang.reflect.Field;
 import java.nio.channels.FileChannel;
 import java.nio.file.Path;
 import java.nio.file.StandardOpenOption;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.Map;
-import java.util.concurrent.ConcurrentHashMap;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Objects;
+import java.util.TreeMap;
+import java.util.stream.Stream;
 
 import static java.nio.channels.FileChannel.MapMode.READ_ONLY;
 import static java.nio.charset.StandardCharsets.UTF_8;
+import static java.util.stream.Collectors.toMap;
 
 public class CalculateAverage_armandino {
 
-    private static final String FILE = "./measurements.txt";
+    private static final Path FILE = Path.of("./measurements.txt");
 
-    private static final int MAX_KEY_LENGTH = 100;
+    private static final int NUM_CHUNKS = Math.max(8, Runtime.getRuntime().availableProcessors());
+    private static final int INITIAL_MAP_CAPACITY = 8192;
     private static final byte SEMICOLON = 59;
     private static final byte NL = 10;
     private static final byte DOT = 46;
     private static final byte MINUS = 45;
+    private static final byte ZERO_DIGIT = 48;
+    private static final Unsafe UNSAFE = getUnsafe();
 
     public static void main(String[] args) throws Exception {
-        Aggregator aggregator = new Aggregator();
-        aggregator.process();
-        aggregator.printStats();
-    }
-
-    private static class Aggregator {
-
-        private final Map<Integer, Stats> map = new ConcurrentHashMap<>(2048);
-
-        private record Chunk(long start, long end) {
-        }
+        var channel = FileChannel.open(FILE, StandardOpenOption.READ);
 
-        void process() throws Exception {
-            var channel = FileChannel.open(Path.of(FILE), StandardOpenOption.READ);
-            final Chunk[] chunks = split(channel);
-            final Thread[] threads = new Thread[chunks.length];
+        var results = Arrays.stream(split(channel)).parallel()
+                .map(chunk -> new ChunkProcessor().process(chunk.start, chunk.end))
+                .flatMap(SimpleMap::stream)
+                .collect(toMap(Stats::getKey, s -> s, CalculateAverage_armandino::mergeStats, TreeMap::new));
 
-            for (int i = 0; i < chunks.length; i++) {
-                final Chunk chunk = chunks[i];
-
-                threads[i] = Thread.ofVirtual().start(() -> {
-                    try {
-                        var bb = channel.map(READ_ONLY, chunk.start, chunk.end - chunk.start);
-                        process(bb);
-                    }
-                    catch (IOException e) {
-                        throw new RuntimeException(e);
-                    }
-                });
-            }
-
-            for (Thread t : threads) {
-                t.join();
-            }
-        }
+        print(results.values());
+    }
 
-        private static Chunk[] split(final FileChannel channel) throws IOException {
-            final long fileSize = channel.size();
-            if (fileSize < 10000) {
-                return new Chunk[]{ new Chunk(0, fileSize) };
-            }
+    private static Stats mergeStats(final Stats x, final Stats y) {
+        x.min = Math.min(x.min, y.min);
+        x.max = Math.max(x.max, y.max);
+        x.count += y.count;
+        x.sum += y.sum;
+        return x;
+    }
 
-            final int numChunks = 8;
-            final long chunkSize = fileSize / numChunks;
-            final var chunks = new Chunk[numChunks];
+    private static class ChunkProcessor {
+        private final SimpleMap map = new SimpleMap(INITIAL_MAP_CAPACITY);
 
-            for (int i = 0; i < numChunks; i++) {
-                long start = 0;
-                long end = chunkSize;
+        private SimpleMap process(final long chunkStart, final long chunkEnd) {
+            long i = chunkStart;
+            while (i < chunkEnd) {
+                final long keyAddress = i;
+                int keyHash = 0;
+                int measurement = 0;
+                byte b;
 
-                if (i > 0) {
-                    start = chunks[i - 1].end + 1;
-                    end = Math.min(start + chunkSize, fileSize);
+                while ((b = UNSAFE.getByte(i++)) != SEMICOLON) {
+                    keyHash = 31 * keyHash + b;
                 }
 
-                end = end == fileSize ? end : seekNextNewline(channel, end);
-                chunks[i] = new Chunk(start, end);
-            }
-            return chunks;
-        }
+                final int keyLength = (int) (i - keyAddress - 1);
 
-        private static long seekNextNewline(final FileChannel channel, final long end) throws IOException {
-            var bb = ByteBuffer.allocate(MAX_KEY_LENGTH);
-            channel.position(end).read(bb);
-
-            for (int i = 0; i < bb.limit(); i++) {
-                if (bb.get(i) == NL) {
-                    return end + i;
-                }
-            }
-
-            throw new IllegalStateException("Couldn't find next newline");
-        }
-
-        private void process(final ByteBuffer bb) {
-            final var sample = new Sample();
-            var isKey = true;
-
-            for (long i = 0, sz = bb.limit(); i < sz; i++) {
-
-                final byte b = bb.get();
+                if ((b = UNSAFE.getByte(i++)) == MINUS) {
+                    while ((b = UNSAFE.getByte(i++)) != DOT) {
+                        measurement = measurement * 10 + b - ZERO_DIGIT;
+                    }
 
-                if (b == SEMICOLON) {
-                    isKey = false;
-                }
-                else if (b == NL) {
-                    isKey = true;
-                    addSample(sample);
-                    sample.reset();
-                }
-                else if (isKey) {
-                    sample.pushKey(b);
-                }
-                else if (b == DOT) {
-                    // skip
-                }
-                else if (b == MINUS) {
-                    sample.sign = -1;
+                    b = UNSAFE.getByte(i);
+                    measurement = measurement * 10 + b - ZERO_DIGIT;
+                    measurement = -measurement;
+                    i += 2;
                 }
                 else {
-                    sample.pushMeasurement(b);
-                }
-            }
-        }
+                    measurement = b - ZERO_DIGIT; // D1
+                    b = UNSAFE.getByte(i); // dot or D2
 
-        private void addSample(final Sample sample) {
-            final Stats stats = map.computeIfAbsent(sample.keyHash,
-                    k -> new Stats(new String(sample.keyBytes, 0, sample.keyLength, UTF_8)));
-
-            final var val = sample.getMeasurement();
-
-            if (val < stats.min)
-                stats.min = val;
-
-            if (val > stats.max)
-                stats.max = val;
-
-            stats.sum += val;
-            stats.count++;
-        }
-
-        void printStats() {
-            var sorted = new ArrayList<>(map.values());
-            Collections.sort(sorted);
-
-            int size = sorted.size();
-
-            System.out.print('{');
-
-            for (Stats stats : sorted) {
-                stats.print(System.out);
-                if (--size > 0) {
-                    System.out.print(", ");
+                    if (b == DOT) {
+                        measurement = measurement * 10 + UNSAFE.getByte(i + 1) - ZERO_DIGIT; // F
+                        i += 3;
+                    }
+                    else {
+                        measurement = measurement * 10 + b - ZERO_DIGIT; // D2
+                        measurement = measurement * 10 + UNSAFE.getByte(i + 2) - ZERO_DIGIT; // F
+                        i += 4; // skip NL
+                    }
                 }
+
+                final Stats stats = map.putStats(keyHash, keyAddress, keyLength);
+                stats.min = Math.min(stats.min, measurement);
+                stats.max = Math.max(stats.max, measurement);
+                stats.sum += measurement;
+                stats.count++;
             }
-            System.out.println('}');
+            return map;
         }
     }
 
     private static class Stats implements Comparable<Stats> {
-        private final String city;
+        private String key;
+        private final byte[] keyBytes;
+        private final int keyLength;
+        private final int keyHash;
         private int min = Integer.MAX_VALUE;
         private int max = Integer.MIN_VALUE;
-        private long sum;
         private int count;
+        private long sum;
+
+        private Stats(long keyAddress, int keyLength, int keyHash) {
+            this.keyLength = keyLength;
+            this.keyBytes = new byte[keyLength];
+            this.keyHash = keyHash;
+
+            for (int i = 0; i < keyLength; i++) {
+                keyBytes[i] = UNSAFE.getByte(keyAddress++);
+            }
+        }
 
-        private Stats(String city) {
-            this.city = city;
+        String getKey() {
+            if (key == null) {
+                key = new String(keyBytes, 0, keyLength, UTF_8);
+            }
+            return key;
         }
 
         @Override
         public int compareTo(final Stats o) {
-            return city.compareTo(o.city);
+            return getKey().compareTo(o.getKey());
         }
 
         void print(final PrintStream out) {
-            out.print(city);
+            out.print(key);
             out.print('=');
             out.print(round(min / 10f));
             out.print('/');
@@ -210,32 +165,148 @@ private static double round(double value) {
         }
     }
 
-    private static class Sample {
-        private final byte[] keyBytes = new byte[MAX_KEY_LENGTH];
-        private int keyLength;
-        private int keyHash;
-        private int measurement;
-        private int sign = 1;
+    private static void print(final Collection<Stats> sorted) {
+        // Writes "{a, b, ...}" and a line break to System.out, entries in iteration order.
+        String separator = ""; // nothing goes in front of the first entry
+        System.out.print('{');
+        for (final Stats entry : sorted) {
+            System.out.print(separator);
+            entry.print(System.out);
+            separator = ", ";
+        }
+        System.out.println('}');
+    }
 
-        void pushKey(byte b) {
-            keyBytes[keyLength++] = b;
-            keyHash = 31 * keyHash + b;
+    private static Chunk[] split(final FileChannel channel) throws IOException { // maps the whole file and cuts it into line-aligned address ranges
+        final long fileSize = channel.size();
+        long start = channel.map(READ_ONLY, 0, fileSize, Arena.global()).address();
+        final long endAddress = start + fileSize;
+        if (fileSize < 10000) { // small input: one chunk covering everything
+            return new Chunk[]{ new Chunk(start, endAddress) };
         }
 
-        void pushMeasurement(byte b) {
-            final int i = b - '0';
-            measurement = measurement * 10 + i;
+        final long chunkSize = fileSize / NUM_CHUNKS;
+        final var chunks = new Chunk[NUM_CHUNKS];
+        long end = start + chunkSize;
+
+        for (int i = 0; i < NUM_CHUNKS; i++) {
+            if (i > 0) {
+                start = chunks[i - 1].end; // continue right where the previous chunk stopped
+                end = Math.min(start + chunkSize, endAddress);
+            }
+            if (end < endAddress) {
+                while (end < endAddress && UNSAFE.getByte(end) != NL) { // never read at or beyond endAddress
+                    end++;
+                }
+                end = Math.min(end + 1, endAddress); // step over the NL; clamp when the data ends without one
+            }
+            chunks[i] = new Chunk(start, end);
         }
+        return chunks;
+    }
+
+    private record Chunk(long start, long end) { // pair of raw addresses; split() starts each chunk at the previous chunk's end
+    }
+
+    private static Unsafe getUnsafe() {
+        try { // reflective read of the static field "theUnsafe"; any failure is rethrown unchecked
+            final Field singleton = Unsafe.class.getDeclaredField("theUnsafe");
+            singleton.setAccessible(true);
+            return Unsafe.class.cast(singleton.get(null));
+        }
+        catch (Exception cause) {
+            throw new RuntimeException(cause);
+        }
+    }
+
+    private static class SimpleMap {
+        private Stats[] table;
 
-        int getMeasurement() {
-            return sign * measurement;
+        SimpleMap(int initialCapacity) {
+            table = new Stats[initialCapacity]; // slots are picked with (table.length - 1) & hash, so presumably a power of two is expected - confirm at the call site
+        }
+
+        Stream<Stats> stream() {
+            return Stream.of(table).filter(entry -> entry != null); // occupied slots only, in array order
+        }
+
+        private void resize() {
+            // Doubles the table and re-inserts every entry. The slot search mirrors putStats():
+            // from the home slot upwards to the end of the array, then from the home slot downwards.
+            final Stats[] grown = new Stats[table.length * 2];
+            final int mask = grown.length - 1;
+            for (final Stats entry : table) {
+                if (entry == null) {
+                    continue; // empty slot in the old table
+                }
+                final int home = mask & entry.keyHash;
+                int slot = home;
+                while (slot < grown.length && grown[slot] != null) {
+                    slot++;
+                }
+                if (slot == grown.length) {
+                    // nothing free at or above the home slot: walk downwards instead
+                    slot = home;
+                    while (slot >= 0 && grown[slot] != null) {
+                        slot--;
+                    }
+                }
+                if (slot < 0) {
+                    // shouldn't happen because put() is called after increasing size
+                    throw new IllegalStateException("table is full");
+                }
+                grown[slot] = entry;
+            }
+            // swap in the larger array
+            table = grown;
+        }
+
+        Stats putStats(final int keyHash, final long keyAddress, final int keyLength) {
+            // Returns the entry for the key, creating it in the first free slot met while probing.
+            // Probe order: home slot up to the end of the table, then home slot - 1 down to 0.
+            final int home = (table.length - 1) & keyHash;
+            int slot = home;
+            do {
+                final Stats candidate = table[slot];
+                if (candidate == null) {
+                    return createAt(table, keyAddress, keyLength, keyHash, slot);
+                }
+                if (candidate.keyHash == keyHash && keysEqual(candidate, keyAddress, keyLength)) {
+                    return candidate;
+                }
+            } while (++slot < table.length);
+
+            for (slot = home - 1; slot >= 0; slot--) {
+                final Stats candidate = table[slot];
+                if (candidate == null) {
+                    return createAt(table, keyAddress, keyLength, keyHash, slot);
+                }
+                if (candidate.keyHash == keyHash && keysEqual(candidate, keyAddress, keyLength)) {
+                    return candidate;
+                }
+            }
+
+            // Every slot is taken by another key: grow the table and probe again.
+            resize();
+            return putStats(keyHash, keyAddress, keyLength);
+        }
+
+        private boolean keysEqual(Stats stats, long keyAddress, final int keyLength) {
+            // True when the stored key has the same length and the same bytes as the
+            // keyLength bytes found at keyAddress; stops at the first differing byte.
+            boolean same = stats.keyLength == keyLength;
+            int offset = 0;
+            while (same && offset < keyLength) {
+                same = stats.keyBytes[offset] == UNSAFE.getByte(keyAddress + offset);
+                offset++;
+            }
+            return same;
         }
 
-        void reset() {
-            keyHash = 0;
-            keyLength = 0;
-            measurement = 0;
-            sign = 1;
+        private static Stats createAt(Stats[] table, long keyAddress, int keyLength, int key, int i) {
+            // Builds a fresh entry for the key (the `key` argument is passed on as the entry's
+            // hash) and stores it in slot i of the given table before handing it back.
+            return table[i] = new Stats(keyAddress, keyLength, key);
         }
     }
 }

From e4b717e1a4cb6ef9f2a6cbd2265a7411aa0f5ebf Mon Sep 17 00:00:00 2001
From: Anthony Goubard <anthony.goubard@japplis.com>
Date: Tue, 16 Jan 2024 22:10:38 +0100
Subject: [PATCH 039/268] Read file in multiple threads and String to Text
 (#427)

* - Read file in multiple threads if available: 17" -> 15" locally
- Changed String to BytesText with cache: 12" locally

* - Fixed bug
- BytesText to Text
- More checks when reading the file

* - Combining measurements should be thread safe
- More readability changes
---
 calculate_average_japplis.sh                  |   2 +-
 .../onebrc/CalculateAverage_japplis.java      | 258 ++++++++++++++----
 2 files changed, 199 insertions(+), 61 deletions(-)

diff --git a/calculate_average_japplis.sh b/calculate_average_japplis.sh
index 47ba3e7e0..38a59786f 100755
--- a/calculate_average_japplis.sh
+++ b/calculate_average_japplis.sh
@@ -15,5 +15,5 @@
 #  limitations under the License.
 #
 
-JAVA_OPTS="-Xmx2G"
+JAVA_OPTS=""
 java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_japplis $*
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_japplis.java b/src/main/java/dev/morling/onebrc/CalculateAverage_japplis.java
index fb386bfa5..36eb0172c 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_japplis.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_japplis.java
@@ -41,7 +41,9 @@
  * - Replaced compute lambda call with synchronized(city.intern()): 43" (due to intern())
  * - Removed BufferedInputStream and replaced Measurement with IntSummaryStatistics (thanks davecom): still 23" but cleaner code
  * - Execute same code on 1BRC server: 41"
- * - One HashMap per thread: 17" locally
+ * - One HashMap per thread: 17" locally (12" on 1BRC server)
+ * - Read file in multiple threads if available and
+ * - Changed String to (byte[]) Text with cache: 18" locally (but 8" -> 5" on laptop)
  *
  * @author Anthony Goubard - Japplis
  */
@@ -53,63 +55,112 @@ public class CalculateAverage_japplis {
 
     private int precision = -1;
     private int precisionLimitTenth;
-
-    private Map<String, IntSummaryStatistics> cityMeasurementMap = new ConcurrentHashMap<>();
+    private long fileSize;
+    private Map<Text, IntSummaryStatistics> cityMeasurementMap = new ConcurrentHashMap<>(10_000);
     private List<Byte> previousBlockLastLine = new ArrayList<>();
-
     private Semaphore readFileLock = new Semaphore(MAX_COMPUTE_THREADS);
+    private Queue<ByteArray> bufferPool = new ConcurrentLinkedQueue<>();
 
     private void parseTemperatures(File measurementsFile) throws Exception {
-        try (InputStream measurementsFileIS = new FileInputStream(measurementsFile)) {
-            int readCount = BUFFER_SIZE;
-            ExecutorService threadPool = Executors.newFixedThreadPool(MAX_COMPUTE_THREADS);
-            List<Future> parseBlockTasks = new ArrayList<>();
-            while (readCount > 0) {
-                byte[] buffer = new byte[BUFFER_SIZE];
-                readCount = measurementsFileIS.read(buffer);
-                if (readCount > 0) {
-                    readFileLock.acquire(); // Wait if all threads are busy
+        fileSize = measurementsFile.length();
+        int blockIndex = 0;
+        int totalBlocks = (int) (fileSize / BUFFER_SIZE) + 1;
+        ExecutorService threadPool = Executors.newFixedThreadPool(MAX_COMPUTE_THREADS);
+        List<Future> parseBlockTasks = new ArrayList<>();
 
-                    // Process the block in a thread while the main thread continues to read the file
-                    Future parseBlockTask = threadPool.submit(parseTemperaturesBlock(buffer, readCount));
+        while (blockIndex < totalBlocks) {
+            int availableReadThreads = Math.min(readFileLock.availablePermits(), totalBlocks - blockIndex);
+            if (availableReadThreads == 0) {
+                readFileLock.acquire(); // No need to loop in the 'while' if all threads are busy
+                readFileLock.release();
+            }
+            List<Future<ByteArray>> readBlockTasks = new ArrayList<>();
+            for (int i = 0; i < availableReadThreads; i++) {
+                readFileLock.acquire(); // Wait if all threads are busy
+                Callable<ByteArray> blockReader = readBlock(measurementsFile, blockIndex);
+                Future<ByteArray> readBlockTask = threadPool.submit(blockReader);
+                readBlockTasks.add(readBlockTask);
+                blockIndex++;
+            }
+            for (Future<ByteArray> readBlockTask : readBlockTasks) {
+                ByteArray buffer = readBlockTask.get();
+                if (buffer.array().length > 0) {
+                    int startIndex = handleSplitLine(buffer.array());
+                    readFileLock.acquire(); // Wait if all threads are busy
+                    Runnable blockParser = parseTemperaturesBlock(buffer, startIndex);
+                    Future parseBlockTask = threadPool.submit(blockParser);
                     parseBlockTasks.add(parseBlockTask);
                 }
             }
-            for (Future parseBlockTask : parseBlockTasks) // Wait for all tasks to finish
-                parseBlockTask.get();
-            threadPool.shutdownNow();
         }
+        for (Future parseBlockTask : parseBlockTasks) { // Wait for all tasks to finish
+            parseBlockTask.get();
+        }
+        threadPool.shutdownNow();
     }
 
-    private Runnable parseTemperaturesBlock(byte[] buffer, int readCount) {
-        int startIndex = handleSplitLine(buffer, readCount);
+    private Callable<ByteArray> readBlock(File measurementsFile, long blockIndex) {
+        // Task that loads block blockIndex (BUFFER_SIZE bytes, fewer for the final block) into a ByteArray;
+        // a block starting at or past fileSize yields an empty ByteArray. The permit the caller took from
+        // readFileLock before submitting is given back in the finally block, also when the read fails.
+        return () -> {
+            try {
+                long fileIndex = blockIndex * BUFFER_SIZE;
+                if (fileIndex >= fileSize) {
+                    return new ByteArray(0);
+                }
+                try (InputStream measurementsFileIS = new FileInputStream(measurementsFile)) {
+                    // skipNBytes keeps skipping until done and throws EOFException on a short file
+                    measurementsFileIS.skipNBytes(fileIndex);
+                    long bufferSize = Math.min(BUFFER_SIZE, fileSize - fileIndex);
+                    ByteArray buffer = bufferSize == BUFFER_SIZE ? bufferPool.poll() : new ByteArray((int) bufferSize);
+                    if (buffer == null) {
+                        buffer = new ByteArray(BUFFER_SIZE);
+                    }
+                    // readNBytes loops internally and returns fewer bytes only at end of stream
+                    int totalRead = measurementsFileIS.readNBytes(buffer.array(), 0, (int) bufferSize);
+                    if (totalRead < bufferSize) {
+                        throw new java.io.EOFException("Block " + blockIndex + ": expected " + bufferSize + " bytes, read " + totalRead);
+                    }
+                    return buffer;
+                }
+            }
+            finally {
+                readFileLock.release();
+            }
+        };
+    }
+
+    private Runnable parseTemperaturesBlock(ByteArray buffer, int startIndex) {
         Runnable countAverageRun = () -> {
             int bufferIndex = startIndex;
-            Map<String, IntSummaryStatistics> blockCityMeasurementMap = new HashMap<>();
+            Map<Text, IntSummaryStatistics> blockCityMeasurementMap = new HashMap<>(10_000);
+            Map<Integer, Text> textPool = new HashMap<>(10_000);
+            byte[] bufferArray = buffer.array();
             try {
-                while (bufferIndex < readCount) {
-                    bufferIndex = readNextLine(bufferIndex, buffer, blockCityMeasurementMap);
+                while (bufferIndex < bufferArray.length) {
+                    bufferIndex = readNextLine(bufferIndex, bufferArray, blockCityMeasurementMap, textPool);
                 }
             }
             catch (ArrayIndexOutOfBoundsException ex) {
                 // Done reading and parsing the buffer
             }
+            if (bufferArray.length == BUFFER_SIZE)
+                bufferPool.add(buffer);
             mergeBlockResults(blockCityMeasurementMap);
             readFileLock.release();
         };
         return countAverageRun;
     }
 
-    private int handleSplitLine(byte[] buffer, int readCount) {
+    private int handleSplitLine(byte[] buffer) {
         int bufferIndex = readFirstLines(buffer);
-        List<Byte> lastLine = new ArrayList<>(); // Store the last (partial) line of the block
-        int tailIndex = readCount;
-        if (tailIndex == buffer.length) {
-            byte car = buffer[--tailIndex];
-            while (car != '\n') {
-                lastLine.add(0, car);
-                car = buffer[--tailIndex];
-            }
+        List<Byte> lastLine = new ArrayList<>(100); // Store the last (partial) line of the block
+        int tailIndex = buffer.length;
+        byte car = buffer[--tailIndex];
+        while (car != '\n') {
+            lastLine.add(0, car);
+            car = buffer[--tailIndex];
         }
         if (previousBlockLastLine.isEmpty()) {
             previousBlockLastLine = lastLine;
@@ -132,7 +183,7 @@ private int readSplitLine(byte[] buffer) {
         for (int i = 0; i < splitLineBytes.length; i++) {
             splitLineBytes[i] = previousBlockLastLine.get(i);
         }
-        readNextLine(0, splitLineBytes, cityMeasurementMap);
+        readNextLine(0, splitLineBytes, cityMeasurementMap, new HashMap<>());
         return bufferIndex;
     }
 
@@ -148,8 +199,9 @@ private int readFirstLines(byte[] buffer) {
         int dotPos = bufferIndex;
         byte car = buffer[bufferIndex++];
         while (car != '\n') {
-            if (car == '.')
+            if (car == '.') {
                 dotPos = bufferIndex;
+            }
             car = buffer[bufferIndex++];
         }
         precision = bufferIndex - dotPos - 1;
@@ -158,40 +210,47 @@ private int readFirstLines(byte[] buffer) {
         return startIndex;
     }
 
-    private int readNextLine(int bufferIndex, byte[] buffer, Map<String, IntSummaryStatistics> blockCityMeasurementMap) {
+    private int readNextLine(int bufferIndex, byte[] buffer, Map<Text, IntSummaryStatistics> blockCityMeasurementMap, Map<Integer, Text> textPool) {
         int startLineIndex = bufferIndex;
-        while (buffer[bufferIndex] != ';')
+        while (buffer[bufferIndex] != (byte) ';') {
             bufferIndex++;
-        String city = new String(buffer, startLineIndex, bufferIndex - startLineIndex, StandardCharsets.UTF_8);
+        }
+        // String city = new String(buffer, startLineIndex, bufferIndex - startLineIndex, StandardCharsets.UTF_8);
+        Text city = Text.getByteText(buffer, startLineIndex, bufferIndex - startLineIndex, textPool);
         bufferIndex++; // skip ';'
         int temperature = readTemperature(buffer, bufferIndex);
         bufferIndex += precision + 3; // digit, dot and CR
-        if (temperature < 0)
+        if (temperature < 0) {
             bufferIndex++;
-        if (temperature <= -precisionLimitTenth || temperature >= precisionLimitTenth)
+        }
+        if (temperature <= -precisionLimitTenth || temperature >= precisionLimitTenth) {
             bufferIndex++;
+        }
         addTemperature(city, temperature, blockCityMeasurementMap);
         return bufferIndex;
     }
 
-    private int readTemperature(byte[] text, int measurementIndex) {
-        boolean negative = text[measurementIndex] == '-';
-        if (negative)
-            measurementIndex++;
-        byte digitChar = text[measurementIndex++];
+    private int readTemperature(byte[] buffer, int bufferIndex) {
+        boolean negative = buffer[bufferIndex] == (byte) '-';
+        if (negative) {
+            bufferIndex++;
+        }
+        byte digit = buffer[bufferIndex++];
         int temperature = 0;
-        while (digitChar != '\n') {
-            temperature = temperature * 10 + (digitChar - '0');
-            digitChar = text[measurementIndex++];
-            if (digitChar == '.')
-                digitChar = text[measurementIndex++];
+        while (digit != (byte) '\n') {
+            temperature = temperature * 10 + (digit - (byte) '0');
+            digit = buffer[bufferIndex++];
+            if (digit == (byte) '.') { // Skip '.'
+                digit = buffer[bufferIndex++];
+            }
         }
-        if (negative)
+        if (negative) {
             temperature = -temperature;
+        }
         return temperature;
     }
 
-    private void addTemperature(String city, int temperature, Map<String, IntSummaryStatistics> blockCityMeasurementMap) {
+    private void addTemperature(Text city, int temperature, Map<Text, IntSummaryStatistics> blockCityMeasurementMap) {
         IntSummaryStatistics measurement = blockCityMeasurementMap.get(city);
         if (measurement == null) {
             measurement = new IntSummaryStatistics();
@@ -200,16 +259,20 @@ private void addTemperature(String city, int temperature, Map<String, IntSummary
         measurement.accept(temperature);
     }
 
-    private void mergeBlockResults(Map<String, IntSummaryStatistics> blockCityMeasurementMap) {
+    private void mergeBlockResults(Map<Text, IntSummaryStatistics> blockCityMeasurementMap) {
         blockCityMeasurementMap.forEach((city, measurement) -> {
-            IntSummaryStatistics oldMeasurement = cityMeasurementMap.putIfAbsent(city, measurement);
-            if (oldMeasurement != null)
-                oldMeasurement.combine(measurement);
+            cityMeasurementMap.compute(city, (town, currentMeasurement) -> {
+                if (currentMeasurement == null) {
+                    return measurement;
+                }
+                currentMeasurement.combine(measurement);
+                return currentMeasurement;
+            });
         });
     }
 
     private void printTemperatureStatsByCity() {
-        Set<String> sortedCities = new TreeSet<>(cityMeasurementMap.keySet());
+        Set<Text> sortedCities = new TreeSet<>(cityMeasurementMap.keySet());
         StringBuilder result = new StringBuilder(cityMeasurementMap.size() * 40);
         result.append('{');
         sortedCities.forEach(city -> {
@@ -217,7 +280,9 @@ private void printTemperatureStatsByCity() {
             result.append(city);
             result.append(getTemperatureStats(measurement));
         });
-        result.delete(result.length() - 2, result.length());
+        if (!sortedCities.isEmpty()) {
+            result.delete(result.length() - 2, result.length());
+        }
         result.append('}');
         String temperaturesByCity = result.toString();
         System.out.println(temperaturesByCity);
@@ -242,9 +307,10 @@ private void appendTemperature(StringBuilder resultBuilder, int temperature) {
         for (int i = temperatureAsText.length(); i < minCharacters; i++) {
             temperatureAsText = temperature < 0 ? "-0" + temperatureAsText.substring(1) : "0" + temperatureAsText;
         }
-        resultBuilder.append(temperatureAsText.substring(0, temperatureAsText.length() - precision));
+        int dotPosition = temperatureAsText.length() - precision;
+        resultBuilder.append(temperatureAsText.substring(0, dotPosition));
         resultBuilder.append('.');
-        resultBuilder.append(temperatureAsText.substring(temperatureAsText.length() - precision));
+        resultBuilder.append(temperatureAsText.substring(dotPosition));
     }
 
     public static final void main(String... args) throws Exception {
@@ -253,4 +319,76 @@ public static final void main(String... args) throws Exception {
         cityTemperaturesCalculator.parseTemperatures(new File(measurementFile));
         cityTemperaturesCalculator.printTemperatureStatsByCity();
     }
-}
+
+    /** Holder for one block's byte[]; BUFFER_SIZE-sized instances are recycled through bufferPool. */
+    private static final class ByteArray {
+        private final byte[] array; // allocated once in the constructor, never reassigned
+
+        private ByteArray(int size) {
+            array = new byte[size];
+        }
+
+        private byte[] array() {
+            return array;
+        }
+    }
+
+    private static class Text implements Comparable<Text> {
+        // City name held as raw bytes together with its precomputed hash.
+        private final byte[] textBytes;
+        private final int hash;
+        private String text; // UTF-8 decoded form, built on the first toString() call
+
+        private Text(byte[] buffer, int startIndex, int length, int hash) {
+            textBytes = new byte[length];
+            this.hash = hash;
+            System.arraycopy(buffer, startIndex, textBytes, 0, length);
+        }
+
+        private static Text getByteText(byte[] buffer, int startIndex, int length, Map<Integer, Text> textPool) {
+            int hash = hashCode(buffer, startIndex, length);
+            Text textFromPool = textPool.get(hash);
+            // The pool is keyed by hash only, so compare against the complete pooled key: bounding the pooled side by `length` threw on a shorter colliding key and accepted a longer one sharing this prefix.
+            if (textFromPool != null && Arrays.equals(buffer, startIndex, startIndex + length, textFromPool.textBytes, 0, textFromPool.textBytes.length)) {
+                return textFromPool;
+            }
+            Text newText = new Text(buffer, startIndex, length, hash);
+            textPool.put(hash, newText); // replaces a colliding entry, if there was one
+            return newText;
+        }
+
+        private static int hashCode(byte[] buffer, int startIndex, int length) {
+            int hash = 31;
+            for (int i = startIndex; i < startIndex + length; i++) {
+                hash = 31 * hash + buffer[i];
+            }
+            return hash;
+        }
+
+        @Override
+        public int hashCode() {
+            return hash;
+        }
+
+        @Override
+        public boolean equals(Object other) {
+            return other != null &&
+                    hashCode() == other.hashCode() &&
+                    other instanceof Text &&
+                    Arrays.equals(textBytes, ((Text) other).textBytes);
+        }
+
+        @Override
+        public int compareTo(Text other) {
+            return toString().compareTo(other.toString());
+        }
+
+        @Override
+        public String toString() {
+            if (text == null) {
+                text = new String(textBytes, StandardCharsets.UTF_8);
+            }
+            return text;
+        }
+    }
+}
\ No newline at end of file

From 576291611d94cfffb8e4bc89a0c426867d9ff491 Mon Sep 17 00:00:00 2001
From: adri <adria.cabezasantanna@datadoghq.com>
Date: Tue, 16 Jan 2024 22:23:35 +0100
Subject: [PATCH 040/268] Memory mapped buffers, ints instead of floats and
 epsilon GC (#451)

* Modify baseline version to improve performance

- Consume and process stream in parallel with memory map buffers, parsing it directly
- Use int instead of float/double to store values
- Use Epsilon GC and graal

* Update src/main/java/dev/morling/onebrc/CalculateAverage_adriacabeza.java

* Update calculate_average_adriacabeza.sh

---------

Co-authored-by: Gunnar Morling <gunnar.morling@googlemail.com>
---
 calculate_average_adriacabeza.sh              |  22 ++
 prepare_adriacabeza.sh                        |  19 ++
 .../onebrc/CalculateAverage_adriacabeza.java  | 223 ++++++++++++++++++
 3 files changed, 264 insertions(+)
 create mode 100755 calculate_average_adriacabeza.sh
 create mode 100755 prepare_adriacabeza.sh
 create mode 100644 src/main/java/dev/morling/onebrc/CalculateAverage_adriacabeza.java

diff --git a/calculate_average_adriacabeza.sh b/calculate_average_adriacabeza.sh
new file mode 100755
index 000000000..984ab03d6
--- /dev/null
+++ b/calculate_average_adriacabeza.sh
@@ -0,0 +1,22 @@
+#!/bin/sh
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+# sdk use java 21.0.1-graal 1>&2
+
+
+JAVA_OPTS="-XX:+UseStringDeduplication -XX:+UnlockExperimentalVMOptions -XX:+UseEpsilonGC" # NOTE(review): assigned but not referenced by the java command below - confirm whether that is intended
+java --enable-preview  -classpath target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_adriacabeza
+
diff --git a/prepare_adriacabeza.sh b/prepare_adriacabeza.sh
new file mode 100755
index 000000000..f83a3ff69
--- /dev/null
+++ b/prepare_adriacabeza.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+source "$HOME/.sdkman/bin/sdkman-init.sh" # run SDKMAN's init script in this shell before calling sdk
+sdk use java 21.0.1-graal 1>&2 # 1>&2 sends the command's stdout to stderr
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_adriacabeza.java b/src/main/java/dev/morling/onebrc/CalculateAverage_adriacabeza.java
new file mode 100644
index 000000000..56a2c4d7b
--- /dev/null
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_adriacabeza.java
@@ -0,0 +1,223 @@
+/*
+ *  Copyright 2023 The original authors
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package dev.morling.onebrc;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.MappedByteBuffer;
+import java.nio.channels.FileChannel;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.nio.file.StandardOpenOption;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
+
+/**
+ * This class calculates average measurements from a file in a parallelized manner.
+ */
+public class CalculateAverage_adriacabeza {
+
+    private static final Path FILE_PATH = Paths.get("./measurements.txt");
+    public static final int CITY_NAME_MAX_CHARACTERS = 128;
+
+    /**
+     * Accumulates measurements in a HashMap with city as key and StationData as value.
+     */
+    private static class Result {
+        private static class StationData {
+            private int min, count, max; // min/max hold values scaled by ten (toString divides by 10.0)
+            private long sum; // long, so that many readings cannot overflow the running total
+
+            public StationData(int value) {
+                this.count = 1;
+                this.sum = value;
+                this.min = value;
+                this.max = value;
+            }
+
+            public void update(int value) {
+                this.count++;
+                this.sum += value;
+                this.min = Math.min(this.min, value);
+                this.max = Math.max(this.max, value);
+            }
+
+            public String toString() {
+                return String.format(java.util.Locale.ROOT, "%.1f/%.1f/%.1f", min / 10.0, sum / 10.0 / count, max / 10.0); // Locale.ROOT: '.' as decimal separator on every machine
+            }
+        }
+        private final Map<String, StationData> resultMap;
+
+        public Result() {
+            this.resultMap = new HashMap<>();
+        }
+
+        public Map<String, StationData> getResultMap() {
+            return resultMap;
+        }
+
+        public void addMeasurement(String city, int value) {
+            resultMap.compute(city, (_, resultRow) -> {
+                if (resultRow == null) {
+                    return new StationData(value); // first reading for this city
+                } else {
+                    resultRow.update(value);
+                    return resultRow;
+                }
+            });
+        }
+
+        public void merge(Result other) {
+            other.getResultMap().forEach((city, resultRow) ->
+                    resultMap.merge(city, resultRow, (existing, incoming) -> { // the lambda only runs for cities present in both maps
+                        existing.min = Math.min(existing.min, incoming.min);
+                        existing.max = Math.max(existing.max, incoming.max);
+                        existing.sum += incoming.sum;
+                        existing.count += incoming.count;
+                        return existing;
+                    }));
+        }
+
+        public String toString() {
+            return this.resultMap.entrySet().stream()
+                    .sorted(Map.Entry.comparingByKey()) // natural String order of the city names
+                    .map(entry -> "%s=%s".formatted(entry.getKey(), entry.getValue()))
+                    .collect(Collectors.joining(", ", "{", "}"));
+        }
+    }
+
+    /**
+     * Scans forward from {@code position} for the next line feed and returns the offset right
+     * after it, i.e. the offset at which the following line begins.
+     *
+     * @param channel  file channel to read from
+     * @param position offset at which the scan starts
+     * @return offset right after the first line feed at or after {@code position}, or the channel
+     *         size when no line feed is found before the end of the file
+     * @throws IOException if an I/O error occurs
+     */
+    private static long findEndPosition(FileChannel channel, long position) throws IOException {
+        final ByteBuffer probe = ByteBuffer.allocate(1);
+
+        for (long offset = position; offset < channel.size(); offset++) {
+            // Positional read: fetch the single byte stored at offset into the probe buffer
+            channel.read(probe, offset);
+            if (probe.get(0) == '\n') {
+                return offset + 1;
+            }
+
+            probe.clear(); // make room for the next one-byte read
+        }
+
+        // Reached the end of the file without meeting a line feed
+        return channel.size();
+    }
+
+
+    /**
+     * Splits the file into {@code nProcessors} consecutive read-only mappings, each chunk end chosen by findEndPosition.
+     * NOTE(review): FileChannel.map rejects sizes above Integer.MAX_VALUE, so a chunk over 2 GiB would fail - confirm.
+     * @param nProcessors Number of chunks to map; the target chunk size is the file size divided by this, rounded up
+     * @return List of MappedByteBuffers in file order
+     * @throws IOException If an I/O error occurs
+     */
+    private static List<MappedByteBuffer> getMappedByteBuffers(int nProcessors) throws IOException {
+        try (FileChannel channel = FileChannel.open(FILE_PATH, StandardOpenOption.READ)) {
+            long fileSize = channel.size();
+            long chunkSize = (fileSize + nProcessors - 1) / nProcessors;
+            long pos = 0;
+
+            List<MappedByteBuffer> buffers = new ArrayList<>(nProcessors);
+            for (int i = 0; i < nProcessors; i++) {
+                long endPosition = findEndPosition(channel, pos + chunkSize);
+                long size = endPosition - pos;
+                MappedByteBuffer buffer = channel.map(FileChannel.MapMode.READ_ONLY, pos, size);
+                pos = pos + size;
+                buffers.add(buffer);
+            }
+            return buffers;
+        }
+    }
+
+    /**
+     * Parses every chunk in parallel into a partial Result and merges the partial results into one.
+     * @param chunks Mapped file regions; each is consumed from its current position up to its limit
+     * @return Result containing min/mean/max values for each city
+     */
+    private static Result calculateAverageMeasurements(List<MappedByteBuffer> chunks) {
+        // Each buffer is parsed by one parallel-stream task: name bytes up to ';', then the number up to '\n'
+        return chunks.parallelStream()
+                .map(buffer -> {
+                    Result partialResult = new Result();
+                    var limit = buffer.limit();
+                    var field = new byte[CITY_NAME_MAX_CHARACTERS];
+                    while (buffer.position() < limit) {
+                        var fieldCurrentIndex = 0;
+                        field[fieldCurrentIndex++] = buffer.get();
+                        while (buffer.position() < limit) {
+                            var fieldByte = buffer.get();
+                            if (fieldByte == ';')
+                                break;
+                            field[fieldCurrentIndex++] = fieldByte;
+                        }
+                        var fieldStr = new String(field, 0, fieldCurrentIndex);
+                        var number = 0;
+                        var sign = 1;
+                        while (buffer.position() < limit) {
+                            var numberByte = buffer.get();
+                            if (numberByte == '-')
+                                sign = -1;
+                            else if (numberByte == '\n')
+                                break;
+                            else if (numberByte != '.')
+                                number = number * 10 + (numberByte - '0');
+                        }
+                        partialResult.addMeasurement(fieldStr, sign * number);
+                    }
+                    return partialResult;
+                }).reduce(new Result(), (partialResult1, partialResult2) -> {
+                    Result result = new Result();
+                    result.merge(partialResult1);
+                    result.merge(partialResult2);
+                    return result;
+                });
+    }
+
+    /**
+     * The main method to run the average measurements calculations program.
+     *
+     * @param args Command line arguments. Not utilized in this program.
+     */
+    public static void main(String[] args) {
+        try {
+            // Get the MappedByteBuffers by splitting the file evenly across available processors
+            var buffers = getMappedByteBuffers(Runtime.getRuntime().availableProcessors());
+
+            // Calculate the average measurements from the buffers obtained
+            var measurements = calculateAverageMeasurements(buffers);
+
+            // Print the measurements result to the console.
+            System.out.println(measurements);
+
+        } catch (IOException e) {
+            // Handle any potential I/O exceptions by printing the error message to the console
+            System.err.println(STR."Error processing file: \{e.getMessage()}");
+        }
+    }
+}

From 1804fc5b5f48c20c26c6e12a7a34315796f2fae3 Mon Sep 17 00:00:00 2001
From: Van Phu DO <abeobk@gmail.com>
Date: Wed, 17 Jan 2024 06:31:00 +0900
Subject: [PATCH 041/268] Native build, less memory acess, improved hash mixing
 (#449)

---
 calculate_average_abeobk.sh                   |  11 +-
 prepare_abeobk.sh                             |  25 ++
 .../onebrc/CalculateAverage_abeobk.java       | 257 ++++++++++++------
 3 files changed, 213 insertions(+), 80 deletions(-)
 create mode 100755 prepare_abeobk.sh

diff --git a/calculate_average_abeobk.sh b/calculate_average_abeobk.sh
index a7b43d404..18c4c9448 100755
--- a/calculate_average_abeobk.sh
+++ b/calculate_average_abeobk.sh
@@ -15,5 +15,12 @@
 #  limitations under the License.
 #
 
-JAVA_OPTS="--enable-preview"
-java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_abeobk
+if [ -f target/CalculateAverage_abeobk_image ]; then
+    echo "Picking up existing native image 'target/CalculateAverage_abeobk_image', delete the file to select JVM mode." 1>&2
+    target/CalculateAverage_abeobk_image
+else
+    JAVA_OPTS="--enable-preview"
+    echo "Chosing to run the app in JVM mode as no native image was found, use prepare_abeobk.sh to generate." 1>&2
+    java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_abeobk
+fi
+
diff --git a/prepare_abeobk.sh b/prepare_abeobk.sh
new file mode 100755
index 000000000..bf2b7b51e
--- /dev/null
+++ b/prepare_abeobk.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+source "$HOME/.sdkman/bin/sdkman-init.sh"
+sdk use java 21.0.1-graal 1>&2
+
+# ./mvnw clean verify removes target/ and will re-trigger native image creation.
+if [ ! -f target/CalculateAverage_abeobk_image ]; then
+    NATIVE_IMAGE_OPTS="--gc=epsilon -O3 -march=native --enable-preview"
+    native-image $NATIVE_IMAGE_OPTS -cp target/average-1.0.0-SNAPSHOT.jar -o target/CalculateAverage_abeobk_image dev.morling.onebrc.CalculateAverage_abeobk
+fi
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java b/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java
index 34a5552a5..ec6c9e5ba 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java
@@ -24,11 +24,12 @@
 import java.nio.charset.StandardCharsets;
 import java.nio.file.Path;
 import java.nio.file.StandardOpenOption;
+import java.util.Arrays;
 import java.util.TreeMap;
 import sun.misc.Unsafe;
 
 public class CalculateAverage_abeobk {
-    private static final boolean SHOW_COLLISIONS = false;
+    private static final boolean SHOW_ANALYSIS = false;
 
     private static final String FILE = "./measurements.txt";
     private static final int BUCKET_SIZE = 1 << 16;
@@ -99,13 +100,13 @@ void merge(Node other) {
         boolean contentEquals(long other_addr, long other_tail) {
             if (tail != other_tail) // compare tail & length at the same time
                 return false;
-            long my_addr = addr;
-            int nl = (int) (tail >> 59);
-            for (int i = 0; i < nl; i++, my_addr += 8, other_addr += 8) {
-                if (UNSAFE.getLong(my_addr) != UNSAFE.getLong(other_addr))
-                    return false;
+            // OR together the XOR of each whole 8-byte word, no early exit; this is faster than comparison if key is short
+            long xsum = 0;
+            int n = ((int) (tail >>> 56)) & 0xF8;
+            for (int i = 0; i < n; i += 8) {
+                xsum |= (UNSAFE.getLong(addr + i) ^ UNSAFE.getLong(other_addr + i));
             }
-            return true;
+            return xsum == 0;
         }
     }
 
@@ -123,6 +124,7 @@ static long[] slice(long start_addr, long end_addr, long chunk_size, int cpu_cnt
         return ptrs;
     }
 
+    // idea from royvanrijn
     static final long getSemiPosCode(final long word) {
         long xor_semi = word ^ 0x3b3b3b3b3b3b3b3bL; // xor with ;;;;;;;;
         return (xor_semi - 0x0101010101010101L) & (~xor_semi & 0x8080808080808080L);
@@ -133,17 +135,164 @@ static final long getSemiPosCode(final long word) {
     // zero collision on test data
     static final int xxh32(long hash) {
         final int p1 = 0x85EBCA77; // prime
-        final int p2 = 0xC2B2AE3D; // prime
+        final int p2 = 0x165667B1; // prime
         int low = (int) hash;
-        int high = (int) (hash >>> 32);
-        low ^= low >> 15;
-        low *= p1;
-        high ^= high >> 13;
-        high *= p2;
-        var h = low ^ high;
+        int high = (int) (hash >>> 31);
+        int h = low + high;
+        h ^= h >> 15;
+        h *= p1;
+        h ^= h >> 13;
+        h *= p2;
+        h ^= h >> 11;
         return h;
     }
 
+    // great idea from merykitty (Quan Anh Mai)
+    static final int parseNum(long num_word, int dot_pos) {
+        int shift = 28 - dot_pos;
+        long signed = (~num_word << 59) >> 63;
+        long dsmask = ~(signed & 0xFF);
+        long digits = ((num_word & dsmask) << shift) & 0x0F000F0F00L;
+        long abs_val = ((digits * 0x640a0001) >>> 32) & 0x3FF;
+        return (int) ((abs_val ^ signed) - signed);
+    }
+
+    // Parses the rows in [start, end) into a linear-probing table of Nodes; optimized for the contest data
+    // save as much slow memory access as possible
+    // about 50% key < 8 chars, 25% key between 8-10 chars
+    // keylength histogram (%) = [0, 0, 0, 0, 4, 10, 21, 15, 13, 11, 6, 6, 4, 2...
+    static final Node[] parse(int thread_id, long start, long end, int[] cls) {
+        long addr = start;
+        var map = new Node[BUCKET_SIZE + 10000]; // extra space for collisions
+        // parse loop: one iteration consumes one "key;value\n" row
+        while (addr < end) {
+            long row_addr = addr;
+            long tail = 0;
+            long hash = 0;
+            int val = 0;
+            int bucket = 0;
+
+            long word = UNSAFE.getLong(addr);
+            long semipos_code = getSemiPosCode(word);
+
+            // about 50% chance key < 8 chars: the whole key and its length fit in tail, so tail alone identifies it
+            if (semipos_code != 0) {
+                int semi_pos = Long.numberOfTrailingZeros(semipos_code) >>> 3;
+                addr += semi_pos;
+                tail = (word & HASH_MASKS[semi_pos]);
+                bucket = xxh32(tail) & BUCKET_MASK;
+                long keylen = (addr - row_addr);
+                tail |= (keylen << 56);
+                long num_word = UNSAFE.getLong(++addr);
+                int dot_pos = Long.numberOfTrailingZeros(~num_word & 0x10101000);
+                val = parseNum(num_word, dot_pos);
+                addr += (dot_pos >>> 3) + 3;
+
+                while (true) {
+                    var node = map[bucket];
+                    if (node == null) {
+                        map[bucket] = new Node(row_addr, tail, val);
+                        break;
+                    }
+                    if (node.tail == tail) {
+                        node.add(val);
+                        break;
+                    }
+                    bucket++;
+                    if (SHOW_ANALYSIS)
+                        cls[thread_id]++;
+                }
+                continue;
+            }
+
+            hash ^= word;
+            addr += 8;
+            word = UNSAFE.getLong(addr);
+            semipos_code = getSemiPosCode(word);
+            // first byte is the semicolon (~13%): key is exactly 8 bytes, so tail holds only the length
+            if (semipos_code == 0x80) {
+                bucket = xxh32(hash) & BUCKET_MASK;
+                tail = 8L << 56;
+                long num_word = word >>> 8;
+                int dot_pos = Long.numberOfTrailingZeros(~num_word & 0x10101000);
+                val = parseNum(num_word, dot_pos);
+                addr += (dot_pos >>> 3) + 4;
+
+                while (true) {
+                    var node = map[bucket];
+                    if (node == null) {
+                        map[bucket] = new Node(row_addr, tail, val);
+                        break;
+                    }
+                    if (node.tail == tail && UNSAFE.getLong(node.addr) == UNSAFE.getLong(row_addr)) { // tail rules out longer keys
+                        node.add(val);
+                        break;
+                    }
+                    bucket++;
+                    if (SHOW_ANALYSIS)
+                        cls[thread_id]++;
+                }
+                continue;
+            }
+
+            while (semipos_code == 0) {
+                hash ^= word;
+                addr += 8;
+                word = UNSAFE.getLong(addr);
+                semipos_code = getSemiPosCode(word);
+            }
+
+            int semi_pos = Long.numberOfTrailingZeros(semipos_code) >>> 3;
+            addr += semi_pos;
+            tail = (word & HASH_MASKS[semi_pos]);
+            hash ^= tail;
+            bucket = xxh32(hash) & BUCKET_MASK;
+            long keylen = (addr - row_addr);
+            tail |= (keylen << 56);
+
+            ++addr;
+            long num_word = UNSAFE.getLong(addr);
+            int dot_pos = Long.numberOfTrailingZeros(~num_word & 0x10101000);
+            val = parseNum(num_word, dot_pos);
+            addr += (dot_pos >>> 3) + 3;
+
+            if (keylen < 16) {
+                while (true) {
+                    var node = map[bucket];
+                    if (node == null) {
+                        map[bucket] = new Node(row_addr, tail, val);
+                        break;
+                    }
+                    if (node.tail == tail && (UNSAFE.getLong(node.addr) == UNSAFE.getLong(row_addr))) {
+                        node.add(val);
+                        break;
+                    }
+                    bucket++;
+                    if (SHOW_ANALYSIS)
+                        cls[thread_id]++;
+                }
+                continue;
+            }
+
+            // longer key (16 bytes or more): tail plus word-by-word comparison via contentEquals
+            while (true) {
+                var node = map[bucket];
+                if (node == null) {
+                    map[bucket] = new Node(row_addr, tail, val);
+                    break;
+                }
+                if (node.contentEquals(row_addr, tail)) {
+                    node.add(val);
+                    break;
+                }
+                bucket++;
+                if (SHOW_ANALYSIS)
+                    cls[thread_id]++;
+            }
+        }
+        return map;
+    }
+
     public static void main(String[] args) throws InterruptedException, IOException {
         try (var file = FileChannel.open(Path.of(FILE), StandardOpenOption.READ)) {
             long start_addr = file.map(MapMode.READ_ONLY, 0, file.size(), Arena.global()).address();
@@ -158,71 +307,14 @@ public static void main(String[] args) throws InterruptedException, IOException
             var threads = new Thread[cpu_cnt];
             var maps = new Node[cpu_cnt][];
             var ptrs = slice(start_addr, end_addr, chunk_size, cpu_cnt);
-            int[] cls = new int[cpu_cnt];
+
+            int[] cls = new int[cpu_cnt]; // collision
+            int[] lenhist = new int[64]; // length histogram
 
             for (int i = 0; i < cpu_cnt; i++) {
                 int thread_id = i;
-                long start = ptrs[i];
-                long end = ptrs[i + 1];
-                maps[i] = new Node[BUCKET_SIZE + 10000]; // extra space for collisions
-
-                (threads[i] = new Thread(() -> {
-                    long addr = start;
-                    var map = maps[thread_id];
-                    // parse loop
-                    while (addr < end) {
-                        long hash = 0;
-                        long word = 0;
-                        long row_addr = addr;
-                        int semi_pos = 8;
-                        word = UNSAFE.getLong(addr);
-                        long semipos_code = getSemiPosCode(word);
-
-                        while (semipos_code == 0) {
-                            hash ^= word;
-                            addr += 8;
-                            word = UNSAFE.getLong(addr);
-                            semipos_code = getSemiPosCode(word);
-                        }
-
-                        semi_pos = Long.numberOfTrailingZeros(semipos_code) >>> 3;
-                        long tail = word & HASH_MASKS[semi_pos];
-                        hash ^= tail;
-                        addr += semi_pos;
-
-                        int hash32 = xxh32(hash);
-                        long keylen = (addr - row_addr);
-                        tail = tail | (keylen << 56);
-
-                        addr++;
-
-                        // great idea from merykitty (Quan Anh Mai)
-                        long num_word = UNSAFE.getLong(addr);
-                        int dot_pos = Long.numberOfTrailingZeros(~num_word & 0x10101000);
-                        addr += (dot_pos >>> 3) + 3;
-                        int shift = 28 - dot_pos;
-                        long signed = (~num_word << 59) >> 63;
-                        long dsmask = ~(signed & 0xFF);
-                        long digits = ((num_word & dsmask) << shift) & 0x0F000F0F00L;
-                        long abs_val = ((digits * 0x640a0001) >>> 32) & 0x3FF;
-                        int val = (int) ((abs_val ^ signed) - signed);
-
-                        int bucket = (hash32 & BUCKET_MASK);
-                        while (true) {
-                            var node = map[bucket];
-                            if (node == null) {
-                                map[bucket] = new Node(row_addr, tail, val);
-                                break;
-                            }
-                            if (node.contentEquals(row_addr, tail)) {
-                                node.add(val);
-                                break;
-                            }
-                            bucket++;
-                            if (SHOW_COLLISIONS)
-                                cls[thread_id]++;
-                        }
-                    }
+                (threads[thread_id] = new Thread(() -> {
+                    maps[thread_id] = parse(thread_id, ptrs[thread_id], ptrs[thread_id + 1], cls);
                 })).start();
             }
 
@@ -230,7 +322,7 @@ public static void main(String[] args) throws InterruptedException, IOException
             for (var thread : threads)
                 thread.join();
 
-            if (SHOW_COLLISIONS) {
+            if (SHOW_ANALYSIS) {
                 for (int i = 0; i < cpu_cnt; i++) {
                     System.out.println("thread-" + i + " collision = " + cls[i]);
                 }
@@ -242,13 +334,22 @@ public static void main(String[] args) throws InterruptedException, IOException
                 for (var node : map) {
                     if (node == null)
                         continue;
+                    if (SHOW_ANALYSIS) {
+                        int kl = (int) (node.tail >>> 56) & (lenhist.length - 1);
+                        lenhist[kl] += node.count;
+                    }
                     var stat = ms.putIfAbsent(node.key(), node);
                     if (stat != null)
                         stat.merge(node);
                 }
             }
 
-            if (!SHOW_COLLISIONS)
+            if (SHOW_ANALYSIS) {
+                System.out.println("total=" + Arrays.stream(lenhist).sum());
+                System.out.println("length_histogram = "
+                        + Arrays.toString(Arrays.stream(lenhist).map(x -> (int) (x * 1.0e-7)).toArray()));
+            }
+            else
                 System.out.println(ms);
         }
     }

From 7ed5e1b0d4885b04fe09728ac979171aef45fc91 Mon Sep 17 00:00:00 2001
From: Gunnar Morling <gunnar.morling@googlemail.com>
Date: Tue, 16 Jan 2024 22:31:34 +0100
Subject: [PATCH 042/268] Leaderboard, formatting

---
 README.md                                     | 11 +++++-----
 calculate_average_adriacabeza.sh              |  1 -
 .../onebrc/CalculateAverage_adriacabeza.java  | 20 +++++++++----------
 3 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/README.md b/README.md
index 17a98fb88..76f10337b 100644
--- a/README.md
+++ b/README.md
@@ -43,9 +43,9 @@ These are the results from running all entries into the challenge on eight cores
 |---|-----------------|--------------------|-----|---------------|-----------|
 | 1* | 00:02.552 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_thomaswue.java)| 21.0.1-graal | [Thomas Wuerthinger](https://github.com/thomaswue), [Quan Anh Mai](https://github.com/merykitty), [Alfonso² Peterssen](https://github.com/mukel) | GraalVM native binary |
 | 1* | 00:02.575 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_merykittyunsafe.java)| 21.0.1-open | [Quan Anh Mai](https://github.com/merykitty) | Quan Anh Mai's implementation, using `Unsafe` |
-| 3 | 00:02.621 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java)| 21.0.1-graal | [Artsiom Korzun](https://github.com/artsiomkorzun) | GraalVM native binary |
+| 3* | 00:02.602 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java)| 21.0.1-graal | [Artsiom Korzun](https://github.com/artsiomkorzun) | GraalVM native binary |
+|   | 00:02.692 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java)| 21.0.1-graal | [Van Phu DO](https://github.com/abeobk) | GraalVM native binary |
 |   | 00:02.855 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_royvanrijn.java)| 21.0.1-graal | [Roy van Rijn](https://github.com/royvanrijn) | GraalVM native binary |
-|   | 00:02.926 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java)| 21.0.1-open | [Van Phu DO](https://github.com/abeobk) |  |
 |   | 00:03.258 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_merykitty.java)| 21.0.1-open | [Quan Anh Mai](https://github.com/merykitty) |  |
 |   | 00:03.376 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_mtopolnik.java)| 21.0.1-graal | [Marko Topolnik](https://github.com/mtopolnik) |  |
 |   | 00:03.409 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jerrinot.java)| 21.0.1-graal | [Jaromir Hamala](https://github.com/jerrinot) |  |
@@ -66,6 +66,7 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:05.979 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_spullara.java)| 21.0.1-graal | [Sam Pullara](https://github.com/spullara) |  |
 |   | 00:06.166 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_isolgpus.java)| 21.0.1-open | [Jamie Stansfield](https://github.com/isolgpus) |  |
 |   | 00:06.257 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_flippingbits.java)| 21.0.1-graal | [Stefan Sprenger](https://github.com/flippingbits) |  |
+|   | 00:06.415 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_armandino.java)| 21.0.1-open | [Arman Sharif](https://github.com/armandino) |  |
 |   | 00:06.654 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jbachorik.java)| 21.0.1-graal | [Jaroslav Bachorik](https://github.com/jbachorik) |  |
 |   | 00:06.576 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_as-com.java)| 21.0.1-open | [Andrew Sun](https://github.com/as-com) |  |
 |   | 00:06.715 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_algirdasrascius.java)| 21.0.1-open | [Algirdas Raščius](https://github.com/algirdasrascius) |  |
@@ -79,20 +80,22 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:08.398 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_artpar.java)| 21.0.1-open | [Parth Mudgal](https://github.com/artpar) |  |
 |   | 00:08.489 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_gnabyl.java)| 21.0.1-graal | [Bang NGUYEN](https://github.com/gnabyl) |  |
 |   | 00:08.517 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ags313.java)| 21.0.1-graal | [ags](https://github.com/ags313) |  |
+|   | 00:08.622 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_kuduwa-keshavram.java)| 21.0.1-graal | [Keshavram Kuduwa](https://github.com/kuduwa-keshavram) |  |
 |   | 00:08.689 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_gamlerhart.java)| 21.0.1-open | [Roman Stoffel](https://github.com/gamlerhart) |  |
 |   | 00:08.752 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_anitasv.java)| 21.0.1-graal | [Anita SV](https://github.com/anitasv) |  |
 |   | 00:08.892 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_fatroom.java)| 21.0.1-open | [Roman Romanchuk](https://github.com/fatroom) |  |
 |   | 00:09.020 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_yemreinci.java)| 21.0.1-open | [yemreinci](https://github.com/yemreinci) |  |
 |   | 00:09.071 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_gabrielreid.java)| 21.0.1-open | [Gabriel Reid](https://github.com/gabrielreid) |  |
-|   | 00:09.117 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_kuduwa-keshavram.java)| 21.0.1-graal | [Keshavram Kuduwa](https://github.com/kuduwa-keshavram) |  |
 |   | 00:09.352 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_filiphr.java)| 21.0.1-graal | [Filip Hrisafov](https://github.com/filiphr) |  |
 |   | 00:09.867 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ricardopieper.java)| 21.0.1-graal | [Ricardo Pieper](https://github.com/ricardopieper) |  |
+|   | 00:09.945 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_japplis.java)| 21.0.1-open | [Anthony Goubard](https://github.com/japplis) |  |
 |   | 00:10.092 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_phd3.java)| 21.0.1-graal | [Pratham](https://github.com/phd3) |  |
 |   | 00:10.127 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_artpar.java)| 21.0.1-open | [Parth Mudgal](https://github.com/artpar) |  |
 |   | 00:10.553 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_C5H12O5.java)| 21.0.1-graal | [Xylitol](https://github.com/C5H12O5) |  |
 |   | 00:10.473 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_raipc.java)| 21.0.1-open | [Anton Rybochkin](https://github.com/raipc) |  |
 |   | 00:11.119 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_lawrey.java)| 21.0.1-open | [lawrey](https://github.com/lawrey) |  |
 |   | 00:11.167 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_palmr.java)| 21.0.1-open | [Nick Palmer](https://github.com/palmr) |  |
+|   | 00:11.230 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_adriacabeza.java)| 21.0.1-graal | [adri](https://github.com/adriacabeza) |  |
 |   | 00:11.405 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_imrafaelmerino.java)| 21.0.1-graal | [Rafael Merino García](https://github.com/imrafaelmerino) |  |
 |   | 00:11.433 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jatingala.java)| 21.0.1-graal | [Jatin Gala](https://github.com/jatingala) |  |
 |   | 00:11.805 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_coolmineman.java)| 21.0.1-graal | [Cool_Mineman](https://github.com/coolmineman) |  |
@@ -100,7 +103,6 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:12.051 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_dmitry-midokura.java)| 21.0.1-open | [Dmitry Bufistov](https://github.com/dmitry-midokura) |  |
 |   | 00:12.220 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_richardstartin.java)| 21.0.1-open | [Richard Startin](https://github.com/richardstartin) |  |
 |   | 00:12.495 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_SamuelYvon.java)| 21.0.1-graal | [Samuel Yvon](https://github.com/SamuelYvon) | GraalVM native binary |
-|   | 00:12.565 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_japplis.java)| 21.0.1-open | [Anthony Goubard](https://github.com/japplis) |  |
 |   | 00:12.568 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_MeanderingProgrammer.java)| 21.0.1-graal | [Vlad](https://github.com/MeanderingProgrammer) |  |
 |   | 00:13.013 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_thanhtrinity.java)| 21.0.1-graal | [Thanh Duong](https://github.com/thanhtrinity) |  |
 |   | 00:13.763 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ianopolous.java)| 21.0.1-open | [Dr Ian Preston](https://github.com/ianopolous) |  |
@@ -134,7 +136,6 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:38.340 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_AbstractKamen.java)| 21.0.1-open | [AbstractKamen](https://github.com/AbstractKamen) |  |
 |   | 00:41.982 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_criccomini.java)| 21.0.1-open | [Chris Riccomini](https://github.com/criccomini) |  |
 |   | 00:42.893 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_javamak.java)| 21.0.1-open | [javamak](https://github.com/javamak) |  |
-|   | 00:45.447 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_armandino.java)| 21.0.1-open | [Arman Sharif](https://github.com/armandino) |  |
 |   | 00:46.597 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_maeda6uiui.java)| 21.0.1-open | [Maeda-san](https://github.com/maeda6uiui) |  |
 |   | 00:58.811 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_Ujjwalbharti.java)| 21.0.1-open | [Ujjwal Bharti](https://github.com/Ujjwalbharti) |  |
 |   | 01:05.094 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_mudit-saxena.java)| 21.0.1-open | [Mudit Saxena](https://github.com/mudit-saxena) |  |
diff --git a/calculate_average_adriacabeza.sh b/calculate_average_adriacabeza.sh
index 984ab03d6..e2c655701 100755
--- a/calculate_average_adriacabeza.sh
+++ b/calculate_average_adriacabeza.sh
@@ -14,7 +14,6 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
-# sdk use java 21.0.1-graal 1>&2
 
 
 JAVA_OPTS="-XX:+UseStringDeduplication -XX:+UnlockExperimentalVMOptions -XX:+UseEpsilonGC"
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_adriacabeza.java b/src/main/java/dev/morling/onebrc/CalculateAverage_adriacabeza.java
index 56a2c4d7b..a1a6953dc 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_adriacabeza.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_adriacabeza.java
@@ -62,6 +62,7 @@ public String toString() {
             }
 
         }
+
         private final Map<String, StationData> resultMap;
 
         public Result() {
@@ -76,7 +77,8 @@ public void addMeasurement(String city, int value) {
             resultMap.compute(city, (_, resultRow) -> {
                 if (resultRow == null) {
                     return new StationData(value);
-                } else {
+                }
+                else {
                     resultRow.update(value);
                     return resultRow;
                 }
@@ -84,14 +86,13 @@ public void addMeasurement(String city, int value) {
         }
 
         public void merge(Result other) {
-            other.getResultMap().forEach((city, resultRow) ->
-                    resultMap.merge(city, resultRow, (existing, incoming) -> {
-                        existing.min = Math.min(existing.min, incoming.min);
-                        existing.max = Math.max(existing.max, incoming.max);
-                        existing.sum += incoming.sum;
-                        existing.count += incoming.count;
-                        return existing;
-                    }));
+            other.getResultMap().forEach((city, resultRow) -> resultMap.merge(city, resultRow, (existing, incoming) -> {
+                existing.min = Math.min(existing.min, incoming.min);
+                existing.max = Math.max(existing.max, incoming.max);
+                existing.sum += incoming.sum;
+                existing.count += incoming.count;
+                return existing;
+            }));
         }
 
         public String toString() {
@@ -129,7 +130,6 @@ private static long findEndPosition(FileChannel channel, long position) throws I
         return channel.size(); // Return the end of the file if no newline is found after the current position
     }
 
-
     /**
      * Gets the mapped byte buffers for parallel processing.
      *

From ffb09bf4bf0b41835b3340415be4f3c34565c126 Mon Sep 17 00:00:00 2001
From: Peter Levart <peter.levart@gmail.com>
Date: Tue, 16 Jan 2024 22:34:40 +0100
Subject: [PATCH 043/268] plevart: Look Mom No Unsafe! (#452)

---
 calculate_average_plevart.sh                  |  22 +
 prepare_plevart.sh                            |  19 +
 .../onebrc/CalculateAverage_plevart.java      | 405 ++++++++++++++++++
 3 files changed, 446 insertions(+)
 create mode 100755 calculate_average_plevart.sh
 create mode 100755 prepare_plevart.sh
 create mode 100644 src/main/java/dev/morling/onebrc/CalculateAverage_plevart.java

diff --git a/calculate_average_plevart.sh b/calculate_average_plevart.sh
new file mode 100755
index 000000000..be195ac08
--- /dev/null
+++ b/calculate_average_plevart.sh
@@ -0,0 +1,22 @@
+#!/bin/sh
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+JAVA_OPTS="--enable-preview --add-modules=jdk.incubator.vector" # the class imports java.lang.foreign and jdk.incubator.vector
+JAVA_OPTS="$JAVA_OPTS -XX:-TieredCompilation"
+JAVA_OPTS="$JAVA_OPTS -XX:InlineSmallCode=15000 -XX:FreqInlineSize=400 -XX:MaxInlineSize=400"
+#JAVA_OPTS="$JAVA_OPTS -XX:+PrintCompilation -XX:+UnlockDiagnosticVMOptions -XX:+PrintInlining"
+java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_plevart "$@" # JAVA_OPTS stays unquoted so it splits into flags; "$@" forwards arguments unchanged
diff --git a/prepare_plevart.sh b/prepare_plevart.sh
new file mode 100755
index 000000000..d2a3c6ba1
--- /dev/null
+++ b/prepare_plevart.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+source "$HOME/.sdkman/bin/sdkman-init.sh" # load SDKMAN so the sdk command is defined in this shell
+sdk use java 21.0.1-tem 1>&2 # select JDK 21.0.1-tem; sdk's stdout is redirected to stderr
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_plevart.java b/src/main/java/dev/morling/onebrc/CalculateAverage_plevart.java
new file mode 100644
index 000000000..fd42d454f
--- /dev/null
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_plevart.java
@@ -0,0 +1,405 @@
+/*
+ *  Copyright 2023 The original authors
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package dev.morling.onebrc;
+
+import jdk.incubator.vector.ByteVector;
+import jdk.incubator.vector.VectorOperators;
+
+import java.io.IOException;
+import java.lang.foreign.Arena;
+import java.lang.foreign.MemorySegment;
+import java.lang.foreign.ValueLayout;
+import java.nio.ByteOrder;
+import java.nio.channels.FileChannel;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.StandardOpenOption;
+import java.util.Comparator;
+import java.util.stream.Collectors;
+import java.util.stream.IntStream;
+import java.util.stream.Stream;
+
+public class CalculateAverage_plevart {
+    private static final Path FILE = Path.of("measurements.txt");
+
+    private static final int MAX_CITY_LEN = 100;
+    // 100 (city name) + 1 (;) + 5 (-99.9) + 1 (NL)
+    private static final int MAX_LINE_LEN = MAX_CITY_LEN + 7;
+
+    private static final int INITIAL_TABLE_CAPACITY = 8192;
+
+    public static void main(String[] args) throws IOException { // maps the whole file, aggregates one region per available processor, prints the merged table
+        var arena = Arena.global();
+        try (
+                var channel = (FileChannel) Files.newByteChannel(FILE, StandardOpenOption.READ)) {
+            var segment = channel.map(FileChannel.MapMode.READ_ONLY, 0, Files.size(FILE), arena);
+            int regions = Runtime.getRuntime().availableProcessors();
+            IntStream
+                    .range(0, regions)
+                    .parallel()
+                    .mapToObj(r -> calculateRegion(segment, regions, r)) // each region index fills its own StatsTable
+                    .reduce(StatsTable::reduce) // folds the right-hand table into the left-hand one and returns the latter
+                    .ifPresent(System.out::println); // println uses StatsTable.toString()
+            segment.unload();
+        }
+    }
+
+    private static StatsTable calculateRegion(MemorySegment segment, int regions, int r) { // aggregates slice r (0-based) of `regions` byte slices into a fresh StatsTable
+        long start = (segment.byteSize() * r) / regions;
+        long end = (segment.byteSize() * (r + 1)) / regions;
+        if (r > 0) { // all but the first slice: move start to just after the first newline at or after the raw boundary
+            start = skipPastNl(segment, start);
+        }
+        if (r + 1 < regions) { // all but the last slice: same adjustment for end, so it equals the next slice's start
+            end = skipPastNl(segment, end);
+        }
+
+        var stats = new StatsTable(segment, INITIAL_TABLE_CAPACITY);
+        calculateAdjustedRegion(segment, start, end, stats);
+        return stats;
+    }
+
+    private static long skipPastNl(MemorySegment segment, long i) { // returns the offset just after the first '\n' found at or after i
+        int skipped = 0;
+        while (skipped++ < MAX_LINE_LEN && getByte(segment, i++) != '\n') { // reads at most MAX_LINE_LEN bytes
+        }
+        if (skipped > MAX_LINE_LEN) { // only true when the loop stopped on the length limit rather than on a newline
+            throw new IllegalArgumentException(
+                    "Encountered line that exceeds " + MAX_LINE_LEN + " bytes at offset: " + i);
+        }
+        return i;
+    }
+
+    private static void calculateAdjustedRegion(MemorySegment segment, long start, long end, StatsTable stats) { // scans [start, end) for ';' and '\n' and aggregates each "city;number" line into stats
+        var species = ByteVector.SPECIES_PREFERRED;
+        long speciesByteSize = species.vectorByteSize();
+
+        long cityStart = start, numberStart = 0; // numberStart == 0 means the next delimiter is the ';' ending a city name
+        int cityLen = 0;
+
+        for (long i = start, j = i; i < end; j = i) { // i: next unread offset; j: offset that bit 0 of the current bitmap refers to
+            long semiNlSet; // bit k set <=> the k-th byte of this chunk is ';' or '\n'
+            if (end - i >= speciesByteSize) {
+                var vec = ByteVector.fromMemorySegment(species, segment, i, ByteOrder.nativeOrder());
+                semiNlSet = vec.compare(VectorOperators.EQ, (byte) ';')
+                        .or(vec.compare(VectorOperators.EQ, (byte) '\n'))
+                        .toLong();
+                i += speciesByteSize;
+            }
+            else { // tail, smaller than speciesByteSize
+                semiNlSet = 0;
+                long mask = 1;
+                while (i < end && mask != 0) {
+                    int c = getByte(segment, i++);
+                    if (c == '\n' || c == ';') {
+                        semiNlSet |= mask;
+                    }
+                    mask <<= 1;
+                }
+            }
+            // Drop the handled bit with two shifts: a long shift distance is masked to 6 bits, so a single shift by step + 1 == 64 would leave bit 63 set.
+            for (int step = Long.numberOfTrailingZeros(semiNlSet); step < 64; semiNlSet = (semiNlSet >>> step) >>> 1, step = Long.numberOfTrailingZeros(semiNlSet)) {
+                j += step; // j now points at the delimiter itself
+                if (numberStart == 0) { // semi
+                    cityLen = (int) (j - cityStart);
+                    numberStart = ++j;
+                }
+                else { // nl
+                    int numberLen = (int) (j - numberStart);
+                    calculateEntry(segment, cityStart, cityLen, numberStart, numberLen, stats);
+                    cityStart = ++j;
+                    numberStart = 0;
+                }
+            }
+        }
+    }
+
+    private static void calculateEntry(MemorySegment segment, long cityStart, int cityLen, long numberStart, int numberLen, StatsTable stats) { // hashes the city bytes, parses the number and records one sample
+        int hash = StatsTable.hash(segment, cityStart, cityLen);
+        int number = parseNumber(segment, numberStart, numberLen);
+        stats.aggregate(cityStart, cityLen, hash, 1, number, number, number); // count = 1; sum, min and max are all this one value
+    }
+
+    private static int parseNumber(MemorySegment segment, long off, int len) { // parses the len-byte decimal text at off into tenths, e.g. "-12.3" -> -123
+        int c0 = getByte(segment, off);
+        int d0;
+        int sign;
+        if (c0 == '-') {
+            off++; // skip the sign so off/len describe the unsigned digits only
+            len--;
+            d0 = getByte(segment, off) - '0';
+            sign = -1;
+        } else {
+            d0 = c0 - '0';
+            sign = 1;
+        }
+        return sign * switch (len) { // the unsigned length alone selects the digit layout; the '.' byte is never inspected
+            case 1 -> d0 * 10;                  // "9"    -> 90
+            case 2 -> {
+                int d1 = getByte(segment, off + 1) - '0';
+                yield d0 * 100 + d1 * 10;       // "99"   -> 990
+            }
+            case 3 -> {
+                int d2 = getByte(segment, off + 2) - '0';
+                yield d0 * 10 + d2;             // "9.9"  -> 99
+            }
+            case 4 -> {
+                int d1 = getByte(segment, off + 1) - '0';
+                int d3 = getByte(segment, off + 3) - '0';
+                yield d0 * 100 + d1 * 10 + d3;  // "99.9" -> 999
+            }
+            default -> {
+                throw new IllegalArgumentException("Invalid number: " + getString(segment, off, len)); // text is shown without a leading '-' because off/len were already advanced
+            }
+        };
+    }
+
+    private static int getByte(MemorySegment segment, long off) { // reads the byte at off; the signed byte is widened to int
+        return segment.get(ValueLayout.JAVA_BYTE, off);
+    }
+
+    private static String getString(MemorySegment segment, long off, int len) { // copies len bytes starting at off and decodes them as UTF-8
+        return new String(segment.asSlice(off, len).toArray(ValueLayout.JAVA_BYTE), StandardCharsets.UTF_8);
+    }
+
+    final static class StatsTable implements Cloneable {
+        private static final int LOAD_FACTOR = 16; // each new key adds this to loadedSize; the table grows once loadedSize >= pow2cap
+        // offsets of the per-slot fields inside table; slot i starts at idx(i) = i * 8 and uses these 6 of its 8 longs
+        private static final int _lenHash = 0,
+                _off = 1,
+                _count = 2,
+                _sum = 3,
+                _min = 4,
+                _max = 5;
+        private final MemorySegment segment; // key bytes are read from here; slots store only offset and length
+        private int pow2cap, loadedSize; // pow2cap: slot count, a power of two; loadedSize: inserted keys * LOAD_FACTOR
+        private long[] table;
+
+        StatsTable(MemorySegment segment, int capacity) { // creates an empty table whose slot count is capacity rounded up to a power of two
+            this.segment = segment;
+            int pow2cap = Integer.highestOneBit(capacity);
+            if (pow2cap < capacity) { // capacity was not itself a power of two: take the next one up
+                pow2cap <<= 1;
+            }
+            this.pow2cap = pow2cap;
+            this.table = new long[idx(pow2cap)];
+        }
+
+        private static int idx(int i) { // maps slot number i to the index of its first long in table (8 longs per slot)
+            return i << 3;
+        }
+
+        private static long lenHash(int len, int hash) { // packs len into the upper 32 bits and hash (as unsigned) into the lower 32 bits
+            return ((long) len << 32) | ((long) hash & 0x00000000FFFFFFFFL);
+        }
+
+        private static int len(long lenHash) { // extracts the length stored in the upper 32 bits
+            return (int) (lenHash >>> 32);
+        }
+
+        private static int hash(long lenHash) { // extracts the hash stored in the lower 32 bits
+            return (int) (lenHash & 0x00000000FFFFFFFFL);
+        }
+
+        private static final long[] LEN_LONG_MASK; // [len] keeps the first len bytes (in memory order) of a long read with JAVA_LONG_UNALIGNED
+        private static final int[] LEN_INT_MASK; // [len] keeps the first len bytes (in memory order) of a 4-byte int read
+
+        static {
+            LEN_LONG_MASK = new long[Long.BYTES + 1];
+            for (int len = 0; len <= Long.BYTES; len++) {
+                LEN_LONG_MASK[len] = len == 0 // len == 0 is special-cased: a shift by 64 is a no-op on long and would yield all ones
+                        ? 0L
+                        : ValueLayout.JAVA_LONG_UNALIGNED.order() == ByteOrder.LITTLE_ENDIAN
+                                ? -1L >>> ((Long.BYTES - len) * Byte.SIZE)
+                                : -1L << ((Long.BYTES - len) * Byte.SIZE);
+            }
+            LEN_INT_MASK = new int[Integer.BYTES + 1];
+            for (int len = 0; len <= Integer.BYTES; len++) {
+                LEN_INT_MASK[len] = len == 0 // same special case: a shift by 32 is a no-op on int
+                        ? 0
+                        : ValueLayout.JAVA_LONG_UNALIGNED.order() == ByteOrder.LITTLE_ENDIAN // NOTE(review): tests the long layout's order; presumably identical to JAVA_INT_UNALIGNED's - confirm
+                                ? -1 >>> ((Integer.BYTES - len) * Byte.SIZE)
+                                : -1 << ((Integer.BYTES - len) * Byte.SIZE);
+            }
+        }
+
+        static int hash(MemorySegment segment, long off, int len) { // hashes the len key bytes at off using at most two 4-byte reads
+            if (len > Integer.BYTES) {
+                int head = segment.get(ValueLayout.JAVA_INT_UNALIGNED, off); // first 4 key bytes
+                int tail = segment.get(ValueLayout.JAVA_INT_UNALIGNED, off + len - Integer.BYTES); // last 4 key bytes (overlaps head when len < 8)
+                return (head * 31) ^ tail;
+            }
+            else {
+                // assert len >= 0 && len <= 4;
+                // each city name starts at least 4 bytes before segment end
+                // assert off + Integer.BYTES <= segment.byteSize();
+                return segment.get(ValueLayout.JAVA_INT_UNALIGNED, off) & LEN_INT_MASK[len]; // reads 4 bytes, then masks off those beyond the key
+            }
+        }
+
+        static boolean equals(MemorySegment segment, long off1, long off2, int len) { // true when the len bytes at off1 and at off2 are identical
+            while (len >= Long.BYTES) { // compare 8 bytes at a time
+                if (segment.get(ValueLayout.JAVA_LONG_UNALIGNED, off1) != segment.get(ValueLayout.JAVA_LONG_UNALIGNED, off2)) {
+                    return false;
+                }
+                off1 += Long.BYTES;
+                off2 += Long.BYTES;
+                len -= Long.BYTES;
+            }
+            // still enough memory to compare two longs, but masked?
+            if (Math.max(off1, off2) + Long.BYTES <= segment.byteSize()) {
+                long mask = LEN_LONG_MASK[len]; // len is 0..7 here; the mask ignores the bytes past the key
+                return (segment.get(ValueLayout.JAVA_LONG_UNALIGNED, off1) & mask) == (segment.get(ValueLayout.JAVA_LONG_UNALIGNED, off2) & mask);
+            }
+            else { // a full 8-byte read would run past the segment end
+                return equalsAtBorder(segment, off1, off2, len);
+            }
+        }
+
+        private static boolean equalsAtBorder(MemorySegment segment, long off1, long off2, int len) { // compares the remaining len bytes with 4-byte reads; equals() calls it when an 8-byte read would overrun the segment
+            if (len > Integer.BYTES) { // compare the first 4 bytes exactly, leaving fewer than 4 for the masked read below
+                if (segment.get(ValueLayout.JAVA_INT_UNALIGNED, off1) != segment.get(ValueLayout.JAVA_INT_UNALIGNED, off2)) {
+                    return false;
+                }
+                len -= Integer.BYTES;
+                off1 += Integer.BYTES;
+                off2 += Integer.BYTES;
+            }
+            // assert len >= 0 && len <= 4;
+            // each city name starts at least 4 bytes before segment end
+            // assert Math.max(off1, off2) + Integer.BYTES <= segment.byteSize();
+            int mask = LEN_INT_MASK[len];
+            return (segment.get(ValueLayout.JAVA_INT_UNALIGNED, off1) & mask) == (segment.get(ValueLayout.JAVA_INT_UNALIGNED, off2) & mask);
+        }
+
+        void aggregate( // merges (count, sum, min, max) into the slot of the key at segment[off, off + len), inserting the key if it is absent
+                       // key
+                       long off, int len, int hash,
+                       // value
+                       long count, long sum, long min, long max) {
+            long lenHash = lenHash(len, hash);
+            int mask = pow2cap - 1;
+            for (int i = hash & mask, probe = 0; probe < pow2cap; i = (i + 1) & mask, probe++) { // linear probing, at most one full pass over the table
+                int idx = idx(i);
+                long lenHash_i = table[idx + _lenHash];
+                if (lenHash_i == 0) { // a zero lenHash marks an empty slot: insert the key here
+                    table[idx + _lenHash] = lenHash;
+                    table[idx + _off] = off;
+                    table[idx + _count] = count;
+                    table[idx + _sum] = sum;
+                    table[idx + _min] = min;
+                    table[idx + _max] = max;
+                    loadedSize += LOAD_FACTOR;
+                    if (loadedSize >= pow2cap) {
+                        grow();
+                    }
+                    return;
+                }
+                if (lenHash_i == lenHash && equals(segment, table[idx + _off], off, len)) { // same length and hash: confirm by comparing the key bytes
+                    table[idx + _count] += count;
+                    table[idx + _sum] += sum;
+                    table[idx + _min] = Math.min(min, table[idx + _min]);
+                    table[idx + _max] = Math.max(max, table[idx + _max]);
+                    return;
+                }
+            }
+            throw new OutOfMemoryError("StatsTable capacity exceeded due to poor hash"); // every slot was probed without finding the key or a free slot
+        }
+
+        private void grow() { // doubles the slot count and re-inserts every existing entry
+            if (idx(pow2cap) >= 0x4000_0000) { // the current array already has 2^30 longs; doubling is refused
+                throw new OutOfMemoryError("StatsTable capacity exceeded");
+            }
+            else {
+                var oldStats = clone(); // shallow copy that keeps the old table array and old pow2cap
+                pow2cap <<= 1;
+                table = new long[idx(pow2cap)];
+                loadedSize = 0;
+                reduce(oldStats); // aggregate() re-inserts each old slot into the new array
+            }
+        }
+
+        @Override
+        protected StatsTable clone() { // shallow copy via Object.clone(): the copy shares the segment and the table array
+            try {
+                return (StatsTable) super.clone();
+            }
+            catch (CloneNotSupportedException e) { // not expected, since the class implements Cloneable
+                throw new InternalError(e);
+            }
+        }
+
+        StatsTable reduce(StatsTable other) { // aggregates every occupied slot of other into this table and returns this
+            other
+                    .idxStream()
+                    .forEach(
+                            idx -> aggregate(
+                                    other.table[idx + _off], // offsets are resolved against this.segment - presumably both tables share one segment; confirm at call sites
+                                    len(other.table[idx + _lenHash]),
+                                    hash(other.table[idx + _lenHash]),
+                                    other.table[idx + _count],
+                                    other.table[idx + _sum],
+                                    other.table[idx + _min],
+                                    other.table[idx + _max]));
+            return this;
+        }
+
+        IntStream idxStream() { // streams the base table index of every occupied slot
+            return IntStream
+                    .range(0, pow2cap)
+                    .map(StatsTable::idx)
+                    .filter(idx -> table[idx + _lenHash] != 0); // a zero lenHash marks an empty slot
+        }
+
+        Stream<Entry> stream() { // builds one Entry per occupied slot, decoding the key bytes from the segment as UTF-8
+            return idxStream()
+                    .mapToObj(
+                            idx -> new Entry(
+                                    new String(
+                                            segment
+                                                    .asSlice(table[idx + _off], len(table[idx + _lenHash]))
+                                                    .toArray(ValueLayout.JAVA_BYTE),
+                                            StandardCharsets.UTF_8),
+                                    table[idx + _count],
+                                    table[idx + _sum],
+                                    table[idx + _min],
+                                    table[idx + _max]));
+        }
+
+        @Override
+        public String toString() { // renders all entries as "{a=..., b=...}", ordered by city name
+            return stream()
+                    .sorted(Comparator.comparing(StatsTable.Entry::city)) // String natural ordering
+                    .map(Entry::toString)
+                    .collect(Collectors.joining(", ", "{", "}"));
+        }
+
+        record Entry(String city, long count, long sum, long min, long max) { // one aggregated result row; toString() divides sum/min/max by 10 for display
+            double average() { // mean of the stored values, or 0 when count is not positive
+                return count > 0L ? (double) sum / (double) count : 0d;
+            }
+
+            @Override
+            public String toString() { // formats as "city=min/mean/max" with one fractional digit each
+                return String.format(java.util.Locale.ROOT, // fixed locale so the decimal separator is always '.', whatever the default locale is
+                    "%s=%.1f/%.1f/%.1f",
+                    city(), (double) min() / 10d, average() / 10d, (double) max() / 10d
+                );
+            }
+        }
+    }
+}
\ No newline at end of file

From 455b85c5af1cba9ddb6e6dc686c091d1e000a432 Mon Sep 17 00:00:00 2001
From: karthikeyan97 <skarthikeyan046@gmail.com>
Date: Wed, 17 Jan 2024 03:16:11 +0530
Subject: [PATCH 044/268] karthikeyan97 implementation (#417)

Co-authored-by: Karthikeyans <karthikeyan.sn@zohocorp.com>
---
 calculate_average_karthikeyan97.sh            |  19 +
 .../CalculateAverage_karthikeyan97.java       | 382 ++++++++++++++++++
 2 files changed, 401 insertions(+)
 create mode 100755 calculate_average_karthikeyan97.sh
 create mode 100644 src/main/java/dev/morling/onebrc/CalculateAverage_karthikeyan97.java

diff --git a/calculate_average_karthikeyan97.sh b/calculate_average_karthikeyan97.sh
new file mode 100755
index 000000000..a6bd728d1
--- /dev/null
+++ b/calculate_average_karthikeyan97.sh
@@ -0,0 +1,19 @@
+#!/bin/sh
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+JAVA_OPTS="-Xms20480m -Xmx40960m " # initial heap 20480 MB, maximum heap 40960 MB
+java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_karthikeyan97 # JAVA_OPTS is left unquoted so it splits into separate flags
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_karthikeyan97.java b/src/main/java/dev/morling/onebrc/CalculateAverage_karthikeyan97.java
new file mode 100644
index 000000000..c17e92797
--- /dev/null
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_karthikeyan97.java
@@ -0,0 +1,382 @@
+/*
+ *  Copyright 2023 The original authors
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package dev.morling.onebrc;
+
+import sun.misc.Unsafe;
+
+import static java.util.stream.Collectors.*;
+
+import java.io.FileInputStream;
+
+import java.io.RandomAccessFile;
+import java.lang.reflect.Field;
+import java.nio.ByteBuffer;
+import java.nio.channels.FileChannel;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.TreeMap;
+import java.util.function.BiConsumer;
+import java.util.function.BinaryOperator;
+import java.util.function.Function;
+import java.util.function.Supplier;
+import java.util.stream.Collector;
+import java.util.stream.Collectors;
+
+public class CalculateAverage_karthikeyan97 {
+
+    private static final String FILE = "./measurements.txt";
+
+    private record Measurement(modifiedbytearray station, double value) { // holds a station key together with one double value
+    }
+
+    private record customPair(String stationName, MeasurementAggregator agg) { // holds a station name together with its aggregator
+    }
+
+    private static class MeasurementAggregator { // running min / max / sum / count for one station
+        private double min = Double.POSITIVE_INFINITY; // start values chosen so that any real reading replaces them
+        private double max = Double.NEGATIVE_INFINITY;
+        private long sum;
+        private long count;
+
+        public String toString() { // renders "min/mean/max", each value passed through round()
+            return new StringBuilder(14) // method-local buffer, so the unsynchronized StringBuilder replaces StringBuffer
+                    .append(round(min))
+                    .append("/")
+                    .append(round((1.0 * sum) / count))
+                    .append("/")
+                    .append(round(max)).toString();
+        }
+
+        private double round(double value) { // rounds to the nearest long, then divides by 10 - presumably inputs are tenths; confirm against main
+            return Math.round(value) / 10.0;
+        }
+    }
+
+    public static void main(String[] args) throws Exception {
+        // long start = System.nanoTime();
+        System.setSecurityManager(null);
+        Collector<Map.Entry<modifiedbytearray, MeasurementAggregator>, MeasurementAggregator, MeasurementAggregator> collector = Collector.of(
+                MeasurementAggregator::new,
+                (a, m) -> {
+                    MeasurementAggregator agg = m.getValue();
+                    if (a.min >= agg.min) {
+                        a.min = agg.min;
+                    }
+                    if (a.max <= agg.max) {
+                        a.max = agg.max;
+                    }
+                    a.max = Math.max(a.max, m.getValue().max);
+                    a.sum += m.getValue().sum;
+                    a.count += m.getValue().count;
+                },
+                (agg1, agg2) -> {
+                    if (agg1.min <= agg2.min) {
+                        agg2.min = agg1.min;
+                    }
+                    if (agg1.max >= agg2.max) {
+                        agg2.max = agg1.max;
+                    }
+                    agg2.sum = agg1.sum + agg2.sum;
+                    agg2.count = agg1.count + agg2.count;
+
+                    return agg2;
+                },
+                agg -> agg);
+
+        RandomAccessFile raf = new RandomAccessFile(FILE, "rw");
+        long length = raf.length();
+        int cores = length > 1000 ? Runtime.getRuntime().availableProcessors() : 1;
+        long boundary[][] = new long[cores][2];
+        long segments = length / (cores);
+        long before = -1;
+        for (int i = 0; i < cores - 1; i++) {
+            boundary[i][0] = before + 1;
+            byte[] b = new byte[107];
+            if (before + segments - 107 > 0) {
+                raf.seek(before + segments - 107);
+            }
+            else {
+                raf.seek(0);
+            }
+            while (raf.read() != '\n') {
+            }
+            boundary[i][1] = raf.getChannel().position() - 1;
+            before = boundary[i][1];
+        }
+        boundary[cores - 1][0] = before + 1;
+        boundary[cores - 1][1] = length - 1;
+
+        Field f = Unsafe.class.getDeclaredField("theUnsafe");
+        f.setAccessible(true);
+        Unsafe unsafe = (Unsafe) f.get(null);
+
+        int pageSize = unsafe.pageSize() * 10;
+
+        System.out.println(new TreeMap((Arrays.stream(boundary).parallel().map(i -> {
+            FileInputStream fileInputStream = null;
+            try {
+                fileInputStream = new FileInputStream(FILE);
+                FileChannel fileChannel = fileInputStream.getChannel();
+                HashMap<modifiedbytearray, MeasurementAggregator> resultmap = new HashMap<>(12000, 100);
+
+                ByteBuffer buffer = ByteBuffer.allocateDirect(pageSize);
+
+                fileChannel.position(i[0]);
+                int bytesReading = 0;
+                double num = 0;
+                int sign = 1;
+                boolean isNumber = false;
+                byte bi;
+                modifiedbytearray stationName = null;
+                int hascode = 1;
+                int ctr = 0;
+                byte[] arr = new byte[100];
+                int arrptr = 0;
+                int seglen = (int) (i[1] - i[0] + 1);
+                while (bytesReading < seglen) {
+                    buffer.clear();
+                    int bytesRead = fileChannel.read(buffer);
+                    if ((bytesReading + bytesRead) <= seglen) {
+                        if (bytesRead < 0) {
+                            bytesRead = 0;
+                        }
+                    }
+                    else {
+                        bytesRead = (seglen - bytesReading);
+                    }
+                    buffer.flip();
+                    int bytesptr = 0;
+                    byte[] bufferArr = new byte[bytesRead];
+                    buffer.get(bufferArr);
+                    while (bytesptr < bytesRead) {
+                        bytesReading += 1;
+                        bi = bufferArr[bytesptr++];
+                        if (ctr > 0) {
+                            arr[arrptr++] = bi;
+                            hascode = 31 * hascode + bi;
+                            ctr--;
+                        }
+                        else {
+                            if (bi >= 240) {
+                                arr[arrptr++] = bi;
+                                hascode = 31 * hascode + bi;
+                                ctr = 3;
+                            }
+                            else if (bi >= 224) {
+                                arr[arrptr++] = bi;
+                                hascode = 31 * hascode + bi;
+                                ctr = 2;
+                            }
+                            else if (bi >= 192) {
+                                arr[arrptr++] = bi;
+                                hascode = 31 * hascode + bi;
+                                ctr = 1;
+                            }
+                            else if (bi == 59) {
+                                isNumber = true;
+                                stationName = new modifiedbytearray(arr, arrptr, hascode);
+                                arr = new byte[100];
+                                arrptr = 0;
+                                hascode = 1;
+                            }
+                            else if (bi == 10) {
+                                hascode = 1;
+                                isNumber = false;
+                                MeasurementAggregator agg = resultmap.get(stationName);
+                                num *= sign;
+                                if (agg == null) {
+                                    agg = new MeasurementAggregator();
+                                    agg.min = num;
+                                    agg.max = num;
+                                    agg.sum = (long) (num);
+                                    agg.count = 1;
+                                    resultmap.put(stationName, agg);
+                                }
+                                else {
+                                    if (agg.min >= num) {
+                                        agg.min = num;
+                                    }
+                                    if (agg.max <= num) {
+                                        agg.max = num;
+                                    }
+                                    agg.sum += (long) (num);
+                                    agg.count++;
+                                }
+                                num = 0;
+                                sign = 1;
+                            }
+                            else {
+                                hascode = 31 * hascode + bi;
+                                if (isNumber) {
+                                    switch (bi) {
+                                        case 0x2E:
+                                            break;
+                                        case 0x2D:
+                                            sign = -1;
+                                            break;
+                                        default:
+                                            num = num * 10 + (bi - 0x30);
+                                    }
+                                }
+                                else {
+                                    arr[arrptr++] = bi;
+                                }
+                            }
+                        }
+                    }
+                }
+                /*
+                 * while (bytesReading < (i[1] - i[0] + 1) && buffer.position() < buffer.limit()) {
+                 * buffer.clear();
+                 * bytesRead = fileChannel.read(buffer);
+                 * buffer.flip();
+                 * while (bytesReading <= (i[1] - i[0]) && buffer.position() < buffer.limit()) {
+                 * bytesReading += 1;
+                 * bi = buffer.get();
+                 * String s;
+                 * if (ctr > 0) {
+                 * hascode = 31 * hascode + bi;
+                 * ctr--;
+                 * }
+                 * else {
+                 * if (bi >= 240) {
+                 * ctr = 3;
+                 * }
+                 * else if (bi >= 224) {
+                 * ctr = 2;
+                 * }
+                 * else if (bi >= 192) {
+                 * ctr = 1;
+                 * }
+                 * else if (bi == 59) {
+                 * isNumber = true;
+                 * System.out.println(buffer);
+                 * stationName = new modifiedbytearray(bbstart, buffer.position() - 1, hascode, buffer);
+                 * hascode = 1;
+                 * bbstart = buffer.position();
+                 * }
+                 * else if (bi == 10) {
+                 * hascode = 1;
+                 * isNumber = false;
+                 * MeasurementAggregator agg = resultmap.get(stationName);
+                 * if (agg == null) {
+                 * agg = new MeasurementAggregator();
+                 * agg.min = num * sign;
+                 * agg.max = num * sign;
+                 * agg.sum = (long) (num * sign);
+                 * agg.count = 1;
+                 * resultmap.put(stationName, agg);
+                 * }
+                 * else {
+                 * agg.min = Math.min(agg.min, num * sign);
+                 * agg.max = Math.max(agg.max, num * sign);
+                 * agg.sum += (long) (num * sign);
+                 * agg.count++;
+                 * }
+                 * num = 1;
+                 * bbstart = buffer.position();
+                 * }
+                 * else {
+                 * hascode = 31 * hascode + bi;
+                 * if (isNumber) {
+                 * switch (bi) {
+                 * case 0x2E:
+                 * break;
+                 * case 0x2D:
+                 * num = num * -1;
+                 * break;
+                 * default:
+                 * num = num * 10 + (bi - 0x30);
+                 * }
+                 * }
+                 * }
+                 * }
+                 * }
+                 * }
+                 */
+                return resultmap;
+            }
+            catch (Exception e) {
+                e.printStackTrace();
+            }
+            return null;
+        }).flatMap(e -> e.entrySet().stream()).collect(groupingBy(e -> e.getKey(), collector)))) {
+            @Override
+            public Object put(Object key, Object value) {
+                return super.put(((modifiedbytearray) key).getStationName(), value);
+            }
+        });
+
+        /*
+         * .map(a -> {
+         * return a.stream().parallel().collect(groupingBy(m -> m.station(), collector));
+         * }).flatMap(m -> m.entrySet()
+         * .stream()
+         */
+        // Get the FileChannel from the FileInputStream
+
+        // System.out.println("time taken:" + (System.nanoTime() - start) / 1000000);
+        // System.out.println(measurements);
+    }
+
+}
+
+/**
+ * Hash-map key wrapping the first {@code length} bytes of a byte array that hold a
+ * UTF-8 encoded station name, together with a hash code supplied by the caller.
+ *
+ * <p>Equality is defined on the used prefix of the array only; the hash code is
+ * whatever was passed to the constructor, so callers must pass equal hashes for
+ * equal content to keep the {@code equals}/{@code hashCode} contract.
+ */
+class modifiedbytearray {
+    private final int length;
+    private final byte[] arr;
+    public int hashcode;
+
+    /**
+     * @param arr      backing array; it is stored as-is (not copied)
+     * @param length   number of leading bytes of {@code arr} that form the name
+     * @param hashcode precomputed hash of those bytes
+     */
+    modifiedbytearray(byte[] arr, int length, int hashcode) {
+        this.arr = arr;
+        this.length = length;
+        this.hashcode = hashcode;
+    }
+
+    /** Decodes the used prefix of the backing array as a UTF-8 string. */
+    public String getStationName() {
+        return new String(this.getArr(), 0, length, StandardCharsets.UTF_8);
+    }
+
+    /** Returns the backing array itself (not a copy); only the first {@code length} bytes are meaningful. */
+    public byte[] getArr() {
+        return this.arr;
+    }
+
+    @Override
+    public String toString() {
+        return getStationName();
+    }
+
+    /**
+     * Two keys are equal when their used prefixes contain the same bytes.
+     * Returns {@code false} (instead of throwing) for {@code null} and for
+     * objects of any other type, as required by {@link Object#equals(Object)}.
+     */
+    @Override
+    public boolean equals(Object obj) {
+        if (this == obj) {
+            return true;
+        }
+        if (!(obj instanceof modifiedbytearray)) {
+            return false;
+        }
+        modifiedbytearray b = (modifiedbytearray) obj;
+        return Arrays.equals(this.arr, 0, length, b.arr, 0, b.length);
+    }
+
+    /** Returns the hash code passed to the constructor. */
+    public int getHashcode() {
+        return hashcode;
+    }
+
+    @Override
+    public int hashCode() {
+        return hashcode;
+    }
+}

From 7f5f808176c13e080e50fb6649c24a9f0010f8cb Mon Sep 17 00:00:00 2001
From: gonix <d.giedrius+github@gmail.com>
Date: Tue, 16 Jan 2024 23:49:39 +0200
Subject: [PATCH 045/268] CalculateAverage_gonix initial attempt (#413)

---
 calculate_average_gonix.sh                    |  20 +
 .../onebrc/CalculateAverage_gonix.java        | 354 ++++++++++++++++++
 2 files changed, 374 insertions(+)
 create mode 100755 calculate_average_gonix.sh
 create mode 100644 src/main/java/dev/morling/onebrc/CalculateAverage_gonix.java

diff --git a/calculate_average_gonix.sh b/calculate_average_gonix.sh
new file mode 100755
index 000000000..a6f91655f
--- /dev/null
+++ b/calculate_average_gonix.sh
@@ -0,0 +1,20 @@
+#!/bin/sh
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+
+JAVA_OPTS="--enable-preview"
+java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_gonix
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_gonix.java b/src/main/java/dev/morling/onebrc/CalculateAverage_gonix.java
new file mode 100644
index 000000000..8349d00a7
--- /dev/null
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_gonix.java
@@ -0,0 +1,354 @@
+/*
+ *  Copyright 2023 The original authors
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package dev.morling.onebrc;
+
+import java.io.IOException;
+import java.io.RandomAccessFile;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.nio.MappedByteBuffer;
+import java.nio.channels.FileChannel;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.TreeMap;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+
+public class CalculateAverage_gonix {
+
+    private static final String FILE = "./measurements.txt";
+
+    public static void main(String[] args) throws IOException {
+
+        var file = new RandomAccessFile(FILE, "r");
+
+        var res = buildChunks(file).stream().parallel()
+                .flatMap(chunk -> new Aggregator().processChunk(chunk).stream())
+                .collect(Collectors.toMap(
+                        Aggregator.Entry::getKey,
+                        Aggregator.Entry::getValue,
+                        Aggregator.Entry::add,
+                        TreeMap::new));
+
+        System.out.println(res);
+    }
+
+    private static List<MappedByteBuffer> buildChunks(RandomAccessFile file) throws IOException {
+        var fileSize = file.length();
+        var chunkSize = Math.min(Integer.MAX_VALUE - 512, fileSize / Runtime.getRuntime().availableProcessors());
+        if (chunkSize <= 0) {
+            chunkSize = fileSize;
+        }
+        var chunks = new ArrayList<MappedByteBuffer>((int) (fileSize / chunkSize) + 1);
+        var start = 0L;
+        while (start < fileSize) {
+            var pos = start + chunkSize;
+            if (pos < fileSize) {
+                file.seek(pos);
+                while (file.read() != '\n') {
+                    pos += 1;
+                }
+                pos += 1;
+            }
+            else {
+                pos = fileSize;
+            }
+            var buf = file.getChannel().map(FileChannel.MapMode.READ_ONLY, start, pos - start);
+            buf.order(ByteOrder.nativeOrder());
+            chunks.add(buf);
+            start = pos;
+        }
+        return chunks;
+    }
+}
+
+/**
+ * Open-addressing aggregation table for one chunk of the measurements file.
+ *
+ * <p>Measurements are handled as integers in tenths (the decimal point is
+ * dropped while parsing). Each station occupies one record in {@code mem}:
+ * <pre>
+ *   mem[offset]                     key length in bytes
+ *   next (len >> 3) longs           full 8-byte words of the key
+ *   one more long                   remaining key bytes, zero-extended (0 if none)
+ *   four longs                      stats, addressed with the FLD_* constants
+ * </pre>
+ * {@code index} maps a hash slot to the record offset; offset 0 means "empty",
+ * which is why {@code memUsed} starts at 1.
+ *
+ * <p>All word-wise reads are done in little-endian order so that byte {@code k}
+ * of the input is {@code (word >>> 8 * k) & 0xFF} regardless of the platform.
+ */
+class Aggregator {
+    private static final int MAX_STATIONS = 10_000;
+    // Longs budgeted per station record in `mem` (presumably 100 chars x 4 bytes
+    // of key plus the length slot and stats fields - TODO confirm intent).
+    private static final int MAX_STATION_SIZE = (100 * 4) / 8 + 5;
+    private static final int INDEX_SIZE = 1024 * 1024;
+    private static final int INDEX_MASK = INDEX_SIZE - 1;
+    private static final int FLD_MAX = 0;
+    private static final int FLD_MIN = 1;
+    private static final int FLD_SUM = 2;
+    private static final int FLD_COUNT = 3;
+
+    // Poor man's hash map: hash code to offset in `mem`.
+    private final int[] index;
+
+    // Contiguous storage of key (station name) and stats fields of all
+    // unique stations.
+    // The idea here is to improve locality so that stats fields would
+    // possibly be already in the CPU cache after we are done comparing
+    // the key.
+    private final long[] mem;
+    private int memUsed;
+
+    Aggregator() {
+        assert ((INDEX_SIZE & (INDEX_SIZE - 1)) == 0) : "INDEX_SIZE must be power of 2";
+        assert (INDEX_SIZE > MAX_STATIONS) : "INDEX_SIZE must be greater than MAX_STATIONS";
+
+        index = new int[INDEX_SIZE];
+        mem = new long[1 + (MAX_STATIONS * MAX_STATION_SIZE)];
+        memUsed = 1;
+    }
+
+    /**
+     * Aggregates every line of a newline-aligned chunk into this table.
+     *
+     * <p>The byte order configured on {@code buf} is irrelevant: parsing is done
+     * on a little-endian duplicate, and {@code buf} itself is not modified.
+     *
+     * @param buf mapped chunk starting at a line start
+     * @return this aggregator, for chaining
+     */
+    Aggregator processChunk(MappedByteBuffer buf) {
+        // The parser picks bytes out of 8-byte words by shifting, which is only
+        // correct for little-endian reads; do not rely on the caller's setting.
+        ByteBuffer view = buf.duplicate().order(ByteOrder.LITTLE_ENDIAN);
+
+        // To avoid checking if it is safe to read a whole long near the
+        // end of a chunk, we copy last couple of lines to a padded buffer
+        // and process that part separately.
+        int limit = view.limit();
+        int pos = Math.max(limit - 16, -1);
+        while (pos >= 0 && view.get(pos) != '\n') {
+            pos--;
+        }
+        pos++;
+        if (pos > 0) {
+            processChunkLongs(view, pos);
+        }
+        int tailLen = limit - pos;
+        // 8 bytes of zero padding keep every getLong() of the tail in bounds.
+        var tailBuf = ByteBuffer.allocate(tailLen + 8).order(ByteOrder.LITTLE_ENDIAN);
+        view.get(pos, tailBuf.array(), 0, tailLen);
+        processChunkLongs(tailBuf, tailLen);
+        return this;
+    }
+
+    /**
+     * Parses lines of the form {@code name;[-]d.d\n} or {@code name;[-]dd.d\n}
+     * from {@code buf[0, limit)} and adds each measurement to the table.
+     *
+     * @param buf   little-endian buffer; it must stay readable for a full long
+     *              at every position the parser visits below {@code limit}
+     * @param limit end of the data to parse; must fall on a line boundary
+     * @return this aggregator, for chaining
+     */
+    Aggregator processChunkLongs(ByteBuffer buf, int limit) {
+        assert (buf.order() == ByteOrder.LITTLE_ENDIAN) : "Expected little-endian buffer";
+        int pos = 0;
+        while (pos < limit) {
+
+            int start = pos;
+            int hash = 0;
+            // Scan the station name one long at a time, mixing the bytes seen so
+            // far into the hash and stopping at the first ';'.
+            // Note: 0xFFFFFFFF is an int literal (-1), so `tmpLong & 0xFFFFFFFF`
+            // leaves the long unchanged; it is the (int) cast that truncates.
+            // The arithmetic only influences slot distribution, never results,
+            // and is kept as is.
+            while (true) {
+                // This is a bit ugly, but it is faster than reading by byte.
+                long tmpLong = buf.getLong(pos);
+                if ((tmpLong & 0xFF) == ';') {
+                    break;
+                }
+                if (((tmpLong >>> 8) & 0xFF) == ';') {
+                    hash = (33 * hash) ^ (int) (tmpLong & 0xFF);
+                    pos += 1;
+                    break;
+                }
+                if (((tmpLong >>> 16) & 0xFF) == ';') {
+                    hash = (33 * hash) ^ (int) (tmpLong & 0xFFFF);
+                    pos += 2;
+                    break;
+                }
+                if (((tmpLong >>> 24) & 0xFF) == ';') {
+                    hash = (33 * hash) ^ (int) (tmpLong & 0xFFFFFF);
+                    pos += 3;
+                    break;
+                }
+                if (((tmpLong >>> 32) & 0xFF) == ';') {
+                    hash = (33 * hash) ^ (int) (tmpLong & 0xFFFFFFFF);
+                    pos += 4;
+                    break;
+                }
+                if (((tmpLong >>> 40) & 0xFF) == ';') {
+                    hash = ((33 * hash) ^ (int) (tmpLong & 0xFFFFFFFF)) + (int) ((tmpLong >>> 33) & 0xFF);
+                    pos += 5;
+                    break;
+                }
+                if (((tmpLong >>> 48) & 0xFF) == ';') {
+                    hash = ((33 * hash) ^ (int) (tmpLong & 0xFFFFFFFF)) + (int) ((tmpLong >>> 33) & 0xFFFF);
+                    pos += 6;
+                    break;
+                }
+                if (((tmpLong >>> 56) & 0xFF) == ';') {
+                    hash = ((33 * hash) ^ (int) (tmpLong & 0xFFFFFFFF)) + (int) ((tmpLong >>> 33) & 0xFFFFFF);
+                    pos += 7;
+                    break;
+                }
+                hash = ((33 * hash) ^ (int) (tmpLong & 0xFFFFFFFF)) + (int) ((tmpLong >>> 33) & 0xFFFFFFFF);
+                pos += 8;
+            }
+            hash = (33 * hash) ^ (hash >>> 15);
+            int len = pos - start;
+            assert (buf.get(pos) == ';') : "Expected ';'";
+            pos++;
+
+            // Parse the value as an integer number of tenths.
+            int measurement;
+            {
+                long tmpLong = buf.getLong(pos);
+                int sign = 1;
+                if ((tmpLong & 0xFF) == '-') {
+                    sign = -1;
+                    tmpLong >>>= 8;
+                    pos++;
+                }
+                int value;
+                if (((tmpLong >>> 8) & 0xFF) == '.') {
+                    // "d.d\n": digit, dot, digit, line feed
+                    value = (int) (((tmpLong & 0xFF) - '0') * 10 + (((tmpLong >>> 16) & 0xFF) - '0'));
+                    pos += 4;
+                }
+                else {
+                    // "dd.d\n": two digits, dot, digit, line feed
+                    value = (int) (((tmpLong & 0xFF) - '0') * 100 + (((tmpLong >>> 8) & 0xFF) - '0') * 10 + (((tmpLong >>> 24) & 0xFF) - '0'));
+                    pos += 5;
+                }
+                measurement = sign * value;
+            }
+            assert (buf.get(pos - 1) == '\n') : "Expected '\\n'";
+
+            add(buf, start, len, hash, measurement);
+        }
+
+        return this;
+    }
+
+    /** Streams one {@link Entry} view per station stored in this table. */
+    public Stream<Entry> stream() {
+        return Arrays.stream(index)
+                .filter(offset -> offset != 0)
+                .mapToObj(offset -> new Entry(mem, offset));
+    }
+
+    /**
+     * Folds one measurement into the record of the station whose name is
+     * {@code buf[start, start + len)}, creating the record on first sight.
+     * Collisions are resolved by linear probing.
+     */
+    private void add(ByteBuffer buf, int start, int len, int hash, int measurement) {
+        int idx = hash & INDEX_MASK;
+        while (true) {
+            if (index[idx] != 0) {
+                int offset = index[idx];
+                if (keyEqual(offset, buf, start, len)) {
+                    // Skip the length slot and the key words to reach the stats.
+                    int pos = offset + (len >> 3) + 2;
+                    mem[pos + FLD_MIN] = Math.min((int) measurement, (int) mem[pos + FLD_MIN]);
+                    mem[pos + FLD_MAX] = Math.max((int) measurement, (int) mem[pos + FLD_MAX]);
+                    mem[pos + FLD_SUM] += measurement;
+                    mem[pos + FLD_COUNT] += 1;
+                    return;
+                }
+            }
+            else {
+                index[idx] = create(buf, start, len, hash, measurement);
+                return;
+            }
+            idx = (idx + 1) & INDEX_MASK;
+        }
+    }
+
+    /**
+     * Appends a new station record to {@code mem}, seeded with a single
+     * measurement, and returns the offset of that record.
+     */
+    private int create(ByteBuffer buf, int start, int len, int hash, int measurement) {
+        int offset = memUsed;
+
+        mem[offset] = len;
+
+        int memPos = offset + 1;
+        int memEndEarly = memPos + (len >> 3);
+        int bufPos = start;
+        int bufEnd = start + len;
+        while (memPos < memEndEarly) {
+            mem[memPos] = buf.getLong(bufPos);
+            memPos += 1;
+            bufPos += 8;
+        }
+        if (bufPos < bufEnd) {
+            // Keep only the low (len % 8) bytes of the last word; the rest
+            // belongs to the separator and the value that follow the name.
+            int shift = (8 - (len & 7)) << 3; // (8 - (len % 8)) * 8
+            long tmpLong = buf.getLong(bufPos) << shift >>> shift;
+            mem[memPos] = tmpLong;
+        }
+        else {
+            // "consume" extra long - makes math a bit simpler to calculate
+            // fields offset for update.
+            mem[memPos] = 0;
+        }
+
+        memPos += 1;
+        mem[memPos + FLD_MIN] = measurement;
+        mem[memPos + FLD_MAX] = measurement;
+        mem[memPos + FLD_SUM] = measurement;
+        mem[memPos + FLD_COUNT] = 1;
+        memUsed = memPos + 4;
+
+        return offset;
+    }
+
+    /**
+     * Tells whether the record at {@code offset} stores exactly the key
+     * {@code buf[start, start + len)}.
+     */
+    private boolean keyEqual(int offset, ByteBuffer buf, int start, int len) {
+        if (len != mem[offset]) {
+            return false;
+        }
+        int memPos = offset + 1;
+        int memEndEarly = memPos + (len >> 3);
+        int bufPos = start;
+        int bufEnd = start + len;
+        while (memPos < memEndEarly) {
+            if (mem[memPos] != buf.getLong(bufPos)) {
+                return false;
+            }
+            memPos += 1;
+            bufPos += 8;
+        }
+        if (bufPos < bufEnd) {
+            // Compare the trailing partial word with the same masking as create().
+            int shift = (8 - (len & 7)) << 3; // (8 - (len % 8)) * 8
+            long tmpLong = buf.getLong(bufPos) << shift >>> shift;
+            if (mem[memPos] != tmpLong) {
+                return false;
+            }
+        }
+        return true;
+    }
+
+    /**
+     * View of one station record inside an aggregator's {@code mem} array.
+     * It acts as both key source and value when results are collected into a map.
+     */
+    public static class Entry {
+        private final long[] mem;
+        private final int offset;
+        private String key;
+
+        Entry(long[] mem, int offset) {
+            this.mem = mem;
+            this.offset = offset;
+        }
+
+        /** Returns the station name, decoded from the stored key words on first use. */
+        public String getKey() {
+            if (key == null) {
+                int pos = this.offset;
+                int keyLen = (int) mem[pos++];
+                // Must match the byte order the key words were read with.
+                var tmpBuf = ByteBuffer.allocate(keyLen + 8).order(ByteOrder.LITTLE_ENDIAN);
+                for (int i = 0; i < keyLen; i += 8) {
+                    tmpBuf.putLong(mem[pos++]);
+                }
+                key = new String(tmpBuf.array(), 0, keyLen, StandardCharsets.UTF_8);
+            }
+            return key;
+        }
+
+        /**
+         * Merges the stats of {@code other} (a record for the same station,
+         * typically from another aggregator) into this record.
+         *
+         * @return this entry
+         */
+        public Entry add(Entry other) {
+            int keyLen = (int) mem[offset];
+            int fldOffset = (keyLen >> 3) + 2;
+            int pos = offset + fldOffset;
+            int otherPos = other.offset + fldOffset;
+            long[] otherMem = other.mem;
+            mem[pos + FLD_MIN] = Math.min((int) mem[pos + FLD_MIN], (int) otherMem[otherPos + FLD_MIN]);
+            mem[pos + FLD_MAX] = Math.max((int) mem[pos + FLD_MAX], (int) otherMem[otherPos + FLD_MAX]);
+            mem[pos + FLD_SUM] += otherMem[otherPos + FLD_SUM];
+            mem[pos + FLD_COUNT] += otherMem[otherPos + FLD_COUNT];
+            return this;
+        }
+
+        public Entry getValue() {
+            return this;
+        }
+
+        /** Formats the record as {@code min/mean/max}, each with one decimal. */
+        @Override
+        public String toString() {
+            int keyLen = (int) mem[offset];
+            int pos = offset + (keyLen >> 3) + 2;
+            return round(mem[pos + FLD_MIN])
+                    + "/" + round(((double) mem[pos + FLD_SUM]) / mem[pos + FLD_COUNT])
+                    + "/" + round(mem[pos + FLD_MAX]);
+        }
+
+        // Input is in tenths: round to the nearest tenth, then scale back.
+        private static double round(double value) {
+            return Math.round(value) / 10.0;
+        }
+    }
+}

From 1bbddaaaf6b9a23bfc715b188f2da8908b81efe8 Mon Sep 17 00:00:00 2001
From: Gunnar Morling <gunnar.morling@googlemail.com>
Date: Tue, 16 Jan 2024 22:55:16 +0100
Subject: [PATCH 046/268] Leaderboard update

---
 README.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/README.md b/README.md
index 76f10337b..0aacc7bee 100644
--- a/README.md
+++ b/README.md
@@ -60,6 +60,7 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:05.218 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_iziamos.java)| 21.0.1-open | [John Ziamos](https://github.com/iziamos) |  |
 |   | 00:05.235 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_unbounded.java)| 21.0.1-open | [unbounded](https://github.com/unbounded) |  |
 |   | 00:05.297 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_zerninv.java)| 21.0.1-open | [zerninv](https://github.com/zerninv) |  |
+|   | 00:05.336 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_plevart.java)| 21.0.1-tem | [Peter Levart](https://github.com/plevart) |  |
 |   | 00:05.478 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_obourgain.java)| 21.0.1-open | [Olivier Bourgain](https://github.com/obourgain) |  |
 |   | 00:05.887 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_charlibot.java)| 21.0.1-graal | [Charlie Evans](https://github.com/charlibot) |  |
 |   | 00:05.960 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_vaidhy.java)| 21.0.1-graal | [Vaidhy Mayilrangam](https://github.com/vaidhy) |  |
@@ -71,6 +72,7 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:06.576 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_as-com.java)| 21.0.1-open | [Andrew Sun](https://github.com/as-com) |  |
 |   | 00:06.715 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_algirdasrascius.java)| 21.0.1-open | [Algirdas Raščius](https://github.com/algirdasrascius) |  |
 |   | 00:06.911 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jincongho.java)| 21.0.1-open | [Jin Cong Ho](https://github.com/jincongho) |  |
+|   | 00:06.993 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_gonix.java)| 21.0.1-open | [gonix](https://github.com/gonix) |  |
 |   | 00:07.730 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jotschi.java)| 21.0.1-open | [Johannes Schüth](https://github.com/jotschi) |  |
 |   | 00:07.809 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_roman-r-m.java)| 21.0.1-graal | [Roman Musin](https://github.com/roman-r-m) |  |
 |   | 00:07.925 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ricardopieper.java)| 21.0.1-graal | [Ricardo Pieper](https://github.com/ricardopieper) |  |
@@ -104,6 +106,7 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:12.220 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_richardstartin.java)| 21.0.1-open | [Richard Startin](https://github.com/richardstartin) |  |
 |   | 00:12.495 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_SamuelYvon.java)| 21.0.1-graal | [Samuel Yvon](https://github.com/SamuelYvon) | GraalVM native binary |
 |   | 00:12.568 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_MeanderingProgrammer.java)| 21.0.1-graal | [Vlad](https://github.com/MeanderingProgrammer) |  |
+|   | 00:12.582 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_karthikeyan97.java)| 21.0.1-open | [karthikeyan97](https://github.com/karthikeyan97) |  |
 |   | 00:13.013 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_thanhtrinity.java)| 21.0.1-graal | [Thanh Duong](https://github.com/thanhtrinity) |  |
 |   | 00:13.763 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ianopolous.java)| 21.0.1-open | [Dr Ian Preston](https://github.com/ianopolous) |  |
 |   | 00:13.817 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_entangled90.java)| 21.0.1-open | [Carlo](https://github.com/entangled90) |  |

From 77872e197d1e6237c327bdbd5fbd925648ca4337 Mon Sep 17 00:00:00 2001
From: Roman Musin <995612+roman-r-m@users.noreply.github.com>
Date: Wed, 17 Jan 2024 17:07:56 +0000
Subject: [PATCH 047/268] Version 3 (#455)

---
 .../onebrc/CalculateAverage_roman_r_m.java    | 265 ++++++++++--------
 1 file changed, 154 insertions(+), 111 deletions(-)

diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_roman_r_m.java b/src/main/java/dev/morling/onebrc/CalculateAverage_roman_r_m.java
index c869b7d9c..2efb46120 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_roman_r_m.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_roman_r_m.java
@@ -24,17 +24,12 @@
 import java.lang.reflect.Field;
 import java.nio.channels.FileChannel;
 import java.nio.file.Paths;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.Map;
 import java.util.TreeMap;
 import java.util.stream.IntStream;
 
 public class CalculateAverage_roman_r_m {
 
-    public static final int DOT_3_RD_BYTE_MASK = (byte) '.' << 16;
     private static final String FILE = "./measurements.txt";
-    private static MemorySegment ms;
 
     private static Unsafe UNSAFE;
 
@@ -60,7 +55,7 @@ static long find(long l, long mask) {
         return match != 0 ? firstSetByteIndex(match) : -1;
     }
 
-    static long nextNewline(long from) {
+    static long nextNewline(long from, MemorySegment ms) {
         long start = from;
         long i;
         long next = ms.get(ValueLayout.JAVA_LONG_UNALIGNED, start);
@@ -71,6 +66,110 @@ static long nextNewline(long from) {
         return start + i;
     }
 
+    static class Worker {
+        private final MemorySegment ms;
+        private final long end;
+        private long offset;
+
+        public Worker(MemorySegment ms, long start, long end) {
+            this.ms = ms.asSlice(start, end - start);
+            this.offset = 0;
+            this.end = end - start;
+        }
+
+        private void parseName(ByteString station) {
+            long start = offset;
+            long pos = -1;
+
+            while (end - offset > 8) {
+                long next = UNSAFE.getLong(ms.address() + offset);
+                pos = find(next, SEMICOLON_MASK);
+                if (pos >= 0) {
+                    offset += pos;
+                    break;
+                }
+                else {
+                    offset += 8;
+                }
+            }
+            if (pos < 0) {
+                while (UNSAFE.getByte(ms.address() + offset++) != ';') {
+                }
+                offset--;
+            }
+
+            int len = (int) (offset - start);
+            station.offset = start;
+            station.len = len;
+            station.hash = 0;
+
+            offset++;
+        }
+
+        long parseNumberFast() {
+            long encodedVal = UNSAFE.getLong(ms.address() + offset);
+
+            var len = find(encodedVal, LINE_END_MASK);
+            offset += len + 1;
+
+            encodedVal ^= broadcast((byte) 0x30);
+
+            long c0 = len == 4 ? 100 : 10;
+            long c1 = 10 * (len - 3);
+            long c2 = 4 - len;
+            long c3 = len - 3;
+            long a = (encodedVal & 0xFF) * c0;
+            long b = ((encodedVal & 0xFF00) >>> 8) * c1;
+            long c = ((encodedVal & 0xFF0000L) >>> 16) * c2;
+            long d = ((encodedVal & 0xFF000000L) >>> 24) * c3;
+
+            return a + b + c + d;
+        }
+
+        long parseNumberSlow() {
+            long val = UNSAFE.getByte(ms.address() + offset++) - '0';
+            byte b;
+            while ((b = UNSAFE.getByte(ms.address() + offset++)) != '.') {
+                val = val * 10 + (b - '0');
+            }
+            b = UNSAFE.getByte(ms.address() + offset);
+            val = val * 10 + (b - '0');
+            offset += 2;
+            return val;
+        }
+
+        long parseNumber() {
+            long val;
+            int neg = 1 - Integer.bitCount(UNSAFE.getByte(ms.address() + offset) & 0x10);
+            offset += neg;
+
+            if (end - offset > 8) {
+                val = parseNumberFast();
+            }
+            else {
+                val = parseNumberSlow();
+            }
+            val *= 1 - 2 * neg;
+            return val;
+        }
+
+        public TreeMap<String, ResultRow> run() {
+            var resultStore = new ResultStore();
+            var station = new ByteString(ms);
+
+            while (offset < end) {
+                parseName(station);
+                long val = parseNumber();
+                var a = resultStore.get(station);
+                a.min = Math.min(a.min, val);
+                a.max = Math.max(a.max, val);
+                a.sum += val;
+                a.count++;
+            }
+            return resultStore.toMap();
+        }
+    }
+
     public static void main(String[] args) throws Exception {
         Field f = Unsafe.class.getDeclaredField("theUnsafe");
         f.setAccessible(true);
@@ -79,98 +178,18 @@ public static void main(String[] args) throws Exception {
         long fileSize = new File(FILE).length();
 
         var channel = FileChannel.open(Paths.get(FILE));
-        ms = channel.map(FileChannel.MapMode.READ_ONLY, 0, fileSize, Arena.ofAuto());
+        MemorySegment ms = channel.map(FileChannel.MapMode.READ_ONLY, 0, fileSize, Arena.ofAuto());
 
         int numThreads = fileSize > Integer.MAX_VALUE ? Runtime.getRuntime().availableProcessors() : 1;
         long chunk = fileSize / numThreads;
+
         var result = IntStream.range(0, numThreads)
                 .parallel()
                 .mapToObj(i -> {
                     boolean lastChunk = i == numThreads - 1;
-                    long chunkStart = i == 0 ? 0 : nextNewline(i * chunk) + 1;
-                    long chunkEnd = lastChunk ? fileSize : nextNewline((i + 1) * chunk);
-
-                    var resultStore = new ResultStore();
-                    var station = new ByteString();
-
-                    long offset = chunkStart;
-                    while (offset < chunkEnd) {
-                        long start = offset;
-                        long pos = -1;
-
-                        while (chunkEnd - offset >= 8) {
-                            long next = UNSAFE.getLong(ms.address() + offset);
-                            pos = find(next, SEMICOLON_MASK);
-                            if (pos >= 0) {
-                                offset += pos;
-                                break;
-                            }
-                            else {
-                                offset += 8;
-                            }
-                        }
-                        if (pos < 0) {
-                            while (UNSAFE.getByte(ms.address() + offset++) != ';') {
-                            }
-                            offset--;
-                        }
-
-                        int len = (int) (offset - start);
-                        // TODO can we not copy and use a reference into the memory segment to perform table lookup?
-
-                        station.offset = start;
-                        station.len = len;
-                        station.hash = 0;
-
-                        offset++;
-
-                        long val;
-                        boolean neg;
-                        if (!lastChunk || fileSize - offset >= 8) {
-                            long encodedVal = UNSAFE.getLong(ms.address() + offset);
-                            neg = (encodedVal & (byte) '-') == (byte) '-';
-                            if (neg) {
-                                encodedVal >>= 8;
-                                offset++;
-                            }
-
-                            if ((encodedVal & DOT_3_RD_BYTE_MASK) == DOT_3_RD_BYTE_MASK) {
-                                val = (encodedVal & 0xFF - 0x30) * 100 + (encodedVal >> 8 & 0xFF - 0x30) * 10 + (encodedVal >> 24 & 0xFF - 0x30);
-                                offset += 5;
-                            }
-                            else {
-                                // based on http://0x80.pl/articles/simd-parsing-int-sequences.html#parsing-and-conversion-of-signed-numbers
-                                val = Long.compress(encodedVal, 0xFF00FFL) - 0x303030;
-                                val = ((val * 2561) >> 8) & 0xff;
-                                offset += 4;
-                            }
-                        }
-                        else {
-                            neg = UNSAFE.getByte(ms.address() + offset) == '-';
-                            if (neg) {
-                                offset++;
-                            }
-                            val = UNSAFE.getByte(ms.address() + offset++) - '0';
-                            byte b;
-                            while ((b = UNSAFE.getByte(ms.address() + offset++)) != '.') {
-                                val = val * 10 + (b - '0');
-                            }
-                            b = UNSAFE.getByte(ms.address() + offset);
-                            val = val * 10 + (b - '0');
-                            offset += 2;
-                        }
-
-                        if (neg) {
-                            val = -val;
-                        }
-
-                        var a = resultStore.get(station);
-                        a.min = Math.min(a.min, val);
-                        a.max = Math.max(a.max, val);
-                        a.sum += val;
-                        a.count++;
-                    }
-                    return resultStore.toMap();
+                    long chunkStart = i == 0 ? 0 : nextNewline(i * chunk, ms) + 1;
+                    long chunkEnd = lastChunk ? fileSize : nextNewline((i + 1) * chunk, ms);
+                    return new Worker(ms, chunkStart, chunkEnd).run();
                 }).reduce((m1, m2) -> {
                     m2.forEach((k, v) -> m1.merge(k, v, ResultRow::merge));
                     return m1;
@@ -181,19 +200,24 @@ public static void main(String[] args) throws Exception {
 
     static final class ByteString {
 
+        private final MemorySegment ms;
         private long offset;
         private int len = 0;
         private int hash = 0;
 
+        ByteString(MemorySegment ms) { // Binds this key to the segment whose address its offset is added to.
+            this.ms = ms;
+        }
+
         @Override
         public String toString() {
             var bytes = new byte[len];
-            MemorySegment.copy(ms, ValueLayout.JAVA_BYTE, offset, bytes, 0, len);
+            UNSAFE.copyMemory(null, ms.address() + offset, bytes, Unsafe.ARRAY_BYTE_BASE_OFFSET, len); // copies len bytes from absolute address ms.address() + offset into bytes
             return new String(bytes, 0, len);
         }
 
         public ByteString copy() {
-            var copy = new ByteString();
+            var copy = new ByteString(ms);
             copy.offset = this.offset;
             copy.len = this.len;
             copy.hash = this.hash;
@@ -216,13 +240,18 @@ public boolean equals(Object o) {
 
             long base1 = ms.address() + offset;
             long base2 = ms.address() + that.offset;
-            for (; i + 3 < len; i += 4) {
-                int i1 = UNSAFE.getInt(base1 + i);
-                int i2 = UNSAFE.getInt(base2 + i);
-                if (i1 != i2) {
+            for (; i + 7 < len; i += 8) {
+                long l1 = UNSAFE.getLong(base1 + i);
+                long l2 = UNSAFE.getLong(base2 + i);
+                if (l1 != l2) {
                     return false;
                 }
             }
+            if (len >= 8) {
+                long l1 = UNSAFE.getLong(base1 + len - 8);
+                long l2 = UNSAFE.getLong(base2 + len - 8);
+                return l1 == l2;
+            }
             for (; i < len; i++) {
                 byte i1 = UNSAFE.getByte(base1 + i);
                 byte i2 = UNSAFE.getByte(base2 + i);
@@ -236,10 +265,9 @@ public boolean equals(Object o) {
         @Override
         public int hashCode() {
             if (hash == 0) {
-                // not sure why but it seems to be working a bit better
-                hash = UNSAFE.getInt(ms.address() + offset);
-                hash = hash >>> (8 * Math.max(0, 4 - len));
-                hash |= len;
+                long h = UNSAFE.getLong(ms.address() + offset); // always loads 8 bytes from the start of the name, even when len < 8
+                h = Long.reverseBytes(h) >>> (8 * Math.max(0, 8 - len)); // byte-swap, then shift out (8 - len) bytes; on a little-endian host those are the bytes past the name
+                hash = (int) (h ^ (h >>> 32)); // fold the high half into the low half; a result of 0 is not cached and is recomputed next call
             }
             return hash;
         }
@@ -269,25 +297,40 @@ public ResultRow merge(ResultRow other) {
     }
 
     static class ResultStore {
-        private final ArrayList<ResultRow> results = new ArrayList<>(10000);
-        private final Map<ByteString, Integer> indices = new HashMap<>(10000);
+        private static final int SIZE = 16384; // power of two, so (SIZE - 1) works as an index mask
+        private final ByteString[] keys = new ByteString[SIZE]; // keys[i] pairs with values[i]; null marks an empty slot
+        private final ResultRow[] values = new ResultRow[SIZE];
 
         ResultRow get(ByteString s) {
-            var idx = indices.get(s);
-            if (idx != null) {
-                return results.get(idx);
+            int h = s.hashCode();
+            int idx = (SIZE - 1) & h; // initial slot: low bits of the hash
+
+            int i = 0;
+            while (keys[idx] != null && !keys[idx].equals(s)) { // probe until an empty slot or an equal key; there is no other exit
+                i++;
+                idx = (idx + i * i) % SIZE; // each collision advances by i squared from the current slot
+            }
+            ResultRow result;
+            if (keys[idx] == null) {
+                keys[idx] = s.copy(); // keep a copy rather than s itself, which the visible caller reuses across lookups
+                result = new ResultRow();
+                values[idx] = result;
             }
             else {
-                ResultRow next = new ResultRow();
-                results.add(next);
-                indices.put(s.copy(), results.size() - 1);
-                return next;
+                result = values[idx];
+                // TODO see if it makes any difference
+                // keys[idx].offset = s.offset;
             }
+            return result;
         }
 
         TreeMap<String, ResultRow> toMap() {
             var result = new TreeMap<String, ResultRow>();
-            indices.forEach((name, idx) -> result.put(name.toString(), results.get(idx)));
+            for (int i = 0; i < SIZE; i++) { // scan every slot; occupied ones become name -> row entries
+                if (keys[i] != null) {
+                    result.put(keys[i].toString(), values[i]);
+                }
+            }
             return result;
         }
     }

From 927880b97ec3c0ae354773ae645dc8c0bbec8345 Mon Sep 17 00:00:00 2001
From: Jaromir Hamala <jaromir.hamala@gmail.com>
Date: Wed, 17 Jan 2024 18:28:03 +0100
Subject: [PATCH 048/268] edge-case in hashing fixed (#459)

also a bunch of smaller improvements
---
 calculate_average_jerrinot.sh                 |   2 +-
 .../onebrc/CalculateAverage_jerrinot.java     | 303 +++++++++---------
 2 files changed, 152 insertions(+), 153 deletions(-)

diff --git a/calculate_average_jerrinot.sh b/calculate_average_jerrinot.sh
index 1bbf680fc..8de06c3d6 100755
--- a/calculate_average_jerrinot.sh
+++ b/calculate_average_jerrinot.sh
@@ -17,5 +17,5 @@
 
 # -XX:+UnlockDiagnosticVMOptions -XX:PrintAssemblyOptions=intel -XX:CompileCommand=print,*.CalculateAverage_mtopolnik::recordMeasurementAndAdvanceCursor"
 # -XX:InlineSmallCode=10000 -XX:-TieredCompilation -XX:CICompilerCount=2 -XX:CompileThreshold=1000\
-java --enable-preview \
+java -XX:+UseParallelGC  --enable-preview \
   --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_jerrinot
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_jerrinot.java b/src/main/java/dev/morling/onebrc/CalculateAverage_jerrinot.java
index 6fb89bb67..5373cb084 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_jerrinot.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_jerrinot.java
@@ -22,15 +22,24 @@
 import java.lang.foreign.Arena;
 import java.lang.reflect.Field;
 import java.nio.channels.FileChannel.MapMode;
-import java.util.Map;
-import java.util.TreeMap;
+import java.util.*;
 
+/**
+ * I figured out it would be very hard to win the main competition of the One Billion Rows Challenge,
+ * but I think this code has a good chance to win a special prize for the Ugliest Solution ever! :)
+ *
+ * Anyway, if you can make sense out of not exactly idiomatic Java code, and you enjoy pushing performance limits
+ * then QuestDB - the fastest open-source time-series database - is hiring: https://questdb.io/careers/core-database-engineer/
+ *
+ */
 public class CalculateAverage_jerrinot {
     private static final Unsafe UNSAFE = unsafe();
     private static final String MEASUREMENTS_TXT = "measurements.txt";
     // todo: with hyper-threading enable we would be better of with availableProcessors / 2;
     // todo: validate the testing env. params.
     private static final int THREAD_COUNT = Runtime.getRuntime().availableProcessors();
+    // private static final int THREAD_COUNT = 4;
+
     private static final long SEPARATOR_PATTERN = 0x3B3B3B3B3B3B3B3BL;
 
     private static Unsafe unsafe() {
@@ -72,7 +81,7 @@ static void calculate() throws Exception {
             Processor[] processors = new Processor[THREAD_COUNT];
             Thread[] threads = new Thread[THREAD_COUNT];
 
-            for (int i = 0; i < THREAD_COUNT; i++) {
+            for (int i = 0; i < THREAD_COUNT - 1; i++) {
                 long startA = chunkStartOffsets[i * chunkPerThread];
                 long endA = chunkStartOffsets[i * chunkPerThread + 1];
                 long startB = chunkStartOffsets[i * chunkPerThread + 1];
@@ -89,8 +98,22 @@ static void calculate() throws Exception {
                 thread.start();
             }
 
+            int ownIndex = THREAD_COUNT - 1;
+            long startA = chunkStartOffsets[ownIndex * chunkPerThread];
+            long endA = chunkStartOffsets[ownIndex * chunkPerThread + 1];
+            long startB = chunkStartOffsets[ownIndex * chunkPerThread + 1];
+            long endB = chunkStartOffsets[ownIndex * chunkPerThread + 2];
+            long startC = chunkStartOffsets[ownIndex * chunkPerThread + 2];
+            long endC = chunkStartOffsets[ownIndex * chunkPerThread + 3];
+            long startD = chunkStartOffsets[ownIndex * chunkPerThread + 3];
+            long endD = chunkStartOffsets[ownIndex * chunkPerThread + 4];
+            Processor processor = new Processor(startA, endA, startB, endB, startC, endC, startD, endD);
+            processor.run();
+
             var accumulator = new TreeMap<String, Processor.StationStats>();
-            for (int i = 0; i < THREAD_COUNT; i++) {
+            processor.accumulateStatus(accumulator);
+
+            for (int i = 0; i < THREAD_COUNT - 1; i++) {
                 Thread t = threads[i];
                 t.join();
                 processors[i].accumulateStatus(accumulator);
@@ -131,7 +154,7 @@ public static int ceilPow2(int i) {
 
     private static class Processor implements Runnable {
         private static final int MAP_SLOT_COUNT = ceilPow2(10000);
-        private static final int STATION_MAX_NAME_BYTES = 104;
+        private static final int STATION_MAX_NAME_BYTES = 120;
 
         private static final long COUNT_OFFSET = 0;
         private static final long MIN_OFFSET = 4;
@@ -162,23 +185,16 @@ private static class Processor implements Runnable {
         private long endC;
         private long cursorD;
         private long endD;
-        private long maskA;
-        private long maskB;
-        private long maskC;
-        private long maskD;
 
-        // credit: merykitty
-        private long parseAndStoreTemperature(long startCursor, long baseEntryPtr) {
-            long word = UNSAFE.getLong(startCursor);
-            final long negateda = ~word;
-            final int dotPos = Long.numberOfTrailingZeros(negateda & 0x10101000);
-            final long signed = (negateda << 59) >> 63;
-            final long removeSignMask = ~(signed & 0xFF);
-            final long digits = ((word & removeSignMask) << (28 - dotPos)) & 0x0F000F0F00L;
-            final long absValue = ((digits * 0x640a0001) >>> 32) & 0x3FF;
-            final int temperature = (int) ((absValue ^ signed) - signed);
+        // private long maxClusterLen;
 
+        // credit: merykitty
+        private long parseAndStoreTemperature(long startCursor, long baseEntryPtr, long word) {
+            // long word = UNSAFE.getLong(startCursor);
             long countPtr = baseEntryPtr + COUNT_OFFSET;
+            int cnt = UNSAFE.getInt(countPtr);
+            UNSAFE.putInt(countPtr, cnt + 1);
+
             long minPtr = baseEntryPtr + MIN_OFFSET;
             long maxPtr = baseEntryPtr + MAX_OFFSET;
             long sumPtr = baseEntryPtr + SUM_OFFSET;
@@ -186,16 +202,23 @@ private long parseAndStoreTemperature(long startCursor, long baseEntryPtr) {
             int min = UNSAFE.getInt(minPtr);
             int max = UNSAFE.getInt(maxPtr);
             long sum = UNSAFE.getLong(sumPtr);
-            // try if min/max intrinsics are paying off
-            // maybe braching is better? the branch is becoming more predictable with
-            // each new sample.
-            max = Math.max(max, temperature);
-            min = Math.min(min, temperature);
+
+            final long negateda = ~word;
+            final int dotPos = Long.numberOfTrailingZeros(negateda & 0x10101000);
+            final long signed = (negateda << 59) >> 63;
+            final long removeSignMask = ~(signed & 0xFF);
+            final long digits = ((word & removeSignMask) << (28 - dotPos)) & 0x0F000F0F00L;
+            final long absValue = ((digits * 0x640a0001) >>> 32) & 0x3FF;
+            final int temperature = (int) ((absValue ^ signed) - signed);
             sum += temperature;
-            UNSAFE.putInt(countPtr, UNSAFE.getInt(countPtr) + 1);
-            UNSAFE.putInt(minPtr, min);
-            UNSAFE.putInt(maxPtr, max);
             UNSAFE.putLong(sumPtr, sum);
+
+            if (temperature > max) {
+                UNSAFE.putInt(maxPtr, temperature);
+            }
+            if (temperature < min) {
+                UNSAFE.putInt(minPtr, temperature);
+            }
             return startCursor + (dotPos / 8) + 3;
         }
 
@@ -227,13 +250,13 @@ void accumulateStatus(TreeMap<String, StationStats> accumulator) {
                 int count = UNSAFE.getInt(baseAddress + COUNT_OFFSET);
                 long sum = UNSAFE.getLong(baseAddress + SUM_OFFSET);
 
-                // todo: lambdas bootstrap probably cost us
-                accumulator.compute(name, (_, v) -> {
-                    if (v == null) {
-                        return new StationStats(min, max, count, sum);
-                    }
-                    return new StationStats(Math.min(v.min, min), Math.max(v.max, max), v.count + count, v.sum + sum);
-                });
+                var v = accumulator.get(name);
+                if (v == null) {
+                    accumulator.put(name, new StationStats(min, max, count, sum));
+                }
+                else {
+                    accumulator.put(name, new StationStats(Math.min(v.min, min), Math.max(v.max, max), v.count + count, v.sum + sum));
+                }
             }
         }
 
@@ -260,11 +283,22 @@ void accumulateStatus(TreeMap<String, StationStats> accumulator) {
         private void doTail() {
             // todo: we would be probably better of without all that code dup. ("compilers hates him!")
             // System.out.println("done ILP");
+            doOne(cursorA, endA); // each call loops until its cursor reaches its end; ranges A to D are handled one after another
+            // System.out.println("done A");
+            doOne(cursorB, endB);
+            // System.out.println("done B");
+            doOne(cursorC, endC);
+            // System.out.println("done C");
+            doOne(cursorD, endD);
+            // System.out.println("done D");
+        }
+
+        private void doOne(long cursorA, long endA) {
             while (cursorA < endA) {
                 long startA = cursorA;
                 long delimiterWordA = UNSAFE.getLong(cursorA);
                 long hashA = 0;
-                maskA = getDelimiterMask(delimiterWordA);
+                long maskA = getDelimiterMask(delimiterWordA);
                 while (maskA == 0) {
                     hashA ^= delimiterWordA;
                     cursorA += 8;
@@ -273,81 +307,15 @@ private void doTail() {
                 }
                 final int delimiterByteA = Long.numberOfTrailingZeros(maskA);
                 final long semicolonA = cursorA + (delimiterByteA >> 3);
-                final long maskedWordA = delimiterWordA & ((maskA >>> 7) - 1);
+                final long maskedWordA = delimiterWordA & ((maskA - 1) ^ maskA) >>> 8;
                 hashA ^= maskedWordA;
                 int intHashA = (int) (hashA ^ (hashA >> 32));
                 intHashA = intHashA ^ (intHashA >> 17);
 
                 long baseEntryPtrA = getOrCreateEntryBaseOffset(semicolonA, startA, intHashA, maskedWordA);
-                cursorA = parseAndStoreTemperature(semicolonA + 1, baseEntryPtrA);
-            }
-            // System.out.println("done A");
-            while (cursorB < endB) {
-                long startB = cursorB;
-                long delimiterWordB = UNSAFE.getLong(cursorB);
-                long hashB = 0;
-                maskB = getDelimiterMask(delimiterWordB);
-                while (maskB == 0) {
-                    hashB ^= delimiterWordB;
-                    cursorB += 8;
-                    delimiterWordB = UNSAFE.getLong(cursorB);
-                    maskB = getDelimiterMask(delimiterWordB);
-                }
-                final int delimiterByteB = Long.numberOfTrailingZeros(maskB);
-                final long semicolonB = cursorB + (delimiterByteB >> 3);
-                final long maskedWordB = delimiterWordB & ((maskB >>> 7) - 1);
-                hashB ^= maskedWordB;
-                int intHashB = (int) (hashB ^ (hashB >> 32));
-                intHashB = intHashB ^ (intHashB >> 17);
-
-                long baseEntryPtrB = getOrCreateEntryBaseOffset(semicolonB, startB, intHashB, maskedWordB);
-                cursorB = parseAndStoreTemperature(semicolonB + 1, baseEntryPtrB);
-            }
-            // System.out.println("done B");
-            while (cursorC < endC) {
-                long startC = cursorC;
-                long delimiterWordC = UNSAFE.getLong(cursorC);
-                long hashC = 0;
-                maskC = getDelimiterMask(delimiterWordC);
-                while (maskC == 0) {
-                    hashC ^= delimiterWordC;
-                    cursorC += 8;
-                    delimiterWordC = UNSAFE.getLong(cursorC);
-                    maskC = getDelimiterMask(delimiterWordC);
-                }
-                final int delimiterByteC = Long.numberOfTrailingZeros(maskC);
-                final long semicolonC = cursorC + (delimiterByteC >> 3);
-                final long maskedWordC = delimiterWordC & ((maskC >>> 7) - 1);
-                hashC ^= maskedWordC;
-                int intHashC = (int) (hashC ^ (hashC >> 32));
-                intHashC = intHashC ^ (intHashC >> 17);
-
-                long baseEntryPtrC = getOrCreateEntryBaseOffset(semicolonC, startC, intHashC, maskedWordC);
-                cursorC = parseAndStoreTemperature(semicolonC + 1, baseEntryPtrC);
-            }
-            // System.out.println("done C");
-            while (cursorD < endD) {
-                long startD = cursorD;
-                long delimiterWordD = UNSAFE.getLong(cursorD);
-                long hashD = 0;
-                maskD = getDelimiterMask(delimiterWordD);
-                while (maskD == 0) {
-                    hashD ^= delimiterWordD;
-                    cursorD += 8;
-                    delimiterWordD = UNSAFE.getLong(cursorD);
-                    maskD = getDelimiterMask(delimiterWordD);
-                }
-                final int delimiterByteD = Long.numberOfTrailingZeros(maskD);
-                final long semicolonD = cursorD + (delimiterByteD >> 3);
-                final long maskedWordD = delimiterWordD & ((maskD >>> 7) - 1);
-                hashD ^= maskedWordD;
-                int intHashD = (int) (hashD ^ (hashD >> 32));
-                intHashD = intHashD ^ (intHashD >> 17);
-
-                long baseEntryPtrD = getOrCreateEntryBaseOffset(semicolonD, startD, intHashD, maskedWordD);
-                cursorD = parseAndStoreTemperature(semicolonD + 1, baseEntryPtrD);
+                long temperatureWordA = UNSAFE.getLong(semicolonA + 1);
+                cursorA = parseAndStoreTemperature(semicolonA + 1, baseEntryPtrA, temperatureWordA);
             }
-            // System.out.println("done D");
         }
 
         @Override
@@ -359,10 +327,14 @@ public void run() {
                 long startC = cursorC;
                 long startD = cursorD;
 
-                long delimiterWordA = UNSAFE.getLong(cursorA);
-                long delimiterWordB = UNSAFE.getLong(cursorB);
-                long delimiterWordC = UNSAFE.getLong(cursorC);
-                long delimiterWordD = UNSAFE.getLong(cursorD);
+                long currentWordA = UNSAFE.getLong(startA);
+                // long delimiterWordA2 = UNSAFE.getLong(startA + 8);
+                long currentWordB = UNSAFE.getLong(startB);
+                // long delimiterWordB2 = UNSAFE.getLong(startB + 8);
+                long currentWordC = UNSAFE.getLong(startC);
+                // long delimiterWordCa = UNSAFE.getLong(startC + 8);
+                long currentWordD = UNSAFE.getLong(startD);
+                // long delimiterWordD2 = UNSAFE.getLong(startD + 8);
 
                 long hashA = 0;
                 long hashB = 0;
@@ -370,58 +342,62 @@ public void run() {
                 long hashD = 0;
 
                 // credits for the hashing idea: royvanrijn
-                maskA = getDelimiterMask(delimiterWordA);
+                long maskA = getDelimiterMask(currentWordA);
                 while (maskA == 0) {
-                    hashA ^= delimiterWordA;
+                    hashA ^= currentWordA;
                     cursorA += 8;
-                    delimiterWordA = UNSAFE.getLong(cursorA);
-                    maskA = getDelimiterMask(delimiterWordA);
+                    currentWordA = UNSAFE.getLong(cursorA);
+                    maskA = getDelimiterMask(currentWordA);
                 }
                 final int delimiterByteA = Long.numberOfTrailingZeros(maskA);
                 final long semicolonA = cursorA + (delimiterByteA >> 3);
-                final long maskedWordA = delimiterWordA & ((maskA >>> 7) - 1);
+                long temperatureWordA = UNSAFE.getLong(semicolonA + 1);
+                final long maskedWordA = currentWordA & ((maskA - 1) ^ maskA) >>> 8;
                 hashA ^= maskedWordA;
                 int intHashA = (int) (hashA ^ (hashA >> 32));
                 intHashA = intHashA ^ (intHashA >> 17);
 
-                maskB = getDelimiterMask(delimiterWordB);
+                long maskB = getDelimiterMask(currentWordB);
                 while (maskB == 0) {
-                    hashB ^= delimiterWordB;
+                    hashB ^= currentWordB;
                     cursorB += 8;
-                    delimiterWordB = UNSAFE.getLong(cursorB);
-                    maskB = getDelimiterMask(delimiterWordB);
+                    currentWordB = UNSAFE.getLong(cursorB);
+                    maskB = getDelimiterMask(currentWordB);
                 }
                 final int delimiterByteB = Long.numberOfTrailingZeros(maskB);
                 final long semicolonB = cursorB + (delimiterByteB >> 3);
-                final long maskedWordB = delimiterWordB & ((maskB >>> 7) - 1);
+                long temperatureWordB = UNSAFE.getLong(semicolonB + 1);
+                final long maskedWordB = currentWordB & ((maskB - 1) ^ maskB) >>> 8;
                 hashB ^= maskedWordB;
                 int intHashB = (int) (hashB ^ (hashB >> 32));
                 intHashB = intHashB ^ (intHashB >> 17);
 
-                maskC = getDelimiterMask(delimiterWordC);
+                long maskC = getDelimiterMask(currentWordC);
                 while (maskC == 0) {
-                    hashC ^= delimiterWordC;
+                    hashC ^= currentWordC;
                     cursorC += 8;
-                    delimiterWordC = UNSAFE.getLong(cursorC);
-                    maskC = getDelimiterMask(delimiterWordC);
+                    currentWordC = UNSAFE.getLong(cursorC);
+                    maskC = getDelimiterMask(currentWordC);
                 }
                 final int delimiterByteC = Long.numberOfTrailingZeros(maskC);
                 final long semicolonC = cursorC + (delimiterByteC >> 3);
-                final long maskedWordC = delimiterWordC & ((maskC >>> 7) - 1);
+                long temperatureWordC = UNSAFE.getLong(semicolonC + 1);
+                final long maskedWordC = currentWordC & ((maskC - 1) ^ maskC) >>> 8;
                 hashC ^= maskedWordC;
                 int intHashC = (int) (hashC ^ (hashC >> 32));
                 intHashC = intHashC ^ (intHashC >> 17);
 
-                maskD = getDelimiterMask(delimiterWordD);
+                long maskD = getDelimiterMask(currentWordD);
                 while (maskD == 0) {
-                    hashD ^= delimiterWordD;
+                    hashD ^= currentWordD;
                     cursorD += 8;
-                    delimiterWordD = UNSAFE.getLong(cursorD);
-                    maskD = getDelimiterMask(delimiterWordD);
+                    currentWordD = UNSAFE.getLong(cursorD);
+                    maskD = getDelimiterMask(currentWordD);
                 }
                 final int delimiterByteD = Long.numberOfTrailingZeros(maskD);
                 final long semicolonD = cursorD + (delimiterByteD >> 3);
-                final long maskedWordD = delimiterWordD & ((maskD >>> 7) - 1);
+                long temperatureWordD = UNSAFE.getLong(semicolonD + 1);
+                final long maskedWordD = currentWordD & ((maskD - 1) ^ maskD) >>> 8;
                 hashD ^= maskedWordD;
                 int intHashD = (int) (hashD ^ (hashD >> 32));
                 intHashD = intHashD ^ (intHashD >> 17);
@@ -431,51 +407,74 @@ public void run() {
                 long baseEntryPtrC = getOrCreateEntryBaseOffset(semicolonC, startC, intHashC, maskedWordC);
                 long baseEntryPtrD = getOrCreateEntryBaseOffset(semicolonD, startD, intHashD, maskedWordD);
 
-                cursorA = parseAndStoreTemperature(semicolonA + 1, baseEntryPtrA);
-                cursorB = parseAndStoreTemperature(semicolonB + 1, baseEntryPtrB);
-                cursorC = parseAndStoreTemperature(semicolonC + 1, baseEntryPtrC);
-                cursorD = parseAndStoreTemperature(semicolonD + 1, baseEntryPtrD);
+                cursorA = parseAndStoreTemperature(semicolonA + 1, baseEntryPtrA, temperatureWordA);
+                cursorB = parseAndStoreTemperature(semicolonB + 1, baseEntryPtrB, temperatureWordB);
+                cursorC = parseAndStoreTemperature(semicolonC + 1, baseEntryPtrC, temperatureWordC);
+                cursorD = parseAndStoreTemperature(semicolonD + 1, baseEntryPtrD, temperatureWordD);
             }
             doTail();
         }
 
         private long getOrCreateEntryBaseOffset(long semicolonA, long startA, int intHashA, long maskedWordA) {
-            int lenA = (int) (semicolonA - startA);
+            // hashSet.add(intHashA);
+            long lenLong = semicolonA - startA;
+            int lenA = (int) lenLong;
+
+            // assert lenA != 0;
+            // byte[] nameArr = new byte[lenA];
+            // for (int i = 0; i < lenA; i++) {
+            // nameArr[i] = UNSAFE.getByte(startA + i);
+            // }
+            // String nameStr = new String(nameArr);
+            // Integer oldHash = nameToHash.put(nameStr, intHashA);
+            // assert oldHash == null || oldHash == intHashA : "name: " + nameStr + ", old hash = " + oldHash + ", new hash = " + intHashA;
+
             long mapIndexA = intHashA & MAP_MASK;
+            // long clusterLen = 0;
             for (;;) {
                 long basePtr = mapIndexA * MAP_ENTRY_SIZE_BYTES + map;
                 long lenPtr = basePtr + LEN_OFFSET;
                 int len = UNSAFE.getInt(lenPtr);
-                if (len == 0) {
+                if (len == lenA) {
+                    if (nameMatch(startA, maskedWordA, basePtr, lenLong)) {
+                        // if (clusterLen > maxClusterLen) {
+                        // maxClusterLen = clusterLen;
+                        // System.out.println("max cluster len: " + clusterLen);
+                        // }
+                        return basePtr;
+                    }
+                }
+                else if (len == 0) {
                     // todo: uncommon branch maybe?
                     // empty slot
                     UNSAFE.copyMemory(semicolonA - lenA, basePtr + NAME_OFFSET, lenA);
                     UNSAFE.putInt(lenPtr, lenA);
+                    // todo: this could be a single putLong()
                     UNSAFE.putInt(basePtr + MAX_OFFSET, Integer.MIN_VALUE);
                     UNSAFE.putInt(basePtr + MIN_OFFSET, Integer.MAX_VALUE);
                     return basePtr;
                 }
-                if (len == lenA) {
-                    boolean match = true;
-                    long namePtr = basePtr + NAME_OFFSET;
-                    int fullLen = (len >> 3) << 3;
-                    long offset;
-                    // todo: this is worth exploring further.
-                    // @mtopolnik has an interesting algo with 2 unconditioned long loads: this is sufficient
-                    // for majority of names. so we would be left with just a single branch which is almost never taken?
-                    for (offset = 0; offset < fullLen; offset += 8) {
-                        match &= (UNSAFE.getLong(startA + offset) == UNSAFE.getLong(namePtr + offset));
-                    }
-
-                    long maskedWordInMap = UNSAFE.getLong(namePtr + offset);
-                    match &= (maskedWordInMap == maskedWordA);
+                mapIndexA = ++mapIndexA & MAP_MASK;
+                // clusterLen++;
+            }
+        }
 
-                    if (match) {
-                        return basePtr;
-                    }
+        private static boolean nameMatch(long startA, long maskedWordA, long basePtr, long len) {
+            long namePtr = basePtr + NAME_OFFSET;
+            long fullLen = len & ~7L;
+            long offset;
+
+            // todo: this is worth exploring further.
+            // @mtopolnik has an interesting algo with 2 unconditioned long loads: this is sufficient
+            // for majority of names. so we would be left with just a single branch which is almost never taken?
+            for (offset = 0; offset < fullLen; offset += 8) {
+                if (UNSAFE.getLong(startA + offset) != UNSAFE.getLong(namePtr + offset)) {
+                    return false;
                 }
-                mapIndexA = ++mapIndexA & MAP_MASK;
             }
+
+            long maskedWordInMap = UNSAFE.getLong(namePtr + fullLen);
+            return (maskedWordInMap == maskedWordA);
         }
     }
 

From 765583e7d89c7cc879d8e67158a228a78d4c2b71 Mon Sep 17 00:00:00 2001
From: zerninv <zerninvasilii@yandex.ru>
Date: Wed, 17 Jan 2024 17:35:22 +0000
Subject: [PATCH 049/268] improve equality check performance, use graal jvm
 (#454)

---
 prepare_zerninv.sh                            |  20 +++
 .../onebrc/CalculateAverage_zerninv.java      | 129 +++++++++---------
 2 files changed, 86 insertions(+), 63 deletions(-)
 create mode 100755 prepare_zerninv.sh

diff --git a/prepare_zerninv.sh b/prepare_zerninv.sh
new file mode 100755
index 000000000..cd3641e0e
--- /dev/null
+++ b/prepare_zerninv.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+# `source` is not part of POSIX sh, hence the bash shebang; the output of `sdk use` is sent to stderr.
+source "$HOME/.sdkman/bin/sdkman-init.sh"
+sdk use java 21.0.1-graal 1>&2
\ No newline at end of file
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_zerninv.java b/src/main/java/dev/morling/onebrc/CalculateAverage_zerninv.java
index 2e7ea4c1e..42cf6b827 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_zerninv.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_zerninv.java
@@ -31,39 +31,36 @@
 
 public class CalculateAverage_zerninv {
     private static final String FILE = "./measurements.txt";
-    private static final int L3_CACHE_SIZE = 128 * 1024 * 1024;
     private static final int CORES = Runtime.getRuntime().availableProcessors();
-    private static final int CHUNK_SIZE = (L3_CACHE_SIZE - MeasurementContainer.SIZE * MeasurementContainer.ENTRY_SIZE * CORES) / CORES - 1024 * CORES;
-
-    // #.##
-    private static final int THREE_DIGITS_MASK = 0x2e0000;
-    // #.#
-    private static final int TWO_DIGITS_MASK = 0x2e00;
-    // #.#-
-    private static final int TWO_NEGATIVE_DIGITS_MASK = 0x2e002d;
-    private static final int BYTE_MASK = 0xff;
-    private static final int ZERO = '0';
-
-    private static final byte DELIMITER = ';';
-    private static final byte LINE_SEPARATOR = '\n';
+    private static final int CHUNK_SIZE = 1024 * 1024 * 32;
 
     private static final Unsafe UNSAFE = initUnsafe();
 
+    private static Unsafe initUnsafe() {
+        try {
+            Field unsafe = Unsafe.class.getDeclaredField("theUnsafe");
+            unsafe.setAccessible(true);
+            return (Unsafe) unsafe.get(Unsafe.class);
+        }
+        catch (IllegalAccessException | NoSuchFieldException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
     public static void main(String[] args) throws IOException, InterruptedException {
         try (var channel = FileChannel.open(Path.of(FILE), StandardOpenOption.READ)) {
             var fileSize = channel.size();
             var minChunkSize = Math.min(fileSize, CHUNK_SIZE);
+            var segment = channel.map(FileChannel.MapMode.READ_ONLY, 0, fileSize, Arena.global());
 
             var tasks = new TaskThread[CORES];
             for (int i = 0; i < tasks.length; i++) {
                 tasks[i] = new TaskThread(new MeasurementContainer(), (int) (fileSize / minChunkSize / CORES + 1));
             }
 
-            var memorySegment = channel.map(FileChannel.MapMode.READ_ONLY, 0, fileSize, Arena.global());
-            var address = memorySegment.address();
-            var chunks = splitByChunks(address, address + fileSize, minChunkSize);
+            var chunks = splitByChunks(segment.address(), segment.address() + fileSize, minChunkSize);
             for (int i = 0; i < chunks.size() - 1; i++) {
-                var task = tasks[i % CORES];
+                var task = tasks[i % tasks.length];
                 task.addChunk(chunks.get(i), chunks.get(i + 1));
             }
 
@@ -93,23 +90,12 @@ public static void main(String[] args) throws IOException, InterruptedException
         }
     }
 
-    private static Unsafe initUnsafe() {
-        try {
-            Field unsafe = Unsafe.class.getDeclaredField("theUnsafe");
-            unsafe.setAccessible(true);
-            return (Unsafe) unsafe.get(Unsafe.class);
-        }
-        catch (IllegalAccessException | NoSuchFieldException e) {
-            throw new RuntimeException(e);
-        }
-    }
-
     private static List<Long> splitByChunks(long address, long end, long minChunkSize) {
         List<Long> result = new ArrayList<>((int) ((end - address) / minChunkSize + 1));
         result.add(address);
         while (address < end) {
             address += Math.min(end - address, minChunkSize);
-            while (address < end && UNSAFE.getByte(address++) != LINE_SEPARATOR) {
+            while (address < end && UNSAFE.getByte(address++) != '\n') {
             }
             result.add(address);
         }
@@ -141,7 +127,7 @@ public void merge(TemperatureAggregation o) {
 
         @Override
         public String toString() {
-            return String.format("%.1f/%.1f/%.1f", min / 10d, sum / 10d / count, max / 10d);
+            return min / 10d + "/" + Math.round(sum / 1d / count) / 10d + "/" + max / 10d;
         }
     }
 
@@ -149,55 +135,59 @@ private record Measurement(String station, TemperatureAggregation aggregation) {
     }
 
     private static final class MeasurementContainer {
-        private static final int SIZE = 1024 * 16;
+        private static final int SIZE = 1 << 17;
 
-        private static final int ENTRY_SIZE = 4 + 4 + 1 + 8 + 8 + 2 + 2;
+        private static final int ENTRY_SIZE = 4 + 4 + 8 + 1 + 8 + 8 + 2 + 2;
         private static final int COUNT_OFFSET = 0;
         private static final int HASH_OFFSET = 4;
-        private static final int SIZE_OFFSET = 8;
-        private static final int ADDRESS_OFFSET = 9;
-        private static final int SUM_OFFSET = 17;
-        private static final int MIN_OFFSET = 25;
-        private static final int MAX_OFFSET = 27;
+        private static final int LAST_BYTES_OFFSET = 8;
+        private static final int SIZE_OFFSET = 16;
+        private static final int ADDRESS_OFFSET = 17;
+        private static final int SUM_OFFSET = 25;
+        private static final int MIN_OFFSET = 33;
+        private static final int MAX_OFFSET = 35;
 
         private final long address;
 
         private MeasurementContainer() {
             address = UNSAFE.allocateMemory(ENTRY_SIZE * SIZE);
             UNSAFE.setMemory(address, ENTRY_SIZE * SIZE, (byte) 0);
-            for (long ptr = address; ptr < address + SIZE * ENTRY_SIZE; ptr += ENTRY_SIZE) {
-                UNSAFE.putShort(ptr + MIN_OFFSET, Short.MAX_VALUE);
-                UNSAFE.putShort(ptr + MAX_OFFSET, Short.MIN_VALUE);
-            }
         }
 
-        public void put(long address, byte size, int hash, short value) {
+        public void put(long address, byte size, int hash, long lastBytes, short value) {
             int idx = Math.abs(hash % SIZE);
             long ptr = this.address + idx * ENTRY_SIZE;
             int count;
+            boolean fastEqual;
 
             while ((count = UNSAFE.getInt(ptr + COUNT_OFFSET)) != 0) {
-                if (UNSAFE.getInt(ptr + HASH_OFFSET) == hash
-                        && UNSAFE.getByte(ptr + SIZE_OFFSET) == size
-                        && isEqual(UNSAFE.getLong(ptr + ADDRESS_OFFSET), address, size)) {
-                    break;
+                fastEqual = UNSAFE.getInt(ptr + HASH_OFFSET) == hash && UNSAFE.getLong(ptr + LAST_BYTES_OFFSET) == lastBytes;
+                if (fastEqual && UNSAFE.getByte(ptr + SIZE_OFFSET) == size && isEqual(UNSAFE.getLong(ptr + ADDRESS_OFFSET), address, size - 8)) {
+
+                    UNSAFE.putInt(ptr + COUNT_OFFSET, count + 1);
+                    UNSAFE.putLong(ptr + ADDRESS_OFFSET, address);
+                    UNSAFE.putLong(ptr + SUM_OFFSET, UNSAFE.getLong(ptr + SUM_OFFSET) + value);
+                    if (value < UNSAFE.getShort(ptr + MIN_OFFSET)) {
+                        UNSAFE.putShort(ptr + MIN_OFFSET, value);
+                    }
+                    if (value > UNSAFE.getShort(ptr + MAX_OFFSET)) {
+                        UNSAFE.putShort(ptr + MAX_OFFSET, value);
+                    }
+                    return;
                 }
                 idx = (idx + 1) % SIZE;
                 ptr = this.address + idx * ENTRY_SIZE;
             }
 
-            UNSAFE.putInt(ptr + COUNT_OFFSET, count + 1);
+            UNSAFE.putInt(ptr + COUNT_OFFSET, 1);
             UNSAFE.putInt(ptr + HASH_OFFSET, hash);
+            UNSAFE.putLong(ptr + LAST_BYTES_OFFSET, lastBytes);
             UNSAFE.putByte(ptr + SIZE_OFFSET, size);
             UNSAFE.putLong(ptr + ADDRESS_OFFSET, address);
 
-            UNSAFE.putLong(ptr + SUM_OFFSET, UNSAFE.getLong(ptr + SUM_OFFSET) + value);
-            if (value < UNSAFE.getShort(ptr + MIN_OFFSET)) {
-                UNSAFE.putShort(ptr + MIN_OFFSET, value);
-            }
-            if (value > UNSAFE.getShort(ptr + MAX_OFFSET)) {
-                UNSAFE.putShort(ptr + MAX_OFFSET, value);
-            }
+            UNSAFE.putLong(ptr + SUM_OFFSET, value);
+            UNSAFE.putShort(ptr + MIN_OFFSET, value);
+            UNSAFE.putShort(ptr + MAX_OFFSET, value);
         }
 
         public List<Measurement> measurements() {
@@ -207,21 +197,21 @@ public List<Measurement> measurements() {
                 long ptr = this.address + i * ENTRY_SIZE;
                 count = UNSAFE.getInt(ptr + COUNT_OFFSET);
                 if (count != 0) {
+                    var station = createString(UNSAFE.getLong(ptr + ADDRESS_OFFSET), UNSAFE.getByte(ptr + SIZE_OFFSET));
                     var measurements = new TemperatureAggregation(
                             UNSAFE.getLong(ptr + SUM_OFFSET),
                             count,
                             UNSAFE.getShort(ptr + MIN_OFFSET),
                             UNSAFE.getShort(ptr + MAX_OFFSET));
-                    var key = createString(UNSAFE.getLong(ptr + ADDRESS_OFFSET), UNSAFE.getByte(ptr + SIZE_OFFSET));
-                    result.add(new Measurement(key, measurements));
+                    result.add(new Measurement(station, measurements));
                 }
             }
             return result;
         }
 
-        private static boolean isEqual(long address, long address2, byte size) {
-            for (int i = 0; i < size; i++) {
-                if (UNSAFE.getByte(address + i) != UNSAFE.getByte(address2 + i)) {
+        private boolean isEqual(long address, long address2, int size) {
+            for (int i = 0; i < size; i += 8) {
+                if (UNSAFE.getLong(address + i) != UNSAFE.getLong(address2 + i)) {
                     return false;
                 }
             }
@@ -238,6 +228,17 @@ private String createString(long address, byte size) {
     }
 
     private static class TaskThread extends Thread {
+        // #.##
+        private static final int THREE_DIGITS_MASK = 0x2e0000;
+        // #.#
+        private static final int TWO_DIGITS_MASK = 0x2e00;
+        // #.#-
+        private static final int TWO_NEGATIVE_DIGITS_MASK = 0x2e002d;
+        private static final int BYTE_MASK = 0xff;
+
+        private static final int ZERO = '0';
+        private static final byte DELIMITER = ';';
+
         private final MeasurementContainer container;
         private final List<Long> begins;
         private final List<Long> ends;
@@ -265,15 +266,17 @@ public List<Measurement> measurements() {
         }
 
         private void calcForChunk(long offset, long end) {
-            long cityOffset;
+            long cityOffset, lastBytes;
             int hashCode, temperature, word;
             byte cityNameSize, b;
 
             while (offset < end) {
                 cityOffset = offset;
+                lastBytes = 0;
                 hashCode = 0;
                 while ((b = UNSAFE.getByte(offset++)) != DELIMITER) {
-                    hashCode = hashCode * 31 + b;
+                    hashCode += hashCode * 31 + b;
+                    lastBytes = (lastBytes << 8) | (b & 0xFFL); // mask: a negative byte would sign-extend and overwrite the bytes shifted in before it
                 }
                 cityNameSize = (byte) (offset - cityOffset - 1);
 
@@ -297,7 +300,7 @@ else if ((word & TWO_DIGITS_MASK) == TWO_DIGITS_MASK) {
                     temperature = ZERO * 111 - ((word & BYTE_MASK) * 100 + ((word >>> 8) & BYTE_MASK) * 10 + ((word >>> 24) & BYTE_MASK));
                 }
                 offset++;
-                container.put(cityOffset, cityNameSize, hashCode, (short) temperature);
+                container.put(cityOffset, cityNameSize, hashCode, lastBytes, (short) temperature);
             }
         }
     }

From e549efa3afbf2020d12c2ff84f6c23b9763b2cc1 Mon Sep 17 00:00:00 2001
From: Dr Ian Preston <ianopolous@protonmail.com>
Date: Wed, 17 Jan 2024 17:44:02 +0000
Subject: [PATCH 050/268] A fast implementation without unsafe (#462)

---
 calculate_average_ianopolousfast.sh           |  19 ++
 .../CalculateAverage_ianopolousfast.java      | 266 ++++++++++++++++++
 2 files changed, 285 insertions(+)
 create mode 100755 calculate_average_ianopolousfast.sh
 create mode 100644 src/main/java/dev/morling/onebrc/CalculateAverage_ianopolousfast.java

diff --git a/calculate_average_ianopolousfast.sh b/calculate_average_ianopolousfast.sh
new file mode 100755
index 000000000..e5c0977e0
--- /dev/null
+++ b/calculate_average_ianopolousfast.sh
@@ -0,0 +1,19 @@
+#!/bin/sh
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+JAVA_OPTS="--enable-preview"
+java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_ianopolousfast
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_ianopolousfast.java b/src/main/java/dev/morling/onebrc/CalculateAverage_ianopolousfast.java
new file mode 100644
index 000000000..a8c4e4cd1
--- /dev/null
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_ianopolousfast.java
@@ -0,0 +1,266 @@
+/*
+ *  Copyright 2023 The original authors
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package dev.morling.onebrc;
+
+import java.io.*;
+import java.nio.*;
+import java.nio.channels.*;
+import java.util.concurrent.*;
+import java.util.stream.*;
+import java.util.*;
+
+/* A fast implementation with no unsafe.
+ * Features:
+ * * memory mapped file
+ * * read chunks in parallel
+ * * minimise allocation
+ * * no unsafe
+ *
+ * Timings on 4 core i7-7500U CPU @ 2.70GHz:
+ * average_baseline: 4m48s
+ * ianopolous:         19s
+*/
+public class CalculateAverage_ianopolousfast {
+
+    public static final int MAX_LINE_LENGTH = 107;
+    public static final int MAX_STATIONS = 10_000;
+
+    /** Splits the input into chunks, parses them concurrently and prints the merged stats sorted by station name. */
+    public static void main(String[] args) throws Exception {
+        final long filesize = new File("./measurements.txt").length();
+        // 1/32 of the file (rounded up), clamped to [256 MB, 1 GB]; anything under 256 MB is a single chunk
+        final long chunkSize = Math.min(Math.max((filesize + 31) / 32, 256 * 1024 * 1024), 1024 * 1024 * 1024L);
+        final int nChunks = (int) ((filesize + chunkSize - 1) / chunkSize);
+        ExecutorService pool = Executors.newVirtualThreadPerTaskExecutor();
+        List<Future<List<List<Stat>>>> pending = IntStream.range(0, nChunks)
+                .mapToLong(chunk -> chunk * chunkSize)
+                .mapToObj(from -> pool.submit(() -> parseStats(from, Math.min(from + chunkSize, filesize))))
+                .toList();
+
+        TreeMap<String, Stat> merged = pending.stream().parallel()
+                .flatMap(future -> {
+                    try {
+                        return future.get().stream().filter(Objects::nonNull).flatMap(Collection::stream);
+                    }
+                    catch (Exception e) {
+                        e.printStackTrace();
+                        return Stream.empty();
+                    }
+                })
+                .collect(Collectors.toMap(Stat::name, stat -> stat, Stat::merge, TreeMap::new));
+        System.out.println(merged);
+    }
+
+    /** Returns true iff {@code buffer[start, end)} holds exactly the bytes of {@code existing.name}. */
+    public static boolean matchingStationBytes(int start, int end, ByteBuffer buffer, Stat existing) {
+        final byte[] expected = existing.name;
+        boolean same = end - start == expected.length;
+        for (int offset = 0; same && offset < expected.length; offset++) {
+            same = expected[offset] == buffer.get(start + offset);
+        }
+        return same;
+    }
+
+    /**
+     * Returns the {@link Stat} for the station name in {@code buffer[start, end)}, registering it on first sight.
+     * Copying a new name out of {@code buffer} moves the buffer's position.
+     * @param hash     name hash; {@code floorMod(hash ^ (hash >> 32), MAX_STATIONS)} selects the bucket
+     * @param stations bucket table; a {@code null} slot is replaced by a fresh list when first used
+     * @return the matching entry of the bucket, or the newly appended one
+     */
+    public static Stat dedupeStation(int start, int end, long hash, ByteBuffer buffer, List<List<Stat>> stations) {
+        final int slot = Math.floorMod(hash ^ (hash >> 32), MAX_STATIONS);
+        List<Stat> bucket = stations.get(slot);
+        if (bucket == null) {
+            bucket = new ArrayList<>();
+            stations.set(slot, bucket);
+        }
+        for (int i = 0, n = bucket.size(); i < n; i++) {
+            Stat candidate = bucket.get(i);
+            if (matchingStationBytes(start, end, buffer, candidate))
+                return candidate;
+        }
+        // nothing in the bucket matched: copy the name out and append a new entry
+        final byte[] nameBytes = new byte[end - start];
+        buffer.position(start);
+        buffer.get(nameBytes);
+        final Stat created = new Stat(nameBytes);
+        bucket.add(created);
+        return created;
+    }
+
+    public static int getSemicolon(long d) {
+        // Zero-byte search from Hacker's Delight page 92: index of the first ';' counted from the most significant byte, 8 if none
+        d = d ^ 0x3b3b3b3b3b3b3b3bL; // every byte equal to ';' (0x3b) becomes 0x00
+        long y = (d & 0x7f7f7f7f7f7f7f7fL) + 0x7f7f7f7f7f7f7f7fL; // bit 7 of a byte is now set iff its low 7 bits were non-zero
+        y = ~(y | d | 0x7f7f7f7f7f7f7f7fL); // leaves 0x80 exactly in the bytes that were zero, 0x00 elsewhere
+        return Long.numberOfLeadingZeros(y) >> 3; // bit position / 8 = byte index; numberOfLeadingZeros(0) is 64, giving 8
+    }
+
+    public static long updateHash(long hash, long x) {
+        return ((hash << 5) ^ x) * 0x517cc1b727220a95L; // fxHash: shift the running hash left by 5, xor in the next word, multiply by the constant
+    }
+
+    public static Stat parseStation(int lineStart, ByteBuffer buffer, List<List<Stat>> stations) {
+        // Find the ';' that ends the station name, hashing the name one long (8 bytes) at a time as we go.
+        long d = buffer.getLong(lineStart);
+
+        int semiIndex = getSemicolon(d); // 0..7 = position of ';' within d, 8 = no ';' in this word
+        int index = 0; // byte offset of the current word relative to lineStart
+        long hash = 0;
+        while (semiIndex == 8) {
+            hash = updateHash(hash, d);
+            index += 8;
+            d = buffer.getLong(lineStart + index);
+            semiIndex = getSemicolon(d);
+        }
+        // Keep only the semiIndex most significant bytes of the last word; assumes big-endian reads (ByteBuffer's default), so these precede the ';'.
+        d = d & (-1L << ((8 - semiIndex) * 8));
+        if (semiIndex > 0) { // with semiIndex == 0 the ';' is the first byte of the word, so no name bytes are left to hash
+            hash = updateHash(hash, d);
+        }
+        return dedupeStation(lineStart, lineStart + index + semiIndex, hash, buffer, stations); // name spans [lineStart, index of ';')
+    }
+
+    /**
+     * Parses a reading shaped {@code [-]d.d} or {@code [-]dd.d} and adds it, in tenths, to {@code station}.
+     *
+     * @param lineSplit index of the first byte of the reading (the sign or the first digit)
+     * @param buffer    buffer holding the line
+     * @param station   accumulator that receives the parsed value
+     * @return index two past the fractional digit (one past the byte that follows it)
+     */
+    public static int processTemperature(int lineSplit, MappedByteBuffer buffer, Stat station) {
+        int cursor = lineSplit;
+        final boolean negative = buffer.get(cursor) == '-';
+        if (negative)
+            cursor++; // step over the sign
+        int tenths = buffer.get(cursor++) - '0';
+        final byte second = buffer.get(cursor++);
+        if (second != '.') {
+            // two integer digits: fold in the second one and step over the next byte (presumably the '.') unread
+            tenths = tenths * 10 + (second - '0');
+            cursor++;
+        }
+        tenths = tenths * 10 + (buffer.get(cursor++) - '0');
+        station.add((short) (negative ? -tenths : tenths));
+        return cursor + 1;
+    }
+
+    /** Parses the lines of ./measurements.txt that start inside [startByte, endByte) into a fresh table of MAX_STATIONS buckets (null where unused). */
+    public static List<List<Stat>> parseStats(long startByte, long endByte) {
+        // closing the file is safe: a mapping stays valid after its channel has been closed
+        try (RandomAccessFile file = new RandomAccessFile("./measurements.txt", "r")) {
+            final long fileLength = file.length();
+            long maxEnd = Math.min(fileLength, endByte + MAX_LINE_LENGTH);
+            long len = maxEnd - startByte;
+            if (len > Integer.MAX_VALUE)
+                throw new RuntimeException("Segment size must fit into an int");
+            int maxDone = (int) (endByte - startByte);
+            MappedByteBuffer buffer = file.getChannel().map(FileChannel.MapMode.READ_ONLY, startByte, len);
+            int done = 0;
+            // Skip the leading partial line (the previous chunk reads it to its end), unless the byte before
+            // this chunk is a newline: that line starts exactly here and the previous chunk stops short of it.
+            file.seek(Math.max(startByte - 1, 0));
+            if (startByte > 0 && file.read() != '\n') {
+                for (int i = 0; i < MAX_LINE_LENGTH; i++) {
+                    byte b = buffer.get(i);
+                    if (b == '\n') {
+                        done = i + 1;
+                        break;
+                    }
+                }
+            }
+            List<List<Stat>> stations = new ArrayList<>(MAX_STATIONS);
+            for (int i = 0; i < MAX_STATIONS; i++)
+                stations.add(null);
+            // The very last line of the file is parsed from a zero-padded copy, so the main loop
+            // never has to worry about reading a long beyond the end (reducing branches). One line is
+            // enough: the min record size is 6 bytes, so the 2nd last record must be > 8 from the end.
+            if (endByte == fileLength) {
+                int offset = (int) (fileLength - startByte - 1);
+                while (buffer.get(offset) != '\n') // final new line
+                    offset--;
+                offset--;
+                while (offset > 0 && buffer.get(offset) != '\n') // end of second last line
+                    offset--;
+                maxDone = offset;
+                if (offset > 0)
+                    offset++;
+                // copy into a 8n sized buffer to avoid reading off end
+                int roundedSize = (int) (fileLength - startByte) - offset;
+                roundedSize = (roundedSize + 7) / 8 * 8;
+                byte[] end = new byte[roundedSize];
+                for (int i = offset; i < (int) (fileLength - startByte); i++)
+                    end[i - offset] = buffer.get(i);
+                Stat station = parseStation(0, ByteBuffer.wrap(end), stations);
+                processTemperature(offset + station.name.length + 1, buffer, station);
+            }
+
+            int lineStart = done;
+            while (lineStart < maxDone) {
+                Stat station = parseStation(lineStart, buffer, stations);
+                lineStart = processTemperature(lineStart + station.name.length + 1, buffer, station);
+            }
+            return stations;
+        }
+        catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    /** Running min/max/sum/count of one station's readings (in tenths), printed as {@code min/mean/max}. */
+    public static class Stat {
+        final byte[] name;
+        int count = 0;
+        short min = Short.MAX_VALUE, max = Short.MIN_VALUE;
+        long total = 0;
+
+        public Stat(byte[] name) {
+            this.name = name;
+        }
+
+        public void add(short value) {
+            if (value < min)
+                min = value;
+            if (value > max)
+                max = value;
+            total += value;
+            count++;
+        }
+
+        public Stat merge(Stat value) { // folds value into this instance, which is mutated in place and returned
+            min = (short) Math.min(min, value.min);
+            max = (short) Math.max(max, value.max);
+            total += value.total;
+            count += value.count;
+            return this;
+        }
+
+        private static double round(double value) {
+            return Math.round(value) / 10.0; // nearest whole tenth, then scaled to units
+        }
+
+        public String name() {
+            return new String(name, java.nio.charset.StandardCharsets.UTF_8); // explicit charset: do not depend on the platform default
+        }
+
+        @Override
+        public String toString() {
+            return round((double) min) + "/" + round(((double) total) / count) + "/" + round((double) max);
+        }
+    }
+}

From 27b9232b7d1475f76a3e43cb65e5e4eb84aaa1e3 Mon Sep 17 00:00:00 2001
From: gonix <d.giedrius+github@gmail.com>
Date: Wed, 17 Jan 2024 19:48:05 +0200
Subject: [PATCH 051/268] CalculateAverage_gonix update (#461)

Co-authored-by: Giedrius D <d.giedrius@gmail.com>
---
 .../onebrc/CalculateAverage_gonix.java        | 170 ++++++++----------
 1 file changed, 70 insertions(+), 100 deletions(-)

diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_gonix.java b/src/main/java/dev/morling/onebrc/CalculateAverage_gonix.java
index 8349d00a7..90f43601d 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_gonix.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_gonix.java
@@ -133,78 +133,65 @@ Aggregator processChunkLongs(ByteBuffer buf, int limit) {
 
             int start = pos;
             int hash = 0;
+            long tail = 0;
             while (true) {
-                // This is a bit ugly, but it is faster than reading by byte.
+                // Seen this trick used in multiple other solutions.
+                // Nice breakdown here: https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
                 long tmpLong = buf.getLong(pos);
-                if ((tmpLong & 0xFF) == ';') {
-                    break;
+                long match = tmpLong ^ 0x3B3B3B3B_3B3B3B3BL; // 3B == ';'
+                match = ((match - 0x01010101_01010101L) & (~match & 0x80808080_80808080L));
+                if (match == 0) {
+                    hash = ((33 * hash) ^ (int) (tmpLong & 0xFFFFFFFF)) + (int) ((tmpLong >>> 33) & 0xFFFFFFFF);
+                    pos += 8;
+                    continue;
                 }
-                if (((tmpLong >>> 8) & 0xFF) == ';') {
-                    hash = (33 * hash) ^ (int) (tmpLong & 0xFF);
-                    pos += 1;
-                    break;
-                }
-                if (((tmpLong >>> 16) & 0xFF) == ';') {
-                    hash = (33 * hash) ^ (int) (tmpLong & 0xFFFF);
-                    pos += 2;
-                    break;
-                }
-                if (((tmpLong >>> 24) & 0xFF) == ';') {
-                    hash = (33 * hash) ^ (int) (tmpLong & 0xFFFFFF);
-                    pos += 3;
-                    break;
-                }
-                if (((tmpLong >>> 32) & 0xFF) == ';') {
-                    hash = (33 * hash) ^ (int) (tmpLong & 0xFFFFFFFF);
-                    pos += 4;
-                    break;
-                }
-                if (((tmpLong >>> 40) & 0xFF) == ';') {
-                    hash = ((33 * hash) ^ (int) (tmpLong & 0xFFFFFFFF)) + (int) ((tmpLong >>> 33) & 0xFF);
-                    pos += 5;
-                    break;
-                }
-                if (((tmpLong >>> 48) & 0xFF) == ';') {
-                    hash = ((33 * hash) ^ (int) (tmpLong & 0xFFFFFFFF)) + (int) ((tmpLong >>> 33) & 0xFFFF);
-                    pos += 6;
-                    break;
-                }
-                if (((tmpLong >>> 56) & 0xFF) == ';') {
-                    hash = ((33 * hash) ^ (int) (tmpLong & 0xFFFFFFFF)) + (int) ((tmpLong >>> 33) & 0xFFFFFF);
-                    pos += 7;
-                    break;
-                }
-                hash = ((33 * hash) ^ (int) (tmpLong & 0xFFFFFFFF)) + (int) ((tmpLong >>> 33) & 0xFFFFFFFF);
-                pos += 8;
+
+                int tailBits = Long.numberOfTrailingZeros(match >>> 7);
+                long tailMask = ~(-1L << tailBits);
+                tail = tmpLong & tailMask;
+                hash = ((33 * hash) ^ (int) (tail & 0xFFFFFFFF)) + (int) ((tail >>> 33) & 0xFFFFFFFF);
+                pos += tailBits >> 3;
+                break;
             }
             hash = (33 * hash) ^ (hash >>> 15);
-            int len = pos - start;
-            assert (buf.get(pos) == ';') : "Expected ';'";
+            int lenInLongs = (pos - start) >> 3;
+            long tailAndLen = (tail << 8) | (lenInLongs & 0xFF);
+            // assert (buf.get(pos) == ';') : "Expected ';'";
             pos++;
 
             int measurement;
             {
+                // Seen this trick used in multiple other solutions.
+                // Looks like the original author is @merykitty.
                 long tmpLong = buf.getLong(pos);
-                int sign = 1;
-                if ((tmpLong & 0xFF) == '-') {
-                    sign = -1;
-                    tmpLong >>>= 8;
-                    pos++;
-                }
-                int value;
-                if (((tmpLong >>> 8) & 0xFF) == '.') {
-                    value = (int) (((tmpLong & 0xFF) - '0') * 10 + (((tmpLong >>> 16) & 0xFF) - '0'));
-                    pos += 4;
-                }
-                else {
-                    value = (int) (((tmpLong & 0xFF) - '0') * 100 + (((tmpLong >>> 8) & 0xFF) - '0') * 10 + (((tmpLong >>> 24) & 0xFF) - '0'));
-                    pos += 5;
-                }
-                measurement = sign * value;
+
+                // The 4th binary digit of the ascii of a digit is 1 while
+                // that of the '.' is 0. This finds the decimal separator
+                // The value can be 12, 20, 28
+                int decimalSepPos = Long.numberOfTrailingZeros(~tmpLong & 0x10101000);
+                int shift = 28 - decimalSepPos;
+                // signed is -1 if negative, 0 otherwise
+                long signed = (~tmpLong << 59) >> 63;
+                long designMask = ~(signed & 0xFF);
+                // Align the number to a specific position and transform the ascii code
+                // to actual digit value in each byte
+                long digits = ((tmpLong & designMask) << shift) & 0x0F000F0F00L;
+
+                // Now digits is in the form 0xUU00TTHH00 (UU: units digit, TT: tens digit, HH: hundreds digit)
+                // 0xUU00TTHH00 * (100 * 0x1000000 + 10 * 0x10000 + 1) =
+                // 0x000000UU00TTHH00 +
+                // 0x00UU00TTHH000000 * 10 +
+                // 0xUU00TTHH00000000 * 100
+                // Now TT * 100 has 2 trailing zeroes and HH * 100 + TT * 10 + UU < 0x400
+                // This results in our value lies in the bit 32 to 41 of this product
+                // That was close :)
+                long absValue = ((digits * 0x640a0001) >>> 32) & 0x3FF;
+                measurement = (int) ((absValue ^ signed) - signed);
+                pos += (decimalSepPos >>> 3) + 3;
             }
-            assert (buf.get(pos - 1) == '\n') : "Expected '\\n'";
+            // assert (buf.get(pos - 1) == '\n') : "Expected '\\n'";
 
-            add(buf, start, len, hash, measurement);
+            add(buf, start, tailAndLen, hash, measurement);
         }
 
         return this;
@@ -216,13 +203,13 @@ public Stream<Entry> stream() {
                 .mapToObj(offset -> new Entry(mem, offset));
     }
 
-    private void add(ByteBuffer buf, int start, int len, int hash, int measurement) {
+    private void add(ByteBuffer buf, int start, long tailAndLen, int hash, int measurement) {
         int idx = hash & INDEX_MASK;
         while (true) {
             if (index[idx] != 0) {
                 int offset = index[idx];
-                if (keyEqual(offset, buf, start, len)) {
-                    int pos = offset + (len >> 3) + 2;
+                if (keyEqual(offset, buf, start, tailAndLen)) {
+                    int pos = offset + (int) (tailAndLen & 0xFF) + 1;
                     mem[pos + FLD_MIN] = Math.min((int) measurement, (int) mem[pos + FLD_MIN]);
                     mem[pos + FLD_MAX] = Math.max((int) measurement, (int) mem[pos + FLD_MAX]);
                     mem[pos + FLD_SUM] += measurement;
@@ -231,39 +218,27 @@ private void add(ByteBuffer buf, int start, int len, int hash, int measurement)
                 }
             }
             else {
-                index[idx] = create(buf, start, len, hash, measurement);
+                index[idx] = create(buf, start, tailAndLen, hash, measurement);
                 return;
             }
             idx = (idx + 1) & INDEX_MASK;
         }
     }
 
-    private int create(ByteBuffer buf, int start, int len, int hash, int measurement) {
+    private int create(ByteBuffer buf, int start, long tailAndLen, int hash, int measurement) {
         int offset = memUsed;
 
-        mem[offset] = len;
+        mem[offset] = tailAndLen;
 
         int memPos = offset + 1;
-        int memEndEarly = memPos + (len >> 3);
+        int memEnd = memPos + (int) (tailAndLen & 0xFF);
         int bufPos = start;
-        int bufEnd = start + len;
-        while (memPos < memEndEarly) {
+        while (memPos < memEnd) {
             mem[memPos] = buf.getLong(bufPos);
             memPos += 1;
             bufPos += 8;
         }
-        if (bufPos < bufEnd) {
-            int shift = (8 - (len & 7)) << 3; // (8 - (len % 8)) * 8
-            long tmpLong = buf.getLong(bufPos) << shift >>> shift;
-            mem[memPos] = tmpLong;
-        }
-        else {
-            // "consume" extra long - makes math a bit simpler to calculate
-            // fields offset for update.
-            mem[memPos] = 0;
-        }
 
-        memPos += 1;
         mem[memPos + FLD_MIN] = measurement;
         mem[memPos + FLD_MAX] = measurement;
         mem[memPos + FLD_SUM] = measurement;
@@ -273,28 +248,21 @@ private int create(ByteBuffer buf, int start, int len, int hash, int measurement
         return offset;
     }
 
-    private boolean keyEqual(int offset, ByteBuffer buf, int start, int len) {
-        if (len != mem[offset]) {
+    private boolean keyEqual(int offset, ByteBuffer buf, int start, long tailAndLen) {
+
+        if (mem[offset] != tailAndLen) {
             return false;
         }
         int memPos = offset + 1;
-        int memEndEarly = memPos + (len >> 3);
+        int memEnd = memPos + (int) (tailAndLen & 0xFF);
         int bufPos = start;
-        int bufEnd = start + len;
-        while (memPos < memEndEarly) {
+        while (memPos < memEnd) {
             if (mem[memPos] != buf.getLong(bufPos)) {
                 return false;
             }
             memPos += 1;
             bufPos += 8;
         }
-        if (bufPos < bufEnd) {
-            int shift = (8 - (len & 7)) << 3; // (8 - (len % 8)) * 8
-            long tmpLong = buf.getLong(bufPos) << shift >>> shift;
-            if (mem[memPos] != tmpLong) {
-                return false;
-            }
-        }
         return true;
     }
 
@@ -311,19 +279,22 @@ public static class Entry {
         public String getKey() {
             if (key == null) {
                 int pos = this.offset;
-                int keyLen = (int) mem[pos++];
-                var tmpBuf = ByteBuffer.allocate(keyLen + 8).order(ByteOrder.nativeOrder());
-                for (int i = 0; i < keyLen; i += 8) {
+                long tailAndLen = mem[pos++];
+                int keyLen = (int) (tailAndLen & 0xFF);
+                var tmpBuf = ByteBuffer.allocate((keyLen << 3) + 8).order(ByteOrder.nativeOrder());
+                for (int i = 0; i < keyLen; i++) {
                     tmpBuf.putLong(mem[pos++]);
                 }
-                key = new String(tmpBuf.array(), 0, keyLen, StandardCharsets.UTF_8);
+                long tail = tailAndLen >>> 8;
+                tmpBuf.putLong(tail);
+                int keyLenBytes = (keyLen << 3) + 8 - (Long.numberOfLeadingZeros(tail) >> 3);
+                key = new String(tmpBuf.array(), 0, keyLenBytes, StandardCharsets.UTF_8);
             }
             return key;
         }
 
         public Entry add(Entry other) {
-            int keyLen = (int) mem[offset];
-            int fldOffset = (keyLen >> 3) + 2;
+            int fldOffset = (int) (mem[offset] & 0xFF) + 1;
             int pos = offset + fldOffset;
             int otherPos = other.offset + fldOffset;
             long[] otherMem = other.mem;
@@ -340,8 +311,7 @@ public Entry getValue() {
 
         @Override
         public String toString() {
-            int keyLen = (int) mem[offset];
-            int pos = offset + (keyLen >> 3) + 2;
+            int pos = offset + (int) (mem[offset] & 0xFF) + 1;
             return round(mem[pos + FLD_MIN])
                     + "/" + round(((double) mem[pos + FLD_SUM]) / mem[pos + FLD_COUNT])
                     + "/" + round(mem[pos + FLD_MAX]);

From 08541525cd582c11e4c5f6bdb9b2cc5581425023 Mon Sep 17 00:00:00 2001
From: MahmoudFawzyKhalil
 <73137611+MahmoudFawzyKhalil@users.noreply.github.com>
Date: Wed, 17 Jan 2024 22:15:34 +0200
Subject: [PATCH 052/268] MahmoudFawzyKhalil's implementation (#438)

* Initial commit trying out multiple things

* Clean up code

* Fix rounding error to fix failing test
---
 calculate_average_MahmoudFawzyKhalil.sh       |  19 ++
 .../CalculateAverage_MahmoudFawzyKhalil.java  | 190 ++++++++++++++++++
 2 files changed, 209 insertions(+)
 create mode 100755 calculate_average_MahmoudFawzyKhalil.sh
 create mode 100644 src/main/java/dev/morling/onebrc/CalculateAverage_MahmoudFawzyKhalil.java

diff --git a/calculate_average_MahmoudFawzyKhalil.sh b/calculate_average_MahmoudFawzyKhalil.sh
new file mode 100755
index 000000000..761d7e675
--- /dev/null
+++ b/calculate_average_MahmoudFawzyKhalil.sh
@@ -0,0 +1,19 @@
+#!/bin/sh
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+JAVA_OPTS="--enable-preview"
+java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_MahmoudFawzyKhalil
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_MahmoudFawzyKhalil.java b/src/main/java/dev/morling/onebrc/CalculateAverage_MahmoudFawzyKhalil.java
new file mode 100644
index 000000000..6eb426a15
--- /dev/null
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_MahmoudFawzyKhalil.java
@@ -0,0 +1,190 @@
+/*
+ *  Copyright 2023 The original authors
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package dev.morling.onebrc;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.RandomAccessFile;
+import java.lang.foreign.Arena;
+import java.lang.foreign.MemorySegment;
+import java.lang.foreign.ValueLayout;
+import java.nio.channels.FileChannel;
+import java.util.*;
+import java.util.concurrent.ForkJoinPool;
+
+// Solution using project Panama and Map Reduce
+public class CalculateAverage_MahmoudFawzyKhalil {
+
+    private static final String FILE = "./measurements.txt";
+
+    public static void main(String[] args) throws Exception { // args are ignored; the input is always FILE
+        mapReduce();
+    }
+
+    // Memory-maps the input, aggregates each chunk in parallel and prints the merged result sorted by station.
+    private static void mapReduce() throws IOException {
+        try (var raf = new RandomAccessFile(new File(FILE), "r")) {
+            FileChannel channel = raf.getChannel();
+            long fileSize = channel.size();
+            MemorySegment ms = channel.map(FileChannel.MapMode.READ_ONLY, 0, fileSize, Arena.global());
+            // Target chunk size is the file size divided by the common pool's parallelism.
+            long chunkSize = fileSize / ForkJoinPool.commonPool().getParallelism();
+            Map<String, MeasurementAggregate> result = getChunks(ms, chunkSize)
+                    .parallelStream()
+                    .map(chunk -> readChunkToMap(chunk, ms))
+                    .reduce(Collections.emptyMap(), CalculateAverage_MahmoudFawzyKhalil::combine);
+            System.out.println(new TreeMap<>(result));
+        }
+    }
+
+    private static List<Chunk> getChunks(MemorySegment ms, long chunkSize) { // chunks of ~chunkSize bytes; each Chunk.end is the offset of a '\n'
+        List<Chunk> chunks = new ArrayList<>(32);
+        long start = 0;
+        long fileSize = ms.byteSize();
+        // Clamp the first probe: with a parallelism of 1, chunkSize == fileSize and ms.get(fileSize) is out of bounds.
+        long end = Math.min(chunkSize, fileSize - 1);
+        while (start < fileSize) {
+            byte b = ms.get(ValueLayout.JAVA_BYTE, end);
+            if (b == '\n') {
+                chunks.add(new Chunk(start, end));
+                start = end + 1;
+                end = Math.min(end + chunkSize, fileSize - 2); // -2 plus the end++ below: never probe past the last byte
+            }
+            end++;
+        }
+        return chunks;
+    }
+
+    private static Map<String, MeasurementAggregate> readChunkToMap(Chunk chunk, MemorySegment ms) {
+        Map<String, MeasurementAggregate> map = new HashMap<>();
+        // Aggregates the chunk one "name;value\n" line at a time; start is an absolute offset into ms.
+        long start = chunk.start();
+        while (start < chunk.end()) {
+            long cityNameSize = 0;
+            while (ms.get(ValueLayout.JAVA_BYTE, start + cityNameSize) != ';') {
+                cityNameSize++;
+            }
+            // The name is everything before the ';'; the +1 below skips the separator itself.
+            String cityName = readString(ms, start, cityNameSize);
+            start = start + cityNameSize + 1;
+            // Same byte-by-byte scan for the reading, which runs up to the line's '\n'.
+            long temperatureSize = 0;
+            while (ms.get(ValueLayout.JAVA_BYTE, start + temperatureSize) != '\n') {
+                temperatureSize++;
+            }
+
+            String temperature = readString(ms, start, temperatureSize);
+            start = start + temperatureSize + 1;
+
+            // The reading is passed on as text; it is parsed inside MeasurementAggregate.
+            addMeasurement(map, cityName, temperature);
+        }
+
+        return map;
+    }
+
+    // Credit goes to imrafaelmerino for combine function
+    // Builds a new map holding the union of both key sets; the two input maps are not modified.
+    private static Map<String, MeasurementAggregate> combine(Map<String, MeasurementAggregate> xs, Map<String, MeasurementAggregate> ys) {
+        Map<String, MeasurementAggregate> merged = new HashMap<>();
+        for (var left : xs.entrySet()) {
+            var fromXs = left.getValue();
+            var fromYs = ys.get(left.getKey());
+            // combine() is only called when both sides carry an aggregate for this station.
+            var both = fromXs != null && fromYs != null;
+            merged.put(left.getKey(), both ? fromXs.combine(fromYs) : (fromYs == null ? fromXs : fromYs));
+        }
+        for (var right : ys.entrySet())
+            merged.putIfAbsent(right.getKey(), right.getValue());
+        return merged;
+    }
+
+    // Copies size bytes starting at offset start out of the mapped segment and decodes them into a String.
+    private static String readString(MemorySegment ms, long start, long size) {
+        MemorySegment slice = ms.asSlice(start, size);
+        return new String(slice.toArray(ValueLayout.JAVA_BYTE));
+    }
+
+    private static void addMeasurement(Map<String, MeasurementAggregate> measurements, String station, String reading) {
+        // A station's first reading creates its aggregate; every later reading updates that aggregate in place.
+        measurements.compute(station, (_, current) -> current != null ? current.update(reading) : MeasurementAggregate.of(reading));
+    }
+
+    record Chunk(long start, long end) { // byte range in the mapped file; as built by getChunks, end is the offset of the chunk's last '\n'
+    }
+
+    private static final class MeasurementAggregate { // min/max/sum/count of the readings seen for one station
+        private double min;
+        private double max;
+        private double sum;
+        private long count;
+        // Private: instances come from of() for a first reading or from combine() for a merge.
+        private MeasurementAggregate(double min, double max, double sum, long count) {
+            this.min = min;
+            this.max = max;
+            this.sum = sum;
+            this.count = count;
+        }
+        // Parses the reading text as a double and starts an aggregate holding just that value.
+        public static MeasurementAggregate of(String temperature) {
+            double measurement = Double.parseDouble(temperature);
+            return new MeasurementAggregate(measurement, measurement, measurement, 1);
+        }
+        // Value equality over all four fields; the doubles are compared by their bit patterns.
+        @Override
+        public boolean equals(Object obj) {
+            if (obj == this)
+                return true;
+            if (obj == null || obj.getClass() != this.getClass())
+                return false;
+            var that = (MeasurementAggregate) obj;
+            return Double.doubleToLongBits(this.min) == Double.doubleToLongBits(that.min) &&
+                    Double.doubleToLongBits(this.max) == Double.doubleToLongBits(that.max) &&
+                    Double.doubleToLongBits(this.sum) == Double.doubleToLongBits(that.sum) &&
+                    this.count == that.count;
+        }
+
+        @Override
+        public int hashCode() {
+            return Objects.hash(min, max, sum, count);
+        }
+        // Parses one more reading and folds it into this aggregate in place; returns this.
+        public MeasurementAggregate update(String part) {
+            double measurement = Double.parseDouble(part);
+            this.min = Math.min(this.min, measurement);
+            this.max = Math.max(this.max, measurement);
+            this.sum += measurement;
+            this.count++;
+            return this;
+        }
+        // Renders min/mean/max; sum is rounded to one decimal before the division and the mean is rounded again.
+        public String toString() {
+            return min + "/" + round(round(sum) / count) + "/" + max;
+        }
+        // Rounds to one decimal place by applying Math.round to the value scaled by 10.
+        private double round(double value) {
+            return Math.round(value * 10.0) / 10.0;
+        }
+        // Returns a new aggregate covering both operands; neither operand is modified.
+        public MeasurementAggregate combine(MeasurementAggregate m2) {
+            return new MeasurementAggregate(
+                    Math.min(this.min, m2.min),
+                    Math.max(this.max, m2.max),
+                    this.sum + m2.sum,
+                    this.count + m2.count);
+        }
+    }
+}

From aee71b961d8e9b799020d819e370edfc51c82576 Mon Sep 17 00:00:00 2001
From: Matteo Vaccari <vaccari@pobox.com>
Date: Wed, 17 Jan 2024 21:26:19 +0100
Subject: [PATCH 053/268] My own solution -- memory mapping the files, running
 in parallel threads, using a state machine to parse the file (#466)

* Golang implementation

* Speed up by avoiding copying the lines

* Memory mapping

* Add script for testing

* Now passing most of the tests

* Refactor to composed method

* Now using integer math throughout

* Now using a state machine for parsing!

* Refactoring state names

* Enabling profiling

* Running in parallel!

* Fully parallel!

* Refactor

* Improve type safety of methods

* The rounding problem is due to difference between Javas and Gos printf implementation

* Converting my solution to Java

* Merging results

* Splitting the file in several buffers

* Made it parallel!

* Removed test file

* Removed go implementation

* Removed unused files

* Add header to .sh file

---------

Co-authored-by: Matteo Vaccari <mvaccari@thoughtworks.com>
---
 calculate_average_xpmatteo.sh                 |  20 ++
 .../onebrc/CalculateAverage_xpmatteo.java     | 261 ++++++++++++++++++
 2 files changed, 281 insertions(+)
 create mode 100755 calculate_average_xpmatteo.sh
 create mode 100644 src/main/java/dev/morling/onebrc/CalculateAverage_xpmatteo.java

diff --git a/calculate_average_xpmatteo.sh b/calculate_average_xpmatteo.sh
new file mode 100755
index 000000000..d1cd87039
--- /dev/null
+++ b/calculate_average_xpmatteo.sh
@@ -0,0 +1,20 @@
+#!/bin/sh
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+JAVA_OPTS="--enable-preview"
+java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_xpmatteo
+
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_xpmatteo.java b/src/main/java/dev/morling/onebrc/CalculateAverage_xpmatteo.java
new file mode 100644
index 000000000..94904ffb4
--- /dev/null
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_xpmatteo.java
@@ -0,0 +1,261 @@
+/*
+ *  Copyright 2023 The original authors
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package dev.morling.onebrc;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.RandomAccessFile;
+import java.nio.ByteBuffer;
+import java.nio.channels.FileChannel;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Objects;
+import java.util.TreeMap;
+import java.util.stream.Collectors;
+
+@SuppressWarnings({ "ReassignedVariable", "StatementWithEmptyBody" })
+public class CalculateAverage_xpmatteo {
+
+    private static final String FILE = "./measurements.txt";
+
+    public static void main(String[] args) throws IOException, InterruptedException {
+        var fileName = dataFileName(args);
+        // One Worker thread per buffer returned by split(); results are merged once every worker has finished.
+        try (
+                var file = new RandomAccessFile(new File(fileName), "r");
+                var channel = file.getChannel()) {
+            var numCpus = Runtime.getRuntime().availableProcessors();
+            var threads = split(channel, numCpus).stream()
+                    .map(Worker::new)
+                    .toList();
+            threads.forEach(Thread::start);
+            for (Worker thread : threads) {
+                thread.join(); // wait for this worker before its results are read below
+            }
+            var results = threads.stream().map(Worker::getResults)
+                    .reduce(CalculateAverage_xpmatteo::merge) // merge() folds into its first argument and returns it
+                    .orElseThrow();
+            printCities(results);
+        }
+    }
+
+    public static class Worker extends Thread { // parses one buffer on its own thread
+        private final ByteBuffer buffer;
+        private Results results; // assigned at the end of run(); null before that
+        // Binds the worker to the buffer it will parse; parsing happens in run().
+        public Worker(ByteBuffer buffer) {
+            this.buffer = buffer;
+        }
+
+        @Override
+        public void run() {
+            this.results = parseData(this.buffer);
+        }
+        // Returns whatever run() stored; main() in this file reads it only after join().
+        public Results getResults() {
+            return results;
+        }
+    }
+
+    protected static List<ByteBuffer> split(FileChannel channel, int numCpus) throws IOException {
+        if (channel.size() < 10_000) { // files under 10,000 bytes are mapped as a single buffer
+            return List.of(channel.map(FileChannel.MapMode.READ_ONLY, 0, channel.size()));
+        }
+        // increments[i] is the start offset of slice i; the extra last slot holds the end of the file.
+        long[] increments = new long[numCpus + 1];
+        for (int i = 0; i < numCpus; i++) {
+            increments[i] = i * channel.size() / numCpus;
+            // walk back to just after the previous '\n' so that every slice starts at the beginning of a line
+            while (increments[i] > 0 && byteAt(channel, increments[i] - 1) != '\n') {
+                increments[i]--;
+            }
+        }
+        increments[numCpus] = channel.size();
+        // NOTE(review): FileChannel.map rejects sizes above Integer.MAX_VALUE, so each slice must stay under 2 GiB - confirm for large files on few CPUs.
+        List<ByteBuffer> result = new ArrayList<>(numCpus);
+        for (int i = 0; i < numCpus; i++) {
+            long from = increments[i];
+            long to = increments[i + 1];
+            result.add(channel.map(FileChannel.MapMode.READ_ONLY, from, to - from));
+        }
+        return result;
+    }
+
+    // Reads the single byte stored at the given file offset; this moves the channel's position.
+    private static byte byteAt(FileChannel channel, long offset) throws IOException {
+        ByteBuffer single = ByteBuffer.allocate(1);
+        channel.position(offset).read(single);
+        // flip() exposes the byte just read (if any) to the relative get() below.
+        single.flip();
+        // If nothing was read the buffer is empty and get() throws BufferUnderflowException.
+        return single.get();
+    }
+
+    public static String dataFileName(String[] args) {
+        if (args.length == 1) {
+            return args[0];
+        }
+        return FILE;
+    }
+
+    protected static byte[] readAllData(String fileName) throws IOException { // loads the whole file into one heap array
+        return Files.readAllBytes(Path.of(fileName));
+    }
+
+    protected static ByteBuffer memoryMap(String fileName) throws IOException {
+        try (RandomAccessFile file = new RandomAccessFile(new File(fileName), "r")) {
+            // Get file channel in read-only mode
+            FileChannel fileChannel = file.getChannel();
+            // The buffer is handed back after the try block closes the file; per the FileChannel.map contract the mapping stays valid.
+            return fileChannel.map(FileChannel.MapMode.READ_ONLY, 0, fileChannel.size());
+        }
+    }
+
+    protected enum State {
+        PARSING_CITY_NAME, // consuming name bytes until a ';' is seen
+        SKIPPING_SEMICOLON, // ';' was just consumed; a '-' or a digit opens the temperature
+        PARSING_TEMPERATURE // accumulating digits until the line's '\n'
+    }
+
+    protected static Results parseData(ByteBuffer data) { // state machine over "city;temperature\n" lines
+        var results = new Results();
+        var state = State.PARSING_CITY_NAME;
+        int cityStartOffset = 0, cityEndOffset = 0;
+        int temp = 0, sign = 0;
+        // Single pass; i is used as an absolute index while bytes come from relative get(), which assumes the buffer starts at position 0.
+        for (int i = 0; i < data.limit(); i++) {
+            byte currentChar = data.get();
+            if (state == State.PARSING_CITY_NAME && currentChar == ';') {
+                state = State.SKIPPING_SEMICOLON;
+                cityEndOffset = i;
+            }
+            else if (state == State.PARSING_CITY_NAME) {
+                // still inside the city name: nothing to record until the ';'
+            }
+            else if (state == State.SKIPPING_SEMICOLON && currentChar == '-') {
+                state = State.PARSING_TEMPERATURE;
+                temp = 0;
+                sign = -1;
+            }
+            else if (state == State.SKIPPING_SEMICOLON && currentChar >= '0' && currentChar <= '9') {
+                state = State.PARSING_TEMPERATURE;
+                temp = currentChar - '0';
+                sign = 1;
+            }
+            else if (state == State.PARSING_TEMPERATURE && currentChar >= '0' && currentChar <= '9') {
+                temp = temp * 10 + currentChar - '0';
+            }
+            else if (state == State.PARSING_TEMPERATURE && currentChar == '.') {
+                // the decimal point is dropped, so with one fractional digit temp is the reading times ten
+            }
+            else if (state == State.PARSING_TEMPERATURE && currentChar == '\n') {
+                byte[] bytes = new byte[cityEndOffset - cityStartOffset];
+                data.get(cityStartOffset, bytes); // absolute bulk get: does not move the buffer position
+                var cityName = new String(bytes);
+                accumulate(results, cityName, temp * sign);
+                state = State.PARSING_CITY_NAME;
+                cityStartOffset = i + 1;
+            }
+        }
+        // A final line without a trailing '\n' never reaches the last branch and is not accumulated.
+        return results;
+    }
+
+    // Folds one reading (tempTimesTen) into the entry for cityName, creating the entry on first sight.
+    private static void accumulate(Results results, String cityName, int tempTimesTen) {
+        CityData city = results.get(cityName);
+        if (city == null) {
+            results.put(cityName, new CityData(tempTimesTen, tempTimesTen, tempTimesTen, 1));
+            return;
+        }
+        city.count++;
+        city.sum += tempTimesTen;
+        city.min = Math.min(city.min, tempTimesTen);
+        city.max = Math.max(city.max, tempTimesTen);
+    }
+
+    // Folds every entry of b into a and returns a; a is mutated, and entries found only in b are shared by reference.
+    protected static Results merge(Results a, Results b) {
+        b.forEach((city, incoming) -> {
+            CityData target = a.get(city);
+            if (target == null) {
+                a.put(city, incoming);
+                return;
+            }
+            // City present on both sides: widen the range and add up the totals.
+            target.min = Math.min(target.min, incoming.min);
+            target.max = Math.max(target.max, incoming.max);
+            target.sum += incoming.sum;
+            target.count += incoming.count;
+        });
+
+        return a;
+    }
+
+    protected static class Results extends TreeMap<String, CityData> {
+        // No members of its own: a named type for the sorted city-name -> CityData map.
+    }
+
+    protected static class CityData { // mutable per-city aggregate; accumulate() supplies temperatures multiplied by ten
+        int min, max, count;
+        // 64-bit on purpose: a sum of tenths over many rows can exceed Integer.MAX_VALUE, which would wrap and corrupt the mean.
+        long sum;
+
+        public CityData(int min, long sum, int max, int count) {
+            this.min = min;
+            this.sum = sum;
+            this.max = max;
+            this.count = count;
+        }
+
+        @Override
+        public boolean equals(Object o) {
+            if (o == null || getClass() != o.getClass())
+                return false;
+            CityData cityData = (CityData) o;
+            return min == cityData.min && sum == cityData.sum && max == cityData.max && count == cityData.count;
+        }
+
+        @Override
+        public int hashCode() {
+            return Objects.hash(min, sum, max, count);
+        }
+
+        @Override
+        public String toString() {
+            return STR."CityData{min=\{min}, sum=\{sum}, max=\{max}, count=\{count}\{'}'}";
+        }
+    }
+
+    // Prints {city=min/mean/max, ...} on one line: ", " only between entries, then a newline.
+    protected static void printCities(Results cities) {
+        var line = new StringBuilder("{");
+        var separator = "";
+        for (var entry : cities.entrySet()) {
+            CityData data = entry.getValue();
+            var min = data.min / 10.0;
+            var mean = (data.sum * 10.0 / data.count) / 100.0;
+            var max = data.max / 10.0;
+            // Locale.ROOT keeps '.' as the decimal separator whatever the default locale is.
+            line.append(separator)
+                    .append(String.format(java.util.Locale.ROOT, "%s=%.1f/%.1f/%.1f", entry.getKey(), min, mean, max));
+            separator = ", ";
+        }
+        System.out.println(line.append("}"));
+    }
+}

From 199d6415bb66b5a3fb3e77958a056d1e4adcef5c Mon Sep 17 00:00:00 2001
From: Vemana <vemana.github@gmail.com>
Date: Thu, 18 Jan 2024 02:00:31 +0530
Subject: [PATCH 054/268] 10% improvement from parallelizing munmap(); jumps to
 around 12th from 16th based on local testing; no Unsafe; no bitwise tricks
 yet (#465)

* Squashing a bunch of commits together.

Commit#2; Uplift of 7% using native byteorder from ByteBuffer.
Commit#1: Minor changes to formatting.

* Commit #4: Parallelize munmap() and reduce completion time further by
10%. As the jvm exits with exit(0) syscall, the kernel reclaims the
memory mappings via munmap() call. Prior to this change, all the unmap()
calls were happening right at the end as the JVM exited. This led to
serial execution of about 350ms out of 2500 ms right at the end after
each shard completed its work. We can parallelize it by exposing the
Cleaner from MappedByteBuffer and then ensure that it is truly parallel
execution of munmap() by using a non-blocking lock (SeqLock). The
optimal strategy for when each thread must call unmap() is an interesting math problem with an exact solution and this code roughly reflects it.

Commit #3: Tried out reading long at a time from bytebuffer and
checking for presence of ';'.. it was slower compared to just reading int().
Removed the code for reading longs; just retaining the
hasSemicolonByte(..) check code

Commit #2: Introduce processLineSlow() and processRangeSlow() for the
tail part.

Commit #1: Create a separate tail piece of work for the last few lines to be
processed separately from the main loop. This allows the main loop to
read past its allocated range (by a 'long' if we reserve at least 8 bytes
for the tail piece of work.)
---
 calculate_average_vemana.sh                   |   2 +
 .../onebrc/CalculateAverage_vemana.java       | 496 +++++++++++++++---
 2 files changed, 428 insertions(+), 70 deletions(-)

diff --git a/calculate_average_vemana.sh b/calculate_average_vemana.sh
index b3437f208..06a911a21 100755
--- a/calculate_average_vemana.sh
+++ b/calculate_average_vemana.sh
@@ -18,6 +18,8 @@
 # Basics
 JAVA_OPTS=""
 JAVA_OPTS="$JAVA_OPTS --enable-preview"
+JAVA_OPTS="$JAVA_OPTS --add-exports java.base/jdk.internal.ref=ALL-UNNAMED"
+JAVA_OPTS="$JAVA_OPTS --add-opens java.base/java.nio=ALL-UNNAMED"
 #JAVA_OPTS="$JAVA_OPTS --add-modules jdk.incubator.vector"
 #JAVA_OPTS="$JAVA_OPTS -XX:+UnlockDiagnosticVMOptions"
 
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_vemana.java b/src/main/java/dev/morling/onebrc/CalculateAverage_vemana.java
index d4f0a2fb8..8f690e349 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_vemana.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_vemana.java
@@ -17,21 +17,27 @@
 
 import java.io.IOException;
 import java.io.RandomAccessFile;
+import java.lang.reflect.Method;
+import java.nio.ByteBuffer;
 import java.nio.ByteOrder;
 import java.nio.MappedByteBuffer;
 import java.nio.channels.FileChannel.MapMode;
 import java.nio.file.Path;
 import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.LinkedHashMap;
 import java.util.List;
 import java.util.Map;
 import java.util.Map.Entry;
+import java.util.Optional;
 import java.util.TreeMap;
 import java.util.concurrent.Callable;
 import java.util.concurrent.ExecutionException;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Executors;
 import java.util.concurrent.Future;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicBoolean;
 import java.util.concurrent.atomic.AtomicLong;
 import java.util.stream.Collectors;
 
@@ -46,6 +52,20 @@
  * potential run-time variance (i.e. std. deviation) penalties based on informal testing. Variance
  * is not ideal when trying to minimize the maximum worker latency.
  *
+ * <p>[Understand that unmapping is serial and runs in exit()]. This is very much about exploiting
+ * parallelism. After adding tracing (plain old printfs), it was clear that the JVM was taking 400ms
+ * (out of 1500ms) just to exit. Turns out that the kernel tries to unmap all the mappings as part
+ * of the exit() call. Even strace wouldn't report this because the unmapping is running as part of
+ * the exit() call. perf stat barely hinted at it, but we had more insights by actually running a
+ * couple of experiments: reduce touched pages --> JVM shutdown latency went down; manually run
+ * unmap() call to free up the ByteBuffers --> parallel execution doesn't help at all. From this it
+ * was conclusive that unmap() executes serially and the 400ms was being spent purely unmapping.
+ * Now, the challenge is to both (1) unmap a MappedByteBuffer (no such methods exposed) from code
+ * rather than via exit() syscall and (2) do it in parallel without causing lock contention. For 1,
+ * use Reflection and (2) is an interesting math problem with a provably optimal solution.
+ * Parallelism in munmap() is achieved by using a fast lock that prevents two threads from
+ * simultaneously cleaning (i.e. munmap()) the ByteBuffer.
+ *
  * <p>[Use ByteBuffers over MemorySegment] Each Shard is further divided in Chunks. This would've
  * been unnecessary except that Shards are too big to be backed by ByteBuffers. Besides,
  * MemorySegment appears slower than ByteBuffers. So, to use ByteBuffers, we have to use smaller
@@ -138,6 +158,14 @@
 public class CalculateAverage_vemana {
 
     public static void main(String[] args) throws Exception {
+        Tracing.recordAppStart();
+        Runtime.getRuntime()
+                .addShutdownHook(
+                        new Thread(
+                                () -> {
+                                    Tracing.recordEvent("In Shutdown hook");
+                                }));
+
         // First process in large chunks without coordination among threads
         // Use chunkSizeBits for the large-chunk size
         int chunkSizeBits = 20;
@@ -151,20 +179,32 @@ public static void main(String[] args) throws Exception {
         // Size of the hashtable (attempt to fit in L3)
         int hashtableSizeBits = 14;
 
-        if (args.length > 0) {
-            chunkSizeBits = Integer.parseInt(args[0]);
-        }
+        int minReservedBytesAtFileTail = 9;
 
-        if (args.length > 1) {
-            commonChunkFraction = Double.parseDouble(args[1]);
-        }
+        String inputFile = "measurements.txt";
 
-        if (args.length > 2) {
-            commonChunkSizeBits = Integer.parseInt(args[2]);
-        }
-
-        if (args.length > 3) {
-            hashtableSizeBits = Integer.parseInt(args[3]);
+        for (String arg : args) {
+            String key = arg.substring(0, arg.indexOf('='));
+            String value = arg.substring(key.length() + 1);
+            switch (key) {
+                case "chunkSizeBits":
+                    chunkSizeBits = Integer.parseInt(value);
+                    break;
+                case "commonChunkFraction":
+                    commonChunkFraction = Double.parseDouble(value);
+                    break;
+                case "commonChunkSizeBits":
+                    commonChunkSizeBits = Integer.parseInt(value);
+                    break;
+                case "hashtableSizeBits":
+                    hashtableSizeBits = Integer.parseInt(value);
+                    break;
+                case "inputfile":
+                    inputFile = value;
+                    break;
+                default:
+                    throw new IllegalArgumentException("Unknown argument: " + arg);
+            }
         }
 
         // System.err.println(STR."""
@@ -177,12 +217,15 @@ public static void main(String[] args) throws Exception {
 
         System.out.println(
                 new Runner(
-                        Path.of("measurements.txt"),
+                        Path.of(inputFile),
                         chunkSizeBits,
                         commonChunkFraction,
                         commonChunkSizeBits,
-                        hashtableSizeBits)
+                        hashtableSizeBits,
+                        minReservedBytesAtFileTail)
                                 .getSummaryStatistics());
+
+        Tracing.recordEvent("After printing result");
     }
 
   public record AggregateResult(Map<String, Stat> tempStats) {
@@ -202,7 +245,9 @@ public static class ByteRange {
         private static final int BUF_SIZE = 1 << 30;
 
         private final long fileSize;
+        private final long maxEndPos; // Treat as if the file ends here
         private final RandomAccessFile raf;
+        private final List<MappedByteBuffer> unclosedBuffers = new ArrayList<>();
 
         // ***************** What this is doing and why *****************
         // Reading from ByteBuffer appears faster from MemorySegment, but ByteBuffer can only be
@@ -220,7 +265,6 @@ public static class ByteRange {
         // tuning
         // - This enables (relatively) allocation free chunking implementation. Our chunking impl uses
         // fine grained chunking for the last say X% of work to avoid being hostage to stragglers
-
         // The PUBLIC API
         public MappedByteBuffer byteBuffer;
         public int endInBuf; // where the chunk ends inside the buffer
@@ -230,8 +274,9 @@ public static class ByteRange {
         private long bufferStart; // byteBuffer's begin coordinate
 
         // Uninitialized; for mutability
-        public ByteRange(RandomAccessFile raf) {
+        public ByteRange(RandomAccessFile raf, long maxEndPos) {
             this.raf = raf;
+            this.maxEndPos = maxEndPos;
             try {
                 this.fileSize = raf.length();
             }
@@ -241,6 +286,20 @@ public ByteRange(RandomAccessFile raf) {
             bufferEnd = bufferStart = -1;
         }
 
+        public void close(int shardIdx) {
+            Tracing.recordWorkStart("cleaner", shardIdx);
+            if (byteBuffer != null) {
+                unclosedBuffers.add(byteBuffer);
+            }
+            for (MappedByteBuffer buf : unclosedBuffers) {
+                close(buf);
+            }
+            unclosedBuffers.clear();
+            bufferEnd = bufferStart = -1;
+            byteBuffer = null;
+            Tracing.recordWorkEnd("cleaner", shardIdx);
+        }
+
         public void setRange(long rangeStart, long rangeEnd) {
             if (rangeEnd + 1024 > bufferEnd || rangeStart < bufferStart) {
                 bufferStart = rangeStart;
@@ -251,12 +310,15 @@ public void setRange(long rangeStart, long rangeEnd) {
             if (rangeStart > 0) {
                 rangeStart = 1 + nextNewLine(rangeStart);
             }
+            else {
+                rangeStart = 0;
+            }
 
-            if (rangeEnd < fileSize) {
+            if (rangeEnd < maxEndPos) {
                 rangeEnd = 1 + nextNewLine(rangeEnd);
             }
             else {
-                rangeEnd = fileSize;
+                rangeEnd = maxEndPos;
             }
 
             startInBuf = (int) (rangeStart - bufferStart);
@@ -267,12 +329,24 @@ public void setRange(long rangeStart, long rangeEnd) {
     public String toString() {
       return STR."""
         ByteRange {
+          bufferStart = \{bufferStart}
+          bufferEnd = \{bufferEnd}
           startInBuf = \{startInBuf}
           endInBuf = \{endInBuf}
         }
         """;
     }
 
+        private void close(MappedByteBuffer buffer) {
+            Method cleanerMethod = Reflection.findMethodNamed(buffer, "cleaner");
+            cleanerMethod.setAccessible(true);
+            Object cleaner = Reflection.invoke(buffer, cleanerMethod);
+
+            Method cleanMethod = Reflection.findMethodNamed(cleaner, "clean");
+            cleanMethod.setAccessible(true);
+            Reflection.invoke(cleaner, cleanMethod);
+        }
+
         private long nextNewLine(long pos) {
             int nextPos = (int) (pos - bufferStart);
             while (byteBuffer.get(nextPos) != '\n') {
@@ -282,6 +356,9 @@ private long nextNewLine(long pos) {
         }
 
         private void setByteBufferToRange(long start, long end) {
+            if (byteBuffer != null) {
+                unclosedBuffers.add(byteBuffer);
+            }
             try {
                 byteBuffer = raf.getChannel().map(MapMode.READ_ONLY, start, end - start);
                 byteBuffer.order(ByteOrder.nativeOrder());
@@ -306,15 +383,41 @@ private Checks() {
 
     public interface LazyShardQueue {
 
+        void close(int shardIdx);
+
+        Optional<ByteRange> fileTailEndWork(int idx);
+
         ByteRange take(int shardIdx);
     }
 
+    static final class Reflection {
+
+        static Method findMethodNamed(Object object, String name, Class... paramTypes) {
+            try {
+                return object.getClass().getMethod(name, paramTypes);
+            }
+            catch (NoSuchMethodException e) {
+                throw new RuntimeException(e);
+            }
+        }
+
+        static Object invoke(Object receiver, Method method, Object... params) {
+            try {
+                return method.invoke(receiver, params);
+            }
+            catch (Exception e) {
+                throw new RuntimeException(e);
+            }
+        }
+    }
+
     public static class Runner {
 
         private final double commonChunkFraction;
         private final int commonChunkSizeBits;
         private final int hashtableSizeBits;
         private final Path inputFile;
+        private final int minReservedBytesAtFileTail;
         private final int shardSizeBits;
 
         public Runner(
@@ -322,45 +425,74 @@ public Runner(
                       int chunkSizeBits,
                       double commonChunkFraction,
                       int commonChunkSizeBits,
-                      int hashtableSizeBits) {
+                      int hashtableSizeBits,
+                      int minReservedBytesAtFileTail) {
             this.inputFile = inputFile;
             this.shardSizeBits = chunkSizeBits;
             this.commonChunkFraction = commonChunkFraction;
             this.commonChunkSizeBits = commonChunkSizeBits;
             this.hashtableSizeBits = hashtableSizeBits;
+            this.minReservedBytesAtFileTail = minReservedBytesAtFileTail;
         }
 
         AggregateResult getSummaryStatistics() throws Exception {
-            int processors = Runtime.getRuntime().availableProcessors();
+            int nThreads = Runtime.getRuntime().availableProcessors();
             LazyShardQueue shardQueue = new SerialLazyShardQueue(
-                    1L << shardSizeBits, inputFile, processors, commonChunkFraction, commonChunkSizeBits);
+                    1L << shardSizeBits,
+                    inputFile,
+                    nThreads,
+                    commonChunkFraction,
+                    commonChunkSizeBits,
+                    minReservedBytesAtFileTail);
 
             List<Future<AggregateResult>> results = new ArrayList<>();
             ExecutorService executorService = Executors.newFixedThreadPool(
-                    processors,
+                    nThreads,
                     runnable -> {
                         Thread thread = new Thread(runnable);
                         thread.setDaemon(true);
                         return thread;
                     });
 
-            long[] finishTimes = new long[processors];
-
-            for (int i = 0; i < processors; i++) {
-                final int I = i;
+            for (int i = 0; i < nThreads; i++) {
+                final int shardIdx = i;
                 final Callable<AggregateResult> callable = () -> {
-                    AggregateResult result = new ShardProcessor(shardQueue, hashtableSizeBits, I).processShard();
-                    finishTimes[I] = System.nanoTime();
+                    Tracing.recordWorkStart("shard", shardIdx);
+                    AggregateResult result = new ShardProcessor(shardQueue, hashtableSizeBits, shardIdx).processShard();
+                    Tracing.recordWorkEnd("shard", shardIdx);
                     return result;
                 };
                 results.add(executorService.submit(callable));
             }
-            // printFinishTimes(finishTimes);
-            return executorService.submit(() -> merge(results)).get();
+            Tracing.recordEvent("Basic push time");
+
+            AggregateResult result = executorService.submit(() -> merge(results)).get();
+
+            Tracing.recordEvent("Merge results received");
+
+            // Note that munmap() is serial and not parallel
+            executorService.submit(
+                    () -> {
+                        for (int i = 0; i < nThreads; i++) {
+                            shardQueue.close(i);
+                        }
+                    });
+
+            Tracing.recordEvent("Waiting for executor shutdown");
+
+            executorService.shutdown();
+            executorService.awaitTermination(Long.MAX_VALUE, TimeUnit.MILLISECONDS);
+
+            Tracing.recordEvent("Executor terminated");
+            Tracing.analyzeWorkThreads("cleaner", nThreads);
+            Tracing.recordEvent("After cleaner finish printed");
+
+            return result;
         }
 
         private AggregateResult merge(List<Future<AggregateResult>> results)
                 throws ExecutionException, InterruptedException {
+            Tracing.recordEvent("Merge start time");
             Map<String, Stat> output = null;
             boolean[] isDone = new boolean[results.size()];
             int remaining = results.size();
@@ -383,51 +515,55 @@ private AggregateResult merge(List<Future<AggregateResult>> results)
                     }
                 }
             }
+            Tracing.recordEvent("Merge end time");
+            Tracing.analyzeWorkThreads("shard", results.size());
             return new AggregateResult(output);
         }
-
-    private void printFinishTimes(long[] finishTimes) {
-      Arrays.sort(finishTimes);
-      int n = finishTimes.length;
-      System.err.println(
-          STR."Finish Delta: \{(finishTimes[n - 1] - finishTimes[0]) / 1_000_000}ms");
-    }
     }
 
     public static class SerialLazyShardQueue implements LazyShardQueue {
 
-        private static long roundToNearestHigherMultipleOf(long divisor, long value) {
-            return (value + divisor - 1) / divisor * divisor;
+        private static long roundToNearestLowerMultipleOf(long divisor, long value) {
+            return value / divisor * divisor;
         }
 
         private final ByteRange[] byteRanges;
         private final long chunkSize;
         private final long commonChunkSize;
         private final AtomicLong commonPool;
+        private final long effectiveFileSize;
         private final long fileSize;
-        private final long[] nextStarts;
+        private final long[] perThreadData;
+        private final RandomAccessFile raf;
+        private final SeqLock seqLock;
 
         public SerialLazyShardQueue(
                                     long chunkSize,
                                     Path filePath,
                                     int shards,
                                     double commonChunkFraction,
-                                    int commonChunkSizeBits)
+                                    int commonChunkSizeBits,
+                                    int fileTailReservedBytes)
                 throws IOException {
             Checks.checkArg(commonChunkFraction < 0.9 && commonChunkFraction >= 0);
-            var raf = new RandomAccessFile(filePath.toFile(), "r");
+            Checks.checkArg(fileTailReservedBytes >= 0);
+            this.raf = new RandomAccessFile(filePath.toFile(), "r");
             this.fileSize = raf.length();
+            fileTailReservedBytes = fileTailReservedBytes == 0
+                    ? 0
+                    : consumeToPreviousNewLineExclusive(raf, fileTailReservedBytes);
+            this.effectiveFileSize = fileSize - fileTailReservedBytes;
 
             // Common pool
             long commonPoolStart = Math.min(
-                    roundToNearestHigherMultipleOf(
-                            chunkSize, (long) (fileSize * (1 - commonChunkFraction))),
-                    fileSize);
+                    roundToNearestLowerMultipleOf(
+                            chunkSize, (long) (effectiveFileSize * (1 - commonChunkFraction))),
+                    effectiveFileSize);
             this.commonPool = new AtomicLong(commonPoolStart);
             this.commonChunkSize = 1L << commonChunkSizeBits;
 
             // Distribute chunks to shards
-            this.nextStarts = new long[shards << 4]; // thread idx -> 16*idx to avoid cache line conflict
+            this.perThreadData = new long[shards << 4]; // thread idx -> 16*idx to avoid cache line conflict
             for (long i = 0,
                     currentStart = 0,
                     remainingChunks = (commonPoolStart + chunkSize - 1) / chunkSize; i < shards; i++) {
@@ -435,8 +571,17 @@ public SerialLazyShardQueue(
                 long currentChunks = (remainingChunks + remainingShards - 1) / remainingShards;
                 // Shard i handles: [currentStart, currentStart + currentChunks * chunkSize)
                 int pos = (int) i << 4;
-                nextStarts[pos] = currentStart;
-                nextStarts[pos + 1] = currentStart + currentChunks * chunkSize;
+                perThreadData[pos] = currentStart; // next chunk begin
+                perThreadData[pos + 1] = currentStart + currentChunks * chunkSize; // shard end
+                perThreadData[pos + 2] = currentChunks; // active chunks remaining
+                // threshold: once remaining chunks drop to this, the shard tries to unmap
+                // 0.03 is a practical number but the optimal strategy is this:
+                // Shard number N (1-based) should unmap as soon as it completes (R/(R+1))^N fraction of
+                // its work, where R = relative speed of unmap compared to the computation.
+                // For our problem, R ~ 75 because unmap unmaps 30GB/sec (but, it is serial) while
+                // cores go through data at the rate of 400MB/sec.
+                perThreadData[pos + 3] = (long) (currentChunks * (0.03 * (shards - i)));
+                perThreadData[pos + 4] = 1;
                 currentStart += currentChunks * chunkSize;
                 remainingChunks -= currentChunks;
             }
@@ -444,53 +589,128 @@ public SerialLazyShardQueue(
 
             this.byteRanges = new ByteRange[shards << 4];
             for (int i = 0; i < shards; i++) {
-                byteRanges[i << 4] = new ByteRange(raf);
+                byteRanges[i << 4] = new ByteRange(raf, effectiveFileSize);
             }
+
+            this.seqLock = new SeqLock();
         }
 
         @Override
-        public ByteRange take(int idx) {
-            // Try for thread local range
-            final int pos = idx << 4;
-            long rangeStart = nextStarts[pos];
-            final long chunkEnd = nextStarts[pos + 1];
+        public void close(int shardIdx) {
+            byteRanges[shardIdx << 4].close(shardIdx);
+        }
 
+        @Override
+        public Optional<ByteRange> fileTailEndWork(int idx) {
+            if (idx == 0 && effectiveFileSize < fileSize) {
+                ByteRange chunk = new ByteRange(raf, fileSize);
+                chunk.setRange(
+                        effectiveFileSize == 0 ? 0 : effectiveFileSize - 1 /* will consume newline at eFS-1 */,
+                        fileSize);
+                return Optional.of(chunk);
+            }
+            return Optional.empty();
+        }
+
+        @Override
+        public ByteRange take(int shardIdx) {
+            // Try for thread local range
+            final int pos = shardIdx << 4;
+            long rangeStart = perThreadData[pos];
+            final long chunkEnd = perThreadData[pos + 1];
             final long rangeEnd;
 
             if (rangeStart < chunkEnd) {
                 rangeEnd = rangeStart + chunkSize;
-                nextStarts[pos] = rangeEnd;
+                perThreadData[pos] = rangeEnd;
+                perThreadData[pos + 2]--;
             }
             else {
                 rangeStart = commonPool.getAndAdd(commonChunkSize);
                 // If that's exhausted too, nothing remains!
-                if (rangeStart >= fileSize) {
+                if (rangeStart >= effectiveFileSize) {
                     return null;
                 }
                 rangeEnd = rangeStart + commonChunkSize;
             }
 
+            if (perThreadData[pos + 2] <= perThreadData[pos + 3] && perThreadData[pos + 4] > 0) {
+                if (attemptClose(shardIdx)) {
+                    perThreadData[pos + 4]--;
+                }
+            }
+
             ByteRange chunk = byteRanges[pos];
             chunk.setRange(rangeStart, rangeEnd);
             return chunk;
         }
+
+        private boolean attemptClose(int shardIdx) {
+            if (seqLock.acquire()) {
+                byteRanges[shardIdx << 4].close(shardIdx);
+                seqLock.release();
+                return true;
+            }
+            return false;
+        }
+
+        private int consumeToPreviousNewLineExclusive(RandomAccessFile raf, int minReservedBytes) {
+            try {
+                long pos = Math.max(raf.length() - minReservedBytes - 1, -1);
+                if (pos < 0) {
+                    return (int) raf.length();
+                }
+
+                long start = Math.max(pos - 512, 0);
+                ByteBuffer buf = raf.getChannel().map(MapMode.READ_ONLY, start, pos + 1 - start);
+                while (pos >= 0 && buf.get((int) (pos - start)) != '\n') {
+                    pos--;
+                }
+                pos++;
+                return (int) (raf.length() - pos);
+            }
+            catch (Exception e) {
+                throw new RuntimeException(e);
+            }
+        }
+    }
+
+    /** A low-traffic non-blocking lock. */
+    static class SeqLock {
+
+        private final AtomicBoolean isOccupied = new AtomicBoolean(false);
+
+        boolean acquire() {
+            return !isOccupied.get() && isOccupied.compareAndSet(false, true);
+        }
+
+        void release() {
+            isOccupied.set(false);
+        }
     }
 
     public static class ShardProcessor {
 
+        private final int shardIdx;
         private final LazyShardQueue shardQueue;
         private final ShardProcessorState state;
-        private final int threadIdx;
 
-        public ShardProcessor(LazyShardQueue shardQueue, int hashtableSizeBits, int threadIdx) {
+        public ShardProcessor(LazyShardQueue shardQueue, int hashtableSizeBits, int shardIdx) {
             this.shardQueue = shardQueue;
-            this.threadIdx = threadIdx;
+            this.shardIdx = shardIdx;
             this.state = new ShardProcessorState(hashtableSizeBits);
         }
 
         public AggregateResult processShard() {
+            return processShardReal();
+        }
+
+        public AggregateResult processShardReal() {
+            // First process the file tail work to give ourselves freedom to go past ranges in parsing
+            shardQueue.fileTailEndWork(shardIdx).ifPresent(this::processRangeSlow);
+
             ByteRange range;
-            while ((range = shardQueue.take(threadIdx)) != null) {
+            while ((range = shardQueue.take(shardIdx)) != null) {
                 processRange(range);
             }
             return result();
@@ -506,6 +726,13 @@ private void processRange(ByteRange range) {
             }
         }
 
+        private void processRangeSlow(ByteRange range) {
+            int nextPos = range.startInBuf;
+            while (nextPos < range.endInBuf) {
+                nextPos = state.processLineSlow(range.byteBuffer, nextPos);
+            }
+        }
+
         private AggregateResult result() {
             return state.result();
         }
@@ -513,8 +740,9 @@ private AggregateResult result() {
 
     public static class ShardProcessorState {
 
+        public static final long ONE_MASK = 0x0101010101010101L;
         private static final ByteOrder NATIVE_BYTE_ORDER = ByteOrder.nativeOrder();
-
+        private static final long SEMICOLON_MASK = 0x3b3b3b3b3b3b3b3bL;
         private final byte[][] cityNames;
         private final int slotsMask;
         private final Stat[] stats;
@@ -545,21 +773,21 @@ public int processLine(MappedByteBuffer mmb, int nextPos) {
                 byte b = (byte) (x >>> 8);
                 if (b == ';') {
                     nextPos += 2;
-                    hash = hash * 31 + ((0xFF & x));
+                    hash = hash * 31 + (0xFF & x);
                     break;
                 }
 
                 byte c = (byte) (x >>> 16);
                 if (c == ';') {
                     nextPos += 3;
-                    hash = hash * 31 + ((0xFFFF & x));
+                    hash = hash * 31 + (0xFFFF & x);
                     break;
                 }
 
                 byte d = (byte) (x >>> 24);
                 if (d == ';') {
                     nextPos += 4;
-                    hash = hash * 31 + ((0xFFFFFF & x));
+                    hash = hash * 31 + (0xFFFFFF & x);
                     break;
                 }
 
@@ -596,9 +824,47 @@ public int processLine(MappedByteBuffer mmb, int nextPos) {
             return nextPos;
         }
 
+        /** A slow version which is used only for the tail part of the file. */
+        public int processLineSlow(MappedByteBuffer mmb, int nextPos) {
+            int originalPos = nextPos;
+            byte nextByte;
+            int hash = 0;
+
+            outer: while (true) {
+                int accumulated = 0;
+                for (int i = 0; i < 4; i++) {
+                    nextByte = mmb.get(nextPos++);
+                    if (nextByte == ';') {
+                        if (i > 0) {
+                            hash = hash * 31 + accumulated;
+                        }
+                        break outer;
+                    }
+                    else {
+                        accumulated |= ((int) nextByte << (8 * i));
+                    }
+                }
+                hash = hash * 31 + accumulated;
+            }
+            int cityLen = nextPos - 1 - originalPos;
+
+            int temperature = 0;
+            boolean negative = mmb.get(nextPos) == '-';
+            while ((nextByte = mmb.get(nextPos++)) != '\n') {
+                if (nextByte != '-' && nextByte != '.') {
+                    temperature = temperature * 10 + (nextByte - '0');
+                }
+            }
+
+            linearProbe(
+                    cityLen, hash & slotsMask, negative ? -temperature : temperature, mmb, originalPos);
+
+            return nextPos;
+        }
+
         public AggregateResult result() {
             int N = stats.length;
-            TreeMap<String, Stat> map = new TreeMap<>();
+            Map<String, Stat> map = new LinkedHashMap<>(5_000);
             for (int i = 0; i < N; i++) {
                 if (stats[i] != null) {
                     map.put(new String(cityNames[i]), stats[i]);
@@ -624,6 +890,11 @@ private boolean equals(byte[] left, MappedByteBuffer right, int offsetInMmb, int
             return true;
         }
 
+        private boolean hasSemicolonByte(long value) {
+            long a = value ^ SEMICOLON_MASK;
+            return (((a - ONE_MASK) & ~a) & (0x8080808080808080L)) != 0;
+        }
+
         private void linearProbe(int len, int hash, int temp, MappedByteBuffer mmb, int offsetInMmb) {
             for (int i = hash;; i = (i + 1) & slotsMask) {
                 var curBytes = cityNames[i];
@@ -633,11 +904,6 @@ private void linearProbe(int len, int hash, int temp, MappedByteBuffer mmb, int
                     return;
                 }
                 else {
-                    // Overall, this tradeoff seems better than Arrays.equals(..)
-                    // City name param is encoded as (mmb, offsetnInMmb, len)
-                    // This avoids copying it into a (previously allocated) byte[]
-                    // The downside is that we have to manually implement 'equals' and it can lose out
-                    // to vectorized 'equals'; but the trade off seems to work in this particular case
                     if (len == curBytes.length && equals(curBytes, mmb, offsetInMmb, len)) {
                         stats[i].mergeReading(temp);
                         return;
@@ -695,4 +961,94 @@ public String toString() {
             return "%.1f/%.1f/%.1f".formatted(min / 10.0, sum / 10.0 / count, max / 10.0);
         }
     }
+
+    static class Tracing {
+        // Stderr-only timing helper: per-thread start/end System.nanoTime() stamps are kept in the two arrays below.
+        private static final long[] cleanerTimes = new long[1 << 6 << 1]; // 128 slots = (start, end) pairs for thread ids 0..63
+        private static final long[] threadTimes = new long[1 << 6 << 1]; // same layout; returned for id "shard"
+        private static long startTime; // nanoTime baseline written by recordAppStart(); 0 until then
+
+        static void analyzeWorkThreads(String id, int nThreads) { // prints a summary of the first nThreads pairs recorded under id
+            printTimingsAnalysis(id + " Stats", nThreads, timingsArray(id));
+        }
+
+        static void recordAppStart() { // sets the baseline the relative millisecond values are measured from
+            startTime = System.nanoTime();
+        }
+
+        static void recordEvent(String event) { // prints immediately; nothing is stored
+            printEvent(event, System.nanoTime());
+        }
+
+        static void recordWorkEnd(String id, int threadId) { // odd slot of the pair holds the end stamp
+            timingsArray(id)[2 * threadId + 1] = System.nanoTime();
+        }
+
+        static void recordWorkStart(String id, int threadId) { // even slot of the pair holds the start stamp
+            timingsArray(id)[2 * threadId] = System.nanoTime();
+        }
+
+        /////////////////////////////////////////////////////////////////////////////////////////////////
+
+        private static void errPrint(String message) { // single sink for all tracing output
+            System.err.println(message);
+        }
+
+    private static void printEvent(String message, long nanoTime) { // "<message> = <ms since startTime>ms"
+      errPrint(STR."\{message} = \{(nanoTime - startTime) / 1_000_000}ms");
+    }
+
+    private static void printTimingsAnalysis(String header, int nThreads, long[] timestamps) { // timestamps: (start, end) nanoTime pairs
+      long minDuration = Long.MAX_VALUE, maxDuration = Long.MIN_VALUE; // extremes are tracked in ns and divided down when printed
+      long minBegin = Long.MAX_VALUE, maxCompletion = Long.MIN_VALUE;
+      long maxBegin = Long.MIN_VALUE, minCompletion = Long.MAX_VALUE;
+
+      long[] durationsMs = new long[nThreads];
+      long[] completionsMs = new long[nThreads];
+      long[] beginMs = new long[nThreads];
+      for (int i = 0; i < nThreads; i++) { // pair i: start at index 2*i, end at index 2*i+1
+        long durationNs = timestamps[2 * i + 1] - timestamps[2 * i];
+        durationsMs[i] = durationNs / 1_000_000;
+        completionsMs[i] = (timestamps[2 * i + 1] - startTime) / 1_000_000; // relative to startTime
+        beginMs[i] = (timestamps[2 * i] - startTime) / 1_000_000; // relative to startTime
+
+        minDuration = Math.min(minDuration, durationNs);
+        maxDuration = Math.max(maxDuration, durationNs);
+
+        minBegin = Math.min(minBegin, timestamps[2 * i]);
+        maxBegin = Math.max(maxBegin, timestamps[2 * i]);
+
+        maxCompletion = Math.max(maxCompletion, timestamps[2 * i + 1]);
+        minCompletion = Math.min(minCompletion, timestamps[2 * i + 1]);
+      }
+      errPrint(
+          STR."""
+        -------------------------------------------------------------------------------------------
+                                       \{header}
+        -------------------------------------------------------------------------------------------
+        Max duration                              = \{maxDuration / 1_000_000} ms
+        Min duration                              = \{minDuration / 1_000_000} ms
+        Timespan[max(end)-min(start)]             = \{(maxCompletion - minBegin) / 1_000_000} ms
+        Completion Timespan[max(end)-min(end)]    = \{(maxCompletion - minCompletion) / 1_000_000} ms
+        Begin Timespan[max(begin)-min(begin)]     = \{(maxBegin - minBegin) / 1_000_000} ms
+        Durations                                 = \{toString(durationsMs)} in ms
+        Begin Timestamps                          = \{toString(beginMs)} in ms
+        Completion Timestamps                     = \{toString(completionsMs)} in ms
+        """);
+    }
+
+        private static long[] timingsArray(String id) { // maps a tracing id to its timestamp buffer
+            return switch (id) {
+                case "cleaner" -> cleanerTimes;
+                case "shard" -> threadTimes;
+                default -> throw new IllegalArgumentException("Unknown tracing id: " + id); // name the offending id instead of an empty message
+            };
+        }
+
+        private static String toString(long[] array) { // "[ v0, v1, ... ]" with every value right-aligned to width 6
+            return Arrays.stream(array)
+                    .mapToObj(x -> String.format("%6d", x))
+                    .collect(Collectors.joining(", ", "[ ", " ]"));
+        }
+    }
 }

From 4d7d9fb34e1b580d562d769c885acfcc9c46bae5 Mon Sep 17 00:00:00 2001
From: John Ziamos <iziamos@gmail.com>
Date: Wed, 17 Jan 2024 20:41:32 +0000
Subject: [PATCH 055/268] extract cursor interface (#458)

'pull' merykitty's number parsing code

try out a tonne of flags (found via trial and error on my system)
---
 calculate_average_iziamos.sh                  |  8 +-
 .../onebrc/CalculateAverage_iziamos.java      | 89 +++++++++----------
 2 files changed, 50 insertions(+), 47 deletions(-)

diff --git a/calculate_average_iziamos.sh b/calculate_average_iziamos.sh
index 7ce3ff1ad..755dddc77 100755
--- a/calculate_average_iziamos.sh
+++ b/calculate_average_iziamos.sh
@@ -15,5 +15,11 @@
 #  limitations under the License.
 #
 
-JAVA_OPTS="--enable-preview --add-modules=jdk.incubator.vector -Djdk.incubator.vector.VECTOR_ACCESS_OOB_CHECK=0 -XX:+UnlockExperimentalVMOptions -XX:+UseEpsilonGC -Xms16m -Xmx16m -XX:-AlwaysPreTouch -XX:-TieredCompilation -XX:CICompilerCount=1"
+JAVA_OPTS="--enable-preview
+  -XX:+UnlockExperimentalVMOptions \
+  -XX:+UseEpsilonGC -Xms16m -Xmx16m -XX:-AlwaysPreTouch \
+  -XX:-TieredCompilation -XX:CICompilerCount=1 -XX:CompilationMode=high-only \
+  -XX:C1MaxTrivialSize=500 -XX:-UseCountedLoopSafepoints -XX:+UseCMoveUnconditionally -XX:+DisableAttachMechanism \
+  -XX:-PreserveFramePointer -Xnoclassgc -disablesystemassertions -XX:-UsePerfData  \
+  -XX:-UseTransparentHugePages -XX:-UseCompressedOops"
 java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_iziamos
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_iziamos.java b/src/main/java/dev/morling/onebrc/CalculateAverage_iziamos.java
index c0358b9c4..ad2bf052b 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_iziamos.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_iziamos.java
@@ -21,6 +21,7 @@
 import java.lang.foreign.Arena;
 import java.lang.foreign.MemorySegment;
 import java.lang.reflect.Field;
+import java.nio.ByteOrder;
 import java.nio.channels.FileChannel;
 import java.nio.file.Files;
 import java.nio.file.Path;
@@ -28,7 +29,6 @@
 import java.util.TreeMap;
 import java.util.concurrent.CompletableFuture;
 
-import static dev.morling.onebrc.CalculateAverage_iziamos.ByteBackedResultSet.mask;
 import static java.nio.channels.FileChannel.MapMode.READ_ONLY;
 import static java.nio.charset.StandardCharsets.UTF_8;
 import static java.nio.file.StandardOpenOption.READ;
@@ -61,6 +61,7 @@ public class CalculateAverage_iziamos {
         BASE_POINTER = WHOLE_FILE_SEGMENT.address();
         END_POINTER = BASE_POINTER + FILE_SIZE;
     }
+
     private static final long CHUNK_SIZE = 64 * 1024 * 1024;
     // private static final long CHUNK_SIZE = Long.MAX_VALUE;
 
@@ -141,7 +142,7 @@ private static long processEvents(final long start, final long limit) {
     }
 
     private static void scalarLoop(final long start, final long limit, final long result) {
-        final var cursor = new ScalarLoopCursor(start, limit);
+        final LoopCursor cursor = new ScalarLoopCursor(start, limit);
         while (cursor.hasMore()) {
             final long address = cursor.getCurrentAddress();
             final int length = cursor.getStringLength();
@@ -151,7 +152,19 @@ private static void scalarLoop(final long start, final long limit, final long re
         }
     }
 
-    public static class ScalarLoopCursor {
+    public interface LoopCursor { // cursor abstraction consumed by scalarLoop; ScalarLoopCursor implements it
+        long getCurrentAddress(); // NOTE(review): presumably the address of the current record's name - confirm against ScalarLoopCursor
+
+        int getStringLength(); // NOTE(review): presumably the byte length of the current name - confirm against ScalarLoopCursor
+
+        int getHash(); // ScalarLoopCursor returns the raw hash; findSlot applies mask(hash) itself
+
+        int getCurrentValue(); // ScalarLoopCursor's implementation also advances its pointer past the parsed number
+
+        boolean hasMore(); // loop condition of scalarLoop
+    }
+
+    public static class ScalarLoopCursor implements LoopCursor {
         private long pointer;
         private final long limit;
 
@@ -180,41 +193,35 @@ public int getStringLength() {
         }
 
         public int getHash() {
-            return mask(hash);
+            return hash;
         }
 
         public int getCurrentValue() {
-            final byte first = UNSAFE.getByte(pointer++);
-            final byte second = UNSAFE.getByte(pointer++);
-            final byte third = UNSAFE.getByte(pointer++);
-            final byte fourth = UNSAFE.getByte(pointer++);
-            final byte fifth = UNSAFE.getByte(pointer++);
-
-            int value;
-            if (second == '.') {
-                // D.D\n
-                value = appendDigit(digitCharToInt(first), third);
-                pointer--;
-                return value;
-            }
-            else if (fourth == '.') {
-                // -DD.D\n
-                value = digitCharToInt(second);
-                value = appendDigit(value, third);
-                value = -appendDigit(value, fifth);
-                pointer++;
-                return value;
-            }
-            else if (first == '-') {
-                // -D.D\n
-                return -appendDigit(digitCharToInt(second), fourth);
-            }
-            else {
-                // DD.D\n
-                value = digitCharToInt(first);
-                value = appendDigit(value, second);
-                return appendDigit(value, fourth);
+            return getCurrentValueMeryKitty();
+        }
+
+        /**
+         * Number parser pulled from merykitty's entry ("no point rewriting what would essentially be the same code <3"): reads 8 bytes at the pointer, returns the digits as an int (e.g. "-12.3" gives -123) and advances the pointer.
+         */
+        public int getCurrentValueMeryKitty() {
+            long word = UNSAFE.getLong(pointer); // 8 raw bytes starting at the first character of the number
+            if (ByteOrder.nativeOrder() == ByteOrder.BIG_ENDIAN) {
+                word = Long.reverseBytes(word); // the bit positions below assume byte 0 is the lowest byte
             }
+
+            int decimalSepPos = Long.numberOfTrailingZeros(~word & 0x10101000); // bit 4 is set in ASCII digits and clear in '.'; tests bytes 1..3, giving 12, 20 or 28
+            int shift = 28 - decimalSepPos; // moves the '.' to byte 3
+
+            long signed = (~word << 59) >> 63; // all ones when bit 4 of byte 0 is clear (as in '-'), otherwise 0
+            long designMask = ~(signed & 0xFF); // clears byte 0 (the '-') when negative
+
+            long digits = ((word & designMask) << shift) & 0x0F000F0F00L; // keeps the low nibbles of bytes 1, 2 and 4: the three digit positions
+
+            long absValue = ((digits * 0x640a0001) >>> 32) & 0x3FF; // multiplier is 100 * 2^24 + 10 * 2^16 + 1, so 100*a + 10*b + c lands in bits 32..41
+            int increment = (decimalSepPos >>> 3) + 3; // index of '.' plus 3: the number's length plus one trailing byte
+
+            pointer += increment;
+            return (int) ((absValue ^ signed) - signed); // negates when signed == -1, identity when signed == 0
         }
 
         public boolean hasMore() {
@@ -222,22 +229,12 @@ public boolean hasMore() {
         }
     }
 
-    private static int appendDigit(int value, final byte b) {
-        value *= 10;
-        value += digitCharToInt(b);
-        return value;
-    }
-
-    private static int digitCharToInt(final byte b) {
-        return b - '0';
-    }
-
     public interface ResultConsumer {
         void consume(final String name, final int min, final int max, final long sum, final long count);
     }
 
     static class ByteBackedResultSet {
-        private static final int MAP_SIZE = 16384;
+        private static final int MAP_SIZE = 16384 * 4;
         private static final int MASK = MAP_SIZE - 1;
         private static final long STRUCT_SIZE = 64;
         private static final long BYTE_SIZE = MAP_SIZE * STRUCT_SIZE;
@@ -338,7 +335,7 @@ private static int findSlot(final long baseAddress,
                                     final long otherStringAddress,
                                     final int otherStringLength) {
 
-            for (int slot = hash;; slot = mask(++slot)) {
+            for (int slot = mask(hash);; slot = mask(++slot)) {
                 final long structBase = baseAddress + ((long) slot * STRUCT_SIZE);
                 final long nameStart = UNSAFE.getLong(structBase);
                 if (nameStart == 0) {

From 4e445a4f564479a3116687d3fdfb55ade81a039d Mon Sep 17 00:00:00 2001
From: Juan Parera <1420988+jparera@users.noreply.github.com>
Date: Wed, 17 Jan 2024 21:52:33 +0100
Subject: [PATCH 056/268] jparera's initial implementation (#433)

* jparera's initial implementation

* Fixes bugs and improves performance for measurements3.txt

* Allows measurements.txt ending without a LF
---
 calculate_average_jparera.sh                  |  19 +
 .../onebrc/CalculateAverage_jparera.java      | 351 ++++++++++++++++++
 2 files changed, 370 insertions(+)
 create mode 100755 calculate_average_jparera.sh
 create mode 100644 src/main/java/dev/morling/onebrc/CalculateAverage_jparera.java

diff --git a/calculate_average_jparera.sh b/calculate_average_jparera.sh
new file mode 100755
index 000000000..4c7a9e7d3
--- /dev/null
+++ b/calculate_average_jparera.sh
@@ -0,0 +1,19 @@
+#!/bin/sh
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+JAVA_OPTS="--enable-preview --add-modules=jdk.incubator.vector -XX:-TieredCompilation"
+java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_jparera
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_jparera.java b/src/main/java/dev/morling/onebrc/CalculateAverage_jparera.java
new file mode 100644
index 000000000..13252550a
--- /dev/null
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_jparera.java
@@ -0,0 +1,351 @@
+/*
+ *  Copyright 2023 The original authors
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package dev.morling.onebrc;
+
+import java.io.IOException;
+import java.lang.foreign.Arena;
+import java.lang.foreign.MemorySegment;
+import java.lang.foreign.ValueLayout;
+import java.nio.ByteOrder;
+import java.nio.channels.FileChannel;
+import java.nio.channels.FileChannel.MapMode;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Path;
+import java.nio.file.StandardOpenOption;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+import java.util.TreeMap;
+import java.util.function.Function;
+import java.util.stream.Collectors;
+
+import jdk.incubator.vector.ByteVector;
+import jdk.incubator.vector.VectorSpecies;
+
+public class CalculateAverage_jparera {
+    private static final String FILE = "./measurements.txt";
+
+    private static final VectorSpecies<Byte> BYTE_SPECIES = ByteVector.SPECIES_PREFERRED;
+
+    private static final int BYTE_SPECIES_SIZE = BYTE_SPECIES.vectorByteSize();
+
+    private static final int BYTE_SPECIES_LANES = BYTE_SPECIES.length();
+
+    private static final ValueLayout.OfLong LONG_U_LE = ValueLayout.JAVA_LONG_UNALIGNED
+            .withOrder(ByteOrder.LITTLE_ENDIAN);
+
+    public static void main(String[] args) throws IOException { // maps FILE, parses it chunk by chunk in parallel, prints the merged map
+        try (var fc = FileChannel.open(Path.of(FILE), StandardOpenOption.READ)) {
+            try (var arena = Arena.ofShared()) { // shared arena: the mapping is read from the parallel stream's worker threads
+                var fs = fc.map(MapMode.READ_ONLY, 0, fc.size(), arena);
+                var map = chunks(fs)
+                        .parallelStream()
+                        .map(Chunk::parse) // each chunk returns the entries of its own table
+                        .flatMap(List::stream)
+                        .collect(Collectors.toMap(
+                                Entry::key,
+                                Function.identity(),
+                                Entry::merge, // entries with the same key coming from different chunks are combined
+                                TreeMap::new)); // TreeMap keeps the keys sorted for printing
+                System.out.println(map);
+            }
+        }
+    }
+
+    private static Collection<Chunk> chunks(MemorySegment ms) { // splits ms into roughly one chunk per available processor
+        var cpus = Runtime.getRuntime().availableProcessors();
+        long expectedChunkSize = Math.ceilDiv(ms.byteSize(), cpus);
+        var chunks = new ArrayList<Chunk>();
+        long fileSize = ms.byteSize();
+        long offset = 0;
+        while (offset < fileSize) {
+            var end = Math.min(offset + expectedChunkSize, fileSize);
+            while (end < fileSize && ms.get(ValueLayout.JAVA_BYTE, end++) != '\n') { // end++ also runs on the '\n' itself, so end stops just past it
+            }
+            long len = end - offset; // the chunk therefore ends right after a line feed or at the end of the file
+            chunks.add(new Chunk(ms.asSlice(offset, len)));
+            offset = end;
+        }
+        return chunks;
+    }
+
+    private static final class Chunk { // parser state plus a private hash table for one slice of the input
+        private static final byte SEPARATOR = ';';
+
+        private static final byte DECIMAL_SEPARATOR = '.';
+
+        private static final byte LF = '\n';
+
+        private static final byte MINUS = '-';
+
+        private static final int KEY_LOG2_BYTES = 7;
+
+        private static final int KEY_BYTES = 1 << KEY_LOG2_BYTES; // 128: bytes sliced per key on the vector path, and the tail margin used in parse()
+
+        private static final int MAP_CAPACITY = 1 << 16;
+
+        private static final int BUCKET_MASK = MAP_CAPACITY - 1; // capacity is a power of two, so "& BUCKET_MASK" wraps an index
+
+        private final MemorySegment segment;
+
+        private final Entry[] entries = new Entry[MAP_CAPACITY]; // open-addressing table, probed linearly
+
+        private long offset; // next unread byte of segment
+
+        private byte current; // look-ahead byte of the scalar parser, loaded by next()
+
+        private boolean hasCurrent = true; // cleared by next() once the segment is exhausted
+
+        Chunk(MemorySegment segment) {
+            this.segment = segment;
+        }
+
+        public List<Entry> parse() { // parses the whole segment and returns every entry that was created
+            long size = this.segment.byteSize();
+            long safe = size - KEY_BYTES; // records starting at or beyond this offset are left to the scalar loop
+            while (offset < safe) {
+                var e = vectorizedEntry();
+                int value = vectorizedValue();
+                e.add(value);
+            }
+            next(); // primes 'current' for the scalar parser
+            while (hasCurrent()) {
+                var e = entry();
+                int value = value();
+                e.add(value);
+            }
+            var output = new ArrayList<Entry>(entries.length);
+            for (int i = 0; i < entries.length; i++) { // collect the occupied slots
+                var e = entries[i];
+                if (e != null) {
+                    output.add(e);
+                }
+            }
+            return output;
+        }
+
+        private Entry vectorizedEntry() { // reads the key at 'offset' with vector loads and returns its (possibly new) table entry
+            var start = this.offset;
+            var first = ByteVector.fromMemorySegment(BYTE_SPECIES, this.segment, start, ByteOrder.nativeOrder());
+            int equals = first.eq(SEPARATOR).firstTrue(); // lane of the first ';', or the lane count when there is none
+            int len = equals;
+            for (int i = BYTE_SPECIES_SIZE; equals == BYTE_SPECIES_LANES; i += BYTE_SPECIES_SIZE) { // keep scanning while no ';' was found
+                var next = ByteVector.fromMemorySegment(BYTE_SPECIES, this.segment, start + i, ByteOrder.nativeOrder());
+                equals = next.eq(SEPARATOR).firstTrue();
+                len += equals;
+            }
+            this.offset = start + len + 1; // step over the key and the ';'
+            int index = hash(this.segment, start, len);
+            int count = 0;
+            while (count < BUCKET_MASK) { // bounded linear probing: a full table throws instead of looping forever
+                index = index & BUCKET_MASK;
+                var e = this.entries[index];
+                if (e == null) {
+                    return this.entries[index] = new Entry(len, this.segment.asSlice(start, KEY_BYTES)); // KEY_BYTES-long slice; vectorizedEquals loads whole vectors from it
+                }
+                else if (e.keyLength() == len && vectorizedEquals(e, first, start, len)) {
+                    return e;
+                }
+                index++;
+                count++;
+            }
+            throw new IllegalStateException("Map is full!");
+        }
+
+        private Entry entry() { // scalar counterpart of vectorizedEntry(), driven by 'current' and next()
+            long start = this.offset - 1; // 'current' was read from offset - 1, since next() post-increments offset
+            int len = 0;
+            while (hasCurrent() && current != SEPARATOR) {
+                len++;
+                next();
+            }
+            expect(SEPARATOR);
+            int index = hash(segment, start, len);
+            int count = 0;
+            while (count < BUCKET_MASK) { // same bounded linear probing as the vector path
+                index = index & BUCKET_MASK;
+                var e = this.entries[index];
+                if (e == null) {
+                    return this.entries[index] = new Entry(len, this.segment.asSlice(start, len)); // exact-length slice here, unlike the vector path
+                }
+                else if (e.keyLength() == len && equals(e, start, len)) {
+                    return e;
+                }
+                index++;
+                count++;
+            }
+            throw new IllegalStateException("Map is full!");
+        }
+
+        private static final long MULTIPLY_ADD_DIGITS = 100 * (1L << 24) + 10 * (1L << 16) + 1; // == 0x640A0001
+
+        private int vectorizedValue() { // parses the number at 'offset' from one 8-byte load; returns its digits as an int ("-12.3" gives -123)
+            long dw = this.segment.get(LONG_U_LE, this.offset); // little-endian, so byte 0 is the first character
+            boolean negative = ((dw & 0xFF) ^ MINUS) == 0;
+            int zeros = Long.numberOfTrailingZeros(~dw & 0x10101000L); // bit 4 is set in ASCII digits and clear in '.'; tests bytes 1..3
+            dw = ((negative ? (dw & ~0xFF) : dw) << (28 - zeros)) & 0x0F000F0F00L; // drop the '-', move '.' to byte 3, keep the digit nibbles
+            int value = (int) (((dw * MULTIPLY_ADD_DIGITS) >>> 32) & 0x3FF); // 100*a + 10*b + c lands in bits 32..41
+            this.offset += (zeros >>> 3) + 3; // the number's length plus one trailing byte
+            return negative ? -value : value;
+        }
+
+        private int value() { // scalar number parser; accumulates the digits and skips the '.'
+            int value = 0;
+            var negative = false;
+            if (consume(MINUS)) {
+                negative = true;
+            }
+            while (hasCurrent()) {
+                if ((current & 0xF0) == 0x30) { // high nibble 3: accepts '0'..'9' (and also the bytes 0x3A..0x3F)
+                    value *= 10;
+                    value += current - '0';
+                }
+                else if (current != DECIMAL_SEPARATOR) {
+                    break;
+                }
+                next();
+            }
+            if (hasCurrent()) { // input that ends without a final LF is accepted
+                expect(LF);
+            }
+            return negative ? -value : value;
+        }
+
+        private boolean vectorizedEquals(Entry entry, ByteVector okey, long offset, int len) { // okey: the vector already loaded at 'offset'
+            var ekey = ByteVector.fromMemorySegment(BYTE_SPECIES, entry.segment(), 0, ByteOrder.nativeOrder());
+            int equals = ekey.eq(okey).not().firstTrue(); // index of the first differing lane, or the lane count if none differ
+            if (equals != BYTE_SPECIES_LANES) {
+                return equals >= len; // a difference at or after len lies outside the key
+            }
+            long eo = BYTE_SPECIES_SIZE;
+            int total = BYTE_SPECIES_LANES;
+            while (equals == BYTE_SPECIES_LANES & eo < KEY_BYTES) { // non-short-circuit '&'; both operands are side-effect free
+                offset += BYTE_SPECIES_SIZE;
+                ekey = ByteVector.fromMemorySegment(BYTE_SPECIES, entry.segment(), eo, ByteOrder.nativeOrder());
+                okey = ByteVector.fromMemorySegment(BYTE_SPECIES, segment, offset, ByteOrder.nativeOrder());
+                equals = ekey.eq(okey).not().firstTrue();
+                total += equals;
+                eo += BYTE_SPECIES_SIZE;
+            }
+            return total >= len; // total = number of leading bytes that matched
+        }
+
+        private boolean equals(Entry entry, long offset, int len) { // key comparison used by the scalar path
+            return MemorySegment.mismatch(this.segment, offset, offset + len, entry.segment(), 0, len) == -1; // -1 means the two ranges match
+        }
+
+        private static final int GOLDEN_RATIO = 0x9E3779B9;
+        private static final int HASH_LROTATE = 5;
+
+        private static int hash(MemorySegment ms, long start, int len) { // only the first and last 4 bytes (or first and last byte when len < 4) feed the hash
+            int x, y;
+            if (len >= Integer.BYTES) {
+                x = ms.get(ValueLayout.JAVA_INT_UNALIGNED, start);
+                y = ms.get(ValueLayout.JAVA_INT_UNALIGNED, start + len - Integer.BYTES);
+            }
+            else {
+                x = ms.get(ValueLayout.JAVA_BYTE, start);
+                y = ms.get(ValueLayout.JAVA_BYTE, start + len - Byte.BYTES);
+            }
+            return (Integer.rotateLeft(x * GOLDEN_RATIO, HASH_LROTATE) ^ y) * GOLDEN_RATIO; // callers reduce the result with "& BUCKET_MASK"
+        }
+
+        private void expect(byte b) { // consumes b or fails
+            if (!consume(b)) {
+                throw new IllegalStateException("Unexpected token!");
+            }
+        }
+
+        private boolean consume(byte b) { // advances only when the look-ahead byte equals b
+            if (current == b) {
+                next();
+                return true;
+            }
+            return false;
+        }
+
+        private boolean hasCurrent() {
+            return hasCurrent;
+        }
+
+        private void next() { // loads the next byte into 'current', or marks the input as exhausted
+            if (offset < segment.byteSize()) {
+                this.current = segment.get(ValueLayout.JAVA_BYTE, offset++);
+            }
+            else {
+                this.hasCurrent = false; // 'current' keeps its last value
+            }
+        }
+    }
+
+    private static final class Entry { // running min/max/sum/count for one key
+        private final int keyLength;
+
+        private final MemorySegment segment; // slice starting at the key's first byte; only the first keyLength bytes are the key
+
+        private int min = Integer.MAX_VALUE;
+
+        private int max = Integer.MIN_VALUE;
+
+        private long sum;
+
+        private int count;
+
+        Entry(int keyLength, MemorySegment segment) {
+            this.keyLength = keyLength;
+            this.segment = segment;
+        }
+
+        int keyLength() {
+            return keyLength;
+        }
+
+        MemorySegment segment() {
+            return segment;
+        }
+
+        public String key() { // decodes the first keyLength bytes of the slice as UTF-8
+            return new String(segment.asSlice(0, keyLength).toArray(ValueLayout.JAVA_BYTE), StandardCharsets.UTF_8);
+        }
+
+        public void add(int value) { // folds one parsed value into the statistics
+            min = Math.min(min, value);
+            max = Math.max(max, value);
+            sum += value;
+            count++;
+        }
+
+        public Entry merge(Entry o) { // folds o into this entry in place and returns this
+            min = Math.min(min, o.min);
+            max = Math.max(max, o.max);
+            sum += o.sum;
+            count += o.count;
+            return this;
+        }
+
+        @Override
+        public String toString() { // "min/mean/max", each printed with one decimal digit
+            var average = Math.round(((sum / 10.0) / count) * 10.0); // mean rounded by Math.round, still in the x10 integer scale
+            return decimal(min) + "/" + decimal(average) + "/" + decimal(max);
+        }
+
+        private static String decimal(long value) { // renders a x10-scaled value, e.g. -123 becomes "-12.3"
+            boolean negative = value < 0;
+            value = Math.abs(value);
+            return (negative ? "-" : "") + (value / 10) + "." + (value % 10);
+        }
+    }
+}

From 673c1b9f6e98cccc913fa72fb90d704cfd3d81f0 Mon Sep 17 00:00:00 2001
From: Gunnar Morling <gunnar.morling@googlemail.com>
Date: Wed, 17 Jan 2024 22:00:38 +0100
Subject: [PATCH 057/268] Leaderboard update

---
 README.md | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index 0aacc7bee..b86823432 100644
--- a/README.md
+++ b/README.md
@@ -46,22 +46,24 @@ These are the results from running all entries into the challenge on eight cores
 | 3* | 00:02.602 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java)| 21.0.1-graal | [Artsiom Korzun](https://github.com/artsiomkorzun) | GraalVM native binary |
 |   | 00:02.692 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java)| 21.0.1-graal | [Van Phu DO](https://github.com/abeobk) | GraalVM native binary |
 |   | 00:02.855 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_royvanrijn.java)| 21.0.1-graal | [Roy van Rijn](https://github.com/royvanrijn) | GraalVM native binary |
+|   | 00:02.971 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jerrinot.java)| 21.0.1-graal | [Jaromir Hamala](https://github.com/jerrinot) |  |
 |   | 00:03.258 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_merykitty.java)| 21.0.1-open | [Quan Anh Mai](https://github.com/merykitty) |  |
 |   | 00:03.376 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_mtopolnik.java)| 21.0.1-graal | [Marko Topolnik](https://github.com/mtopolnik) |  |
-|   | 00:03.409 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jerrinot.java)| 21.0.1-graal | [Jaromir Hamala](https://github.com/jerrinot) |  |
 |   | 00:03.714 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_hundredwatt.java)| 21.0.1-graal | [Jason Nochlin](https://github.com/hundredwatt) |  |
+|   | 00:03.959 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_gonix.java)| 21.0.1-open | [gonix](https://github.com/gonix) |  |
 |   | 00:04.066 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_JesseVanRooy.java)| 21.0.1-open | [JesseVanRooy](https://github.com/JesseVanRooy) |  |
+|   | 00:04.154 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_iziamos.java)| 21.0.1-open | [John Ziamos](https://github.com/iziamos) |  |
 |   | 00:04.726 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ebarlas.java)| 21.0.1-graal | [Elliot Barlas](https://github.com/ebarlas) |  | 
 |   | 00:04.741 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_cliffclick.java)| 21.0.1-open | [Cliff Click](https://github.com/cliffclick) |  |
 |   | 00:04.823 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_JamalMulla.java)| 21.0.1-graal | [Jamal Mulla](https://github.com/JamalMulla) |  |
 |   | 00:04.959 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_yavuztas.java)| 21.0.1-graal | [Yavuz Tas](https://github.com/yavuztas) |  |
+|   | 00:05.089 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_vemana.java)| 21.0.1-graal | [Subrahmanyam](https://github.com/vemana) |  |
 |   | 00:05.142 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_arjenw.java)| 21.0.1-open | [Arjen Wisse](https://github.com/arjenw) |  |
-|   | 00:05.181 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_vemana.java)| 21.0.1-graal | [Subrahmanyam](https://github.com/vemana) |  |
-|   | 00:05.218 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_iziamos.java)| 21.0.1-open | [John Ziamos](https://github.com/iziamos) |  |
+|   | 00:05.175 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_zerninv.java)| 21.0.1-graal | [zerninv](https://github.com/zerninv) |  |
 |   | 00:05.235 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_unbounded.java)| 21.0.1-open | [unbounded](https://github.com/unbounded) |  |
-|   | 00:05.297 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_zerninv.java)| 21.0.1-open | [zerninv](https://github.com/zerninv) |  |
 |   | 00:05.336 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_plevart.java)| 21.0.1-tem | [Peter Levart](https://github.com/plevart) |  |
 |   | 00:05.478 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_obourgain.java)| 21.0.1-open | [Olivier Bourgain](https://github.com/obourgain) |  |
+|   | 00:05.764 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_roman-r-m.java)| 21.0.1-graal | [Roman Musin](https://github.com/roman-r-m) |  |
 |   | 00:05.887 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_charlibot.java)| 21.0.1-graal | [Charlie Evans](https://github.com/charlibot) |  |
 |   | 00:05.960 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_vaidhy.java)| 21.0.1-graal | [Vaidhy Mayilrangam](https://github.com/vaidhy) |  |
 |   | 00:05.979 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_spullara.java)| 21.0.1-graal | [Sam Pullara](https://github.com/spullara) |  |
@@ -72,11 +74,11 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:06.576 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_as-com.java)| 21.0.1-open | [Andrew Sun](https://github.com/as-com) |  |
 |   | 00:06.715 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_algirdasrascius.java)| 21.0.1-open | [Algirdas Raščius](https://github.com/algirdasrascius) |  |
 |   | 00:06.911 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jincongho.java)| 21.0.1-open | [Jin Cong Ho](https://github.com/jincongho) |  |
-|   | 00:06.993 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_gonix.java)| 21.0.1-open | [gonix](https://github.com/gonix) |  |
 |   | 00:07.730 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jotschi.java)| 21.0.1-open | [Johannes Schüth](https://github.com/jotschi) |  |
-|   | 00:07.809 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_roman-r-m.java)| 21.0.1-graal | [Roman Musin](https://github.com/roman-r-m) |  |
 |   | 00:07.925 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ricardopieper.java)| 21.0.1-graal | [Ricardo Pieper](https://github.com/ricardopieper) |  |
 |   | 00:07.913 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_parkertimmins.java)| 21.0.1-open | [parkertimmins](https://github.com/parkertimmins) |  |
+|   | 00:08.045 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ianopolousfast.java)| 21.0.1-open | [Dr Ian Preston](https://github.com/ianopolous) |  |
+|   | 00:08.166 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jparera.java)| 21.0.1-open | [Juan Parera](https://github.com/jparera) |  |
 |   | 00:08.167 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ddimtirov.java)| 21.0.1-tem | [Dimitar Dimitrov](https://github.com/ddimtirov) |  |
 |   | 00:08.214 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_deemkeen.java)| 21.0.1-open | [deemkeen](https://github.com/deemkeen) |  |
 |   | 00:08.398 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_artpar.java)| 21.0.1-open | [Parth Mudgal](https://github.com/artpar) |  |
@@ -132,8 +134,10 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:22.457 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_rby.java)| 21.0.1-open | [Ramzi Ben Yahya](https://github.com/rby) |  |
 |   | 00:26.500 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_felix19350.java)| 21.0.1-open | [Bruno Félix](https://github.com/felix19350) |  |
 |   | 00:28.381 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_bjhara.java)| 21.0.1-open | [Hampus](https://github.com/bjhara) |  |
+|   | 00:29.741 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_xpmatteo.java)| 21.0.1-open | [Matteo Vaccari](https://github.com/xpmatteo) |  |
 |   | 00:32.018 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_padreati.java)| 21.0.1-open | [Aurelian Tutuianu](https://github.com/padreati) |  |
 |   | 00:34.388 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_twobiers.java)| 21.0.1-tem | [Tobi](https://github.com/twobiers) |  |
+|   | 00:35.875 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_MahmoudFawzyKhalil.java)| 21.0.1-open | [MahmoudFawzyKhalil](https://github.com/MahmoudFawzyKhalil) |  |
 |   | 00:36.180 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_hchiorean.java)| 21.0.1-open | [Horia Chiorean](https://github.com/hchiorean) |  |
 |   | 00:36.212 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_kumarsaurav123.java)| 21.0.1-open | [kumarsaurav123](https://github.com/kumarsaurav123) |  |
 |   | 00:38.340 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_AbstractKamen.java)| 21.0.1-open | [AbstractKamen](https://github.com/AbstractKamen) |  |

From d0bdd335bdf5981e5946f0065a4218bf8e4e445c Mon Sep 17 00:00:00 2001
From: zerninv <zerninvasilii@yandex.ru>
Date: Fri, 19 Jan 2024 16:07:30 +0000
Subject: [PATCH 058/268] Last attempt CalculateAverage_zerninv (#480)

* use bits magic

* apply style
---
 .../onebrc/CalculateAverage_zerninv.java      | 138 +++++++++++-------
 1 file changed, 85 insertions(+), 53 deletions(-)

diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_zerninv.java b/src/main/java/dev/morling/onebrc/CalculateAverage_zerninv.java
index 42cf6b827..b28750f77 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_zerninv.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_zerninv.java
@@ -25,9 +25,7 @@
 import java.nio.charset.StandardCharsets;
 import java.nio.file.Path;
 import java.nio.file.StandardOpenOption;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.TreeMap;
+import java.util.*;
 
 public class CalculateAverage_zerninv {
     private static final String FILE = "./measurements.txt";
@@ -55,10 +53,11 @@ public static void main(String[] args) throws IOException, InterruptedException
 
             var tasks = new TaskThread[CORES];
             for (int i = 0; i < tasks.length; i++) {
-                tasks[i] = new TaskThread(new MeasurementContainer(), (int) (fileSize / minChunkSize / CORES + 1));
+                tasks[i] = new TaskThread((int) (fileSize / minChunkSize / CORES + 1));
             }
 
-            var chunks = splitByChunks(segment.address(), segment.address() + fileSize, minChunkSize);
+            var results = new HashMap<String, TemperatureAggregation>();
+            var chunks = splitByChunks(segment.address(), segment.address() + fileSize, minChunkSize, results);
             for (int i = 0; i < chunks.size() - 1; i++) {
                 var task = tasks[i % tasks.length];
                 task.addChunk(chunks.get(i), chunks.get(i + 1));
@@ -68,19 +67,9 @@ public static void main(String[] args) throws IOException, InterruptedException
                 task.start();
             }
 
-            var results = new TreeMap<String, TemperatureAggregation>();
             for (var task : tasks) {
                 task.join();
-                task.measurements()
-                        .forEach(measurement -> {
-                            var aggr = results.get(measurement.station());
-                            if (aggr == null) {
-                                results.put(measurement.station(), measurement.aggregation());
-                            }
-                            else {
-                                aggr.merge(measurement.aggregation());
-                            }
-                        });
+                task.collectTo(results);
             }
 
             var bos = new BufferedOutputStream(System.out);
@@ -90,7 +79,31 @@ public static void main(String[] args) throws IOException, InterruptedException
         }
     }
 
-    private static List<Long> splitByChunks(long address, long end, long minChunkSize) {
+    private static List<Long> splitByChunks(long address, long end, long minChunkSize, Map<String, TemperatureAggregation> results) {
+        // handle last line
+        long offset = end - 1;
+        int temperature = 0;
+        byte b;
+        int multiplier = 1;
+        while ((b = UNSAFE.getByte(offset--)) != ';') {
+            if (b >= '0' && b <= '9') {
+                temperature += (b - '0') * multiplier;
+                multiplier *= 10;
+            }
+            else if (b == '-') {
+                temperature = -temperature;
+            }
+        }
+        long cityNameEnd = offset;
+        while (UNSAFE.getByte(offset - 1) != '\n' && offset > address) {
+            offset--;
+        }
+        var cityName = new byte[(int) (cityNameEnd - offset + 1)];
+        UNSAFE.copyMemory(null, offset, cityName, Unsafe.ARRAY_BYTE_BASE_OFFSET, cityName.length);
+        results.put(new String(cityName, StandardCharsets.UTF_8), new TemperatureAggregation(temperature, 1, (short) temperature, (short) temperature));
+
+        // split by chunks
+        end = offset;
         List<Long> result = new ArrayList<>((int) ((end - address) / minChunkSize + 1));
         result.add(address);
         while (address < end) {
@@ -115,14 +128,11 @@ public TemperatureAggregation(long sum, int count, short min, short max) {
             this.max = max;
         }
 
-        public void merge(TemperatureAggregation o) {
-            if (o == null) {
-                return;
-            }
-            sum += o.sum;
-            count += o.count;
-            min = min < o.min ? min : o.min;
-            max = max > o.max ? max : o.max;
+        public void merge(long sum, int count, short min, short max) {
+            this.sum += sum;
+            this.count += count;
+            this.min = this.min < min ? this.min : min;
+            this.max = this.max > max ? this.max : max;
         }
 
         @Override
@@ -131,9 +141,6 @@ public String toString() {
         }
     }
 
-    private record Measurement(String station, TemperatureAggregation aggregation) {
-    }
-
     private static final class MeasurementContainer {
         private static final int SIZE = 1 << 17;
 
@@ -190,23 +197,26 @@ public void put(long address, byte size, int hash, long lastBytes, short value)
             UNSAFE.putShort(ptr + MAX_OFFSET, value);
         }
 
-        public List<Measurement> measurements() {
-            var result = new ArrayList<Measurement>(1000);
+        public void collectTo(Map<String, TemperatureAggregation> results) {
             int count;
             for (int i = 0; i < SIZE; i++) {
                 long ptr = this.address + i * ENTRY_SIZE;
                 count = UNSAFE.getInt(ptr + COUNT_OFFSET);
                 if (count != 0) {
                     var station = createString(UNSAFE.getLong(ptr + ADDRESS_OFFSET), UNSAFE.getByte(ptr + SIZE_OFFSET));
-                    var measurements = new TemperatureAggregation(
-                            UNSAFE.getLong(ptr + SUM_OFFSET),
-                            count,
-                            UNSAFE.getShort(ptr + MIN_OFFSET),
-                            UNSAFE.getShort(ptr + MAX_OFFSET));
-                    result.add(new Measurement(station, measurements));
+                    var result = results.get(station);
+                    if (result == null) {
+                        results.put(station, new TemperatureAggregation(
+                                UNSAFE.getLong(ptr + SUM_OFFSET),
+                                count,
+                                UNSAFE.getShort(ptr + MIN_OFFSET),
+                                UNSAFE.getShort(ptr + MAX_OFFSET)));
+                    }
+                    else {
+                        result.merge(UNSAFE.getLong(ptr + SUM_OFFSET), count, UNSAFE.getShort(ptr + MIN_OFFSET), UNSAFE.getShort(ptr + MAX_OFFSET));
+                    }
                 }
             }
-            return result;
         }
 
         private boolean isEqual(long address, long address2, int size) {
@@ -237,14 +247,25 @@ private static class TaskThread extends Thread {
         private static final int BYTE_MASK = 0xff;
 
         private static final int ZERO = '0';
-        private static final byte DELIMITER = ';';
+        private static final long DELIMITER_MASK = 0x3b3b3b3b3b3b3b3bL;
+        private static final long[] SIGNIFICANT_BYTES_MASK = {
+                0,
+                0xff,
+                0xffff,
+                0xffffff,
+                0xffffffffL,
+                0xffffffffffL,
+                0xffffffffffffL,
+                0xffffffffffffffL,
+                0xffffffffffffffffL
+        };
 
         private final MeasurementContainer container;
         private final List<Long> begins;
         private final List<Long> ends;
 
-        private TaskThread(MeasurementContainer container, int chunks) {
-            this.container = container;
+        private TaskThread(int chunks) {
+            this.container = new MeasurementContainer();
             this.begins = new ArrayList<>(chunks);
             this.ends = new ArrayList<>(chunks);
         }
@@ -261,26 +282,33 @@ public void run() {
             }
         }
 
-        public List<Measurement> measurements() {
-            return container.measurements();
-        }
-
         private void calcForChunk(long offset, long end) {
-            long cityOffset, lastBytes;
-            int hashCode, temperature, word;
-            byte cityNameSize, b;
+            long cityOffset, lastBytes, city, masked, hashCode;
+            int temperature, word, delimiterIdx;
+            byte cityNameSize;
 
             while (offset < end) {
                 cityOffset = offset;
                 lastBytes = 0;
                 hashCode = 0;
-                while ((b = UNSAFE.getByte(offset++)) != DELIMITER) {
-                    hashCode += hashCode * 31 + b;
-                    lastBytes = (lastBytes << 8) | b;
+                delimiterIdx = 8;
+
+                while (delimiterIdx == 8) {
+                    city = UNSAFE.getLong(offset);
+                    masked = city ^ DELIMITER_MASK;
+                    masked = (masked - 0x0101010101010101L) & ~masked & 0x8080808080808080L;
+                    delimiterIdx = Long.numberOfTrailingZeros(masked) >>> 3;
+                    if (delimiterIdx == 0) {
+                        break;
+                    }
+                    offset += delimiterIdx;
+                    lastBytes = city & SIGNIFICANT_BYTES_MASK[delimiterIdx];
+                    hashCode = ((hashCode >>> 5) ^ lastBytes) * 0x517cc1b727220a95L;
                 }
-                cityNameSize = (byte) (offset - cityOffset - 1);
 
-                word = UNSAFE.getInt(offset);
+                cityNameSize = (byte) (offset - cityOffset);
+
+                word = UNSAFE.getInt(++offset);
                 offset += 4;
 
                 if ((word & TWO_NEGATIVE_DIGITS_MASK) == TWO_NEGATIVE_DIGITS_MASK) {
@@ -300,8 +328,12 @@ else if ((word & TWO_DIGITS_MASK) == TWO_DIGITS_MASK) {
                     temperature = ZERO * 111 - ((word & BYTE_MASK) * 100 + ((word >>> 8) & BYTE_MASK) * 10 + ((word >>> 24) & BYTE_MASK));
                 }
                 offset++;
-                container.put(cityOffset, cityNameSize, hashCode, lastBytes, (short) temperature);
+                container.put(cityOffset, cityNameSize, Long.hashCode(hashCode), lastBytes, (short) temperature);
             }
         }
+
+        public void collectTo(Map<String, TemperatureAggregation> results) {
+            container.collectTo(results);
+        }
     }
 }

From 9b28dd2aec10e6dfac0223df3a3dc16a10ce1528 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jairo=20Grater=C3=B3n?=
 <58091322+jgrateron@users.noreply.github.com>
Date: Fri, 19 Jan 2024 12:12:05 -0400
Subject: [PATCH 059/268] fix test rounding, pass 10K station names (#471)

---
 .../onebrc/CalculateAverage_jgrateron.java    | 44 +++++++++----------
 1 file changed, 22 insertions(+), 22 deletions(-)

diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_jgrateron.java b/src/main/java/dev/morling/onebrc/CalculateAverage_jgrateron.java
index 488650853..fa93167c2 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_jgrateron.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_jgrateron.java
@@ -20,17 +20,16 @@
 import java.io.IOException;
 import java.io.RandomAccessFile;
 import java.util.ArrayList;
-import java.util.Comparator;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Locale;
 import java.util.Map;
-import java.util.Map.Entry;
+import java.util.TreeMap;
 import java.util.stream.Collectors;
 
 public class CalculateAverage_jgrateron {
     private static final String FILE = "./measurements.txt";
-    private static final int MAX_LENGTH_LINE = 115;
+    private static final int MAX_LENGTH_LINE = 255;
     private static final int MAX_BUFFER = 1024 * 8;
     private static boolean DEBUG = false;
 
@@ -93,7 +92,7 @@ public static void main(String[] args) throws InterruptedException, IOException
         Locale.setDefault(Locale.US);
         var startTime = System.nanoTime();
         var archivo = new File(FILE);
-        var totalMediciones = new HashMap<String, Medicion>();
+        var totalMediciones = new TreeMap<String, Medicion>();
         var tareas = new ArrayList<Thread>();
         var particiones = dividirArchivo(archivo);
 
@@ -124,11 +123,7 @@ public static void main(String[] args) throws InterruptedException, IOException
             hilo.join();
         }
 
-        Comparator<Entry<String, Medicion>> comparar = (a, b) -> {
-            return a.getKey().compareTo(b.getKey());
-        };
         var result = totalMediciones.entrySet().stream()//
-                .sorted(comparar)//
                 .map(e -> e.getKey() + "=" + e.getValue().toString())//
                 .collect(Collectors.joining(", "));
 
@@ -276,7 +271,7 @@ public int buscarSemicolon(byte data[], int len) {
          */
         public void updateMediciones(byte data[], int pos, int semicolon) {
             var hashEstacion = calcHashCode(0, data, pos, semicolon);
-            var temp = strToDouble(data, pos, semicolon);
+            var temp = strToInt(data, pos, semicolon);
             index.setHash(hashEstacion);
             var estacion = estaciones.get(index);
             if (estacion == null) {
@@ -321,11 +316,11 @@ private int calcHashCode(int result, byte[] a, int fromIndex, int length) {
         /*
          * convierte de un arreglo de bytes a double
          */
-        public double strToDouble(byte linea[], int idx, int posSeparator) {
-            double number = 0;
+        public int strToInt(byte linea[], int idx, int posSeparator) {
+            int number = 0;
             int pos = idx + posSeparator + 1;
-            int esNegativo = linea[pos] == '-' ? -1 : 1;
-            if (esNegativo == -1) {
+            boolean esNegativo = linea[pos] == '-';
+            if (esNegativo) {
                 pos++;
             }
             int digit1 = linea[pos] - 48;
@@ -339,7 +334,7 @@ public double strToDouble(byte linea[], int idx, int posSeparator) {
                 pos += 2;
                 number = (digit1 * 100) + (digit2 * 10) + (linea[pos] - 48);
             }
-            return number / 10 * esNegativo;
+            return esNegativo ? -number : number;
         }
     }
 
@@ -348,11 +343,11 @@ public double strToDouble(byte linea[], int idx, int posSeparator) {
      */
     static class Medicion {
         private int count;
-        private double tempMin;
-        private double tempMax;
-        private double tempSum;
+        private int tempMin;
+        private int tempMax;
+        private int tempSum;
 
-        public Medicion(int count, double tempMin, double tempMax, double tempSum) {
+        public Medicion(int count, int tempMin, int tempMax, int tempSum) {
             super();
             this.count = count;
             this.tempMin = tempMin;
@@ -360,7 +355,7 @@ public Medicion(int count, double tempMin, double tempMax, double tempSum) {
             this.tempSum = tempSum;
         }
 
-        public void update(int count, double tempMin, double tempMax, double tempSum) {
+        public void update(int count, int tempMin, int tempMax, int tempSum) {
             this.count += count;
             if (tempMin < this.tempMin) {
                 this.tempMin = tempMin;
@@ -371,11 +366,16 @@ public void update(int count, double tempMin, double tempMax, double tempSum) {
             this.tempSum += tempSum;
         }
 
+        public double round(double number) {
+            return Math.round(number) / 10.0;
+        }
+
         @Override
         public String toString() {
-            double tempPro = (double) tempSum;
-            tempPro = tempPro / count;
-            return "%.1f/%.1f/%.1f".formatted(tempMin, tempPro, tempMax);
+            var min = round(tempMin);
+            var mid = round(1.0 * tempSum / count);
+            var max = round(tempMax);
+            return "%.1f/%.1f/%.1f".formatted(min, mid, max);
         }
     }
 }

From fefe326a143102089afd3c47e0434d5d9e4299f4 Mon Sep 17 00:00:00 2001
From: Dr Ian Preston <ianopolous@protonmail.com>
Date: Fri, 19 Jan 2024 16:14:45 +0000
Subject: [PATCH 060/268] 3s (16%) faster, still no unsafe (#478)

* use Arena and MemorySegment to map entire file at once
* reduced branches and instructions
---
 .../CalculateAverage_ianopolousfast.java      | 324 +++++++++++-------
 1 file changed, 199 insertions(+), 125 deletions(-)

diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_ianopolousfast.java b/src/main/java/dev/morling/onebrc/CalculateAverage_ianopolousfast.java
index a8c4e4cd1..4bffe7839 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_ianopolousfast.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_ianopolousfast.java
@@ -15,45 +15,53 @@
  */
 package dev.morling.onebrc;
 
-import java.io.*;
-import java.nio.*;
+import java.lang.foreign.Arena;
+import java.lang.foreign.MemorySegment;
+import java.nio.ByteOrder;
 import java.nio.channels.*;
-import java.util.concurrent.*;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.StandardOpenOption;
 import java.util.stream.*;
 import java.util.*;
 
+import static java.lang.foreign.ValueLayout.*;
+
 /* A fast implementation with no unsafe.
  * Features:
- * * memory mapped file
+ * * memory mapped file using preview Arena FFI
  * * read chunks in parallel
  * * minimise allocation
  * * no unsafe
  *
  * Timings on 4 core i7-7500U CPU @ 2.70GHz:
  * average_baseline: 4m48s
- * ianopolous:         19s
+ * ianopolous:         16s
 */
 public class CalculateAverage_ianopolousfast {
 
     public static final int MAX_LINE_LENGTH = 107;
-    public static final int MAX_STATIONS = 10_000;
+    public static final int MAX_STATIONS = 1 << 14;
+    private static final OfLong LONG_LAYOUT = JAVA_LONG_UNALIGNED.withOrder(ByteOrder.BIG_ENDIAN);
 
     public static void main(String[] args) throws Exception {
-        File input = new File("./measurements.txt");
-        long filesize = input.length();
-        // keep chunk size between 256 MB and 1G (1 chunk for files < 256MB)
-        long chunkSize = Math.min(Math.max((filesize + 31) / 32, 256 * 1024 * 1024), 1024 * 1024 * 1024L);
-        int nChunks = (int) ((filesize + chunkSize - 1) / chunkSize);
-        ExecutorService pool = Executors.newVirtualThreadPerTaskExecutor();
-        List<Future<List<List<Stat>>>> allResults = IntStream.range(0, nChunks)
-                .mapToObj(i -> pool.submit(() -> parseStats(i * chunkSize, Math.min((i + 1) * chunkSize, filesize))))
+        Arena arena = Arena.global();
+        Path input = Path.of("measurements.txt");
+        FileChannel channel = (FileChannel) Files.newByteChannel(input, StandardOpenOption.READ);
+        long filesize = Files.size(input);
+        MemorySegment mmap = channel.map(FileChannel.MapMode.READ_ONLY, 0, filesize, arena);
+        int nChunks = filesize < 4 * 1024 * 1024 ? 1 : Runtime.getRuntime().availableProcessors();
+        long chunkSize = (filesize + nChunks - 1) / nChunks;
+        List<List<List<Stat>>> allResults = IntStream.range(0, nChunks)
+                .parallel()
+                .mapToObj(i -> parseStats(i * chunkSize, Math.min((i + 1) * chunkSize, filesize), mmap))
                 .toList();
 
         TreeMap<String, Stat> merged = allResults.stream()
                 .parallel()
                 .flatMap(f -> {
                     try {
-                        return f.get().stream().filter(Objects::nonNull).flatMap(Collection::stream);
+                        return f.stream().filter(Objects::nonNull).flatMap(Collection::stream);
                     }
                     catch (Exception e) {
                         e.printStackTrace();
@@ -64,25 +72,39 @@ public static void main(String[] args) throws Exception {
         System.out.println(merged);
     }
 
-    public static boolean matchingStationBytes(int start, int end, ByteBuffer buffer, Stat existing) {
-        if (end - start != existing.name.length)
+    public static boolean matchingStationBytes(long start, long end, int offset, MemorySegment buffer, Stat existing) {
+        int len = (int) (end - start);
+        if (len != existing.name.length)
             return false;
-        for (int i = start; i < end; i++) {
-            if (existing.name[i - start] != buffer.get(i))
+        for (int i = offset; i < len; i++) {
+            if (existing.name[i] != buffer.get(JAVA_BYTE, offset + start++))
                 return false;
         }
         return true;
     }
 
-    public static Stat dedupeStation(int start, int end, long hash, ByteBuffer buffer, List<List<Stat>> stations) {
-        int index = Math.floorMod(hash ^ (hash >> 32), MAX_STATIONS);
+    private static int hashToIndex(long hash, int len) {
+        // From Thomas Wuerthinger's entry
+        int hashAsInt = (int) (hash ^ (hash >>> 28));
+        int finalHash = (hashAsInt ^ (hashAsInt >>> 15));
+        return (finalHash & (len - 1));
+    }
+
+    public static Stat parseStation(long start, long end, long first8, long second8,
+                                    MemorySegment buffer) {
+        byte[] stationBuffer = new byte[(int) (end - start)];
+        for (long off = start; off < end; off++)
+            stationBuffer[(int) (off - start)] = buffer.get(JAVA_BYTE, off);
+        return new Stat(stationBuffer, first8, second8);
+    }
+
+    public static Stat dedupeStation(long start, long end, long hash, long first8, long second8,
+                                     MemorySegment buffer, List<List<Stat>> stations) {
+        int index = hashToIndex(hash, MAX_STATIONS);
         List<Stat> matches = stations.get(index);
         if (matches == null) {
             List<Stat> value = new ArrayList<>();
-            byte[] stationBuffer = new byte[end - start];
-            buffer.position(start);
-            buffer.get(stationBuffer);
-            Stat res = new Stat(stationBuffer);
+            Stat res = parseStation(start, end, first8, second8, buffer);
             value.add(res);
             stations.set(index, value);
             return res;
@@ -90,136 +112,185 @@ public static Stat dedupeStation(int start, int end, long hash, ByteBuffer buffe
         else {
             for (int i = 0; i < matches.size(); i++) {
                 Stat s = matches.get(i);
-                if (matchingStationBytes(start, end, buffer, s))
+                if (first8 == s.first8 && second8 == s.second8 && matchingStationBytes(start, end, 16, buffer, s))
                     return s;
             }
-            byte[] stationBuffer = new byte[end - start];
-            buffer.position(start);
-            buffer.get(stationBuffer);
-            Stat res = new Stat(stationBuffer);
+            Stat res = parseStation(start, end, first8, second8, buffer);
             matches.add(res);
             return res;
         }
     }
 
-    public static int getSemicolon(long d) {
+    public static Stat dedupeStation8(long start, long end, long hash, long first8, MemorySegment buffer, List<List<Stat>> stations) {
+        int index = hashToIndex(hash, MAX_STATIONS);
+        List<Stat> matches = stations.get(index);
+        if (matches == null) {
+            List<Stat> value = new ArrayList<>();
+            Stat station = parseStation(start, end, first8, 0, buffer);
+            value.add(station);
+            stations.set(index, value);
+            return station;
+        }
+        else {
+            for (int i = 0; i < matches.size(); i++) {
+                Stat s = matches.get(i);
+                if (first8 == s.first8 && s.name.length <= 8)
+                    return s;
+            }
+            Stat station = parseStation(start, end, first8, 0, buffer);
+            matches.add(station);
+            return station;
+        }
+    }
+
+    public static Stat dedupeStation16(long start, long end, long hash, long first8, long second8, MemorySegment buffer, List<List<Stat>> stations) {
+        int index = hashToIndex(hash, MAX_STATIONS);
+        List<Stat> matches = stations.get(index);
+        if (matches == null) {
+            List<Stat> value = new ArrayList<>();
+            Stat res = parseStation(start, end, first8, second8, buffer);
+            value.add(res);
+            stations.set(index, value);
+            return res;
+        }
+        else {
+            for (int i = 0; i < matches.size(); i++) {
+                Stat s = matches.get(i);
+                if (first8 == s.first8 && second8 == s.second8 && s.name.length <= 16)
+                    return s;
+            }
+            Stat res = parseStation(start, end, first8, second8, buffer);
+            matches.add(res);
+            return res;
+        }
+    }
+
+    public static long hasSemicolon(long d) {
         // from Hacker's Delight page 92
         d = d ^ 0x3b3b3b3b3b3b3b3bL;
         long y = (d & 0x7f7f7f7f7f7f7f7fL) + 0x7f7f7f7f7f7f7f7fL;
-        y = ~(y | d | 0x7f7f7f7f7f7f7f7fL);
+        return ~(y | d | 0x7f7f7f7f7f7f7f7fL);
+    }
+
+    public static int getSemicolonIndex(long y) {
+        // from Hacker's Delight page 92
         return Long.numberOfLeadingZeros(y) >> 3;
     }
 
-    public static long updateHash(long hash, long x) {
-        return ((hash << 5) ^ x) * 0x517cc1b727220a95L; // fxHash
+    static long maskHighBytes(long d, int nbytes) {
+        return d & (-1L << ((8 - nbytes) * 8));
     }
 
-    public static Stat parseStation(int lineStart, ByteBuffer buffer, List<List<Stat>> stations) {
+    public static Stat parseStation(long lineStart, MemorySegment buffer, List<List<Stat>> stations) {
         // find semicolon and update hash as we go, reading a long at a time
-        long d = buffer.getLong(lineStart);
+        long d = buffer.get(LONG_LAYOUT, lineStart);
+        long hasSemi = hasSemicolon(d);
+        if (hasSemi != 0) {
+            int semiIndex = getSemicolonIndex(hasSemi);
+            d = maskHighBytes(d, semiIndex);
+            return dedupeStation8(lineStart, lineStart + semiIndex, d, d, buffer, stations);
+        }
+        long first8 = d;
+        long hash = d;
+
+        d = buffer.get(LONG_LAYOUT, lineStart + 8);
+        hasSemi = hasSemicolon(d);
+        if (hasSemi != 0) {
+            int semiIndex = getSemicolonIndex(hasSemi);
+            if (semiIndex == 0)
+                return dedupeStation8(lineStart, lineStart + 8, first8, first8, buffer, stations);
+            d = maskHighBytes(d, semiIndex);
+            return dedupeStation16(lineStart, lineStart + 8 + semiIndex, first8 ^ d, first8, d, buffer, stations);
+        }
 
-        int semiIndex = getSemicolon(d);
-        int index = 0;
-        long hash = 0;
-        while (semiIndex == 8) {
-            hash = updateHash(hash, d);
+        int index = 8;
+        long second8 = d;
+        while (hasSemi == 0) {
+            hash = hash ^ d;
             index += 8;
-            d = buffer.getLong(lineStart + index);
-            semiIndex = getSemicolon(d);
+            d = buffer.get(LONG_LAYOUT, lineStart + index);
+            hasSemi = hasSemicolon(d);
         }
-        // mask extra bytes off last long
-        d = d & (-1L << ((8 - semiIndex) * 8));
+        int semiIndex = getSemicolonIndex(hasSemi);
+        d = maskHighBytes(d, semiIndex);
         if (semiIndex > 0) {
-            hash = updateHash(hash, d);
+            hash = hash ^ d;
         }
-        return dedupeStation(lineStart, lineStart + index + semiIndex, hash, buffer, stations);
+        return dedupeStation(lineStart, lineStart + index + semiIndex, hash, first8, second8, buffer, stations);
     }
 
-    public static int processTemperature(int lineSplit, MappedByteBuffer buffer, Stat station) {
-        short temperature;
-        boolean negative = false;
-        byte b = buffer.get(lineSplit++);
-        if (b == '-') {
-            negative = true;
-            b = buffer.get(lineSplit++);
-        }
-        temperature = (short) (b - 0x30);
-        b = buffer.get(lineSplit++);
-        if (b == '.') {
-            b = buffer.get(lineSplit++);
-            temperature = (short) (temperature * 10 + (b - 0x30));
-        }
-        else {
-            temperature = (short) (temperature * 10 + (b - 0x30));
-            lineSplit++;
-            b = buffer.get(lineSplit++);
-            temperature = (short) (temperature * 10 + (b - 0x30));
-        }
-        temperature = negative ? (short) -temperature : temperature;
+    public static int getDot(long d) {
+        // from Hacker's Delight page 92
+        d = d ^ 0x2e2e2e2e2e2e2e2eL;
+        long y = (d & 0x7f7f7f7f7f7f7f7fL) + 0x7f7f7f7f7f7f7f7fL;
+        y = ~(y | d | 0x7f7f7f7f7f7f7f7fL);
+        return Long.numberOfLeadingZeros(y) >> 3;
+    }
+
+    public static short getMinus(long d) {
+        d = d & 0xff00000000000000L;
+        d = d ^ 0x2d2d2d2d2d2d2d2dL;
+        long y = (d & 0x7f7f7f7f7f7f7f7fL) + 0x7f7f7f7f7f7f7f7fL;
+        y = ~(y | d | 0x7f7f7f7f7f7f7f7fL);
+        return (short) ((Long.numberOfLeadingZeros(y) >> 6) - 1);
+    }
+
+    public static long processTemperature(long lineSplit, MemorySegment buffer, Stat station) {
+        long d = buffer.get(LONG_LAYOUT, lineSplit);
+        // negative is either 0 or -1
+        short negative = getMinus(d);
+        d = d << (negative * -8);
+        int dotIndex = getDot(d);
+        d = (d >> 8) | 0x30000000_00000000L; // add a leading 0 digit
+        d = d >> 8 * (5 - dotIndex);
+        short temperature = (short) ((byte) d - '0' +
+                10 * (((byte) (d >> 16)) - '0') +
+                100 * (((byte) (d >> 24)) - '0'));
+        temperature = (short) ((temperature ^ negative) - negative); // negative treatment inspired by merkitty
         station.add(temperature);
-        return lineSplit + 1;
+        return lineSplit - negative + dotIndex + 3;
     }
 
-    public static List<List<Stat>> parseStats(long startByte, long endByte) {
-        try {
-            RandomAccessFile file = new RandomAccessFile("./measurements.txt", "r");
-            long maxEnd = Math.min(file.length(), endByte + MAX_LINE_LENGTH);
-            long len = maxEnd - startByte;
-            if (len > Integer.MAX_VALUE)
-                throw new RuntimeException("Segment size must fit into an int");
-            int maxDone = (int) (endByte - startByte);
-            MappedByteBuffer buffer = file.getChannel().map(FileChannel.MapMode.READ_ONLY, startByte, len);
-            int done = 0;
-            // read first partial line
-            if (startByte > 0) {
-                for (int i = 0; i < MAX_LINE_LENGTH; i++) {
-                    byte b = buffer.get(i);
-                    if (b == '\n') {
-                        done = i + 1;
-                        break;
-                    }
+    public static List<List<Stat>> parseStats(long startByte, long endByte, MemorySegment buffer) {
+        // read first partial line
+        if (startByte > 0) {
+            for (int i = 0; i < MAX_LINE_LENGTH; i++) {
+                byte b = buffer.get(JAVA_BYTE, startByte++);
+                if (b == '\n') {
+                    break;
                 }
             }
+        }
 
-            List<List<Stat>> stations = new ArrayList<>(MAX_STATIONS);
-            for (int i = 0; i < MAX_STATIONS; i++)
-                stations.add(null);
-
-            // Handle reading the very last line in the file
-            // this allows us to not worry about reading a long beyond the end
-            // in the inner loop (reducing branches)
-            // We only need to read one because the min record size is 6 bytes
-            // so 2nd last record must be > 8 from end
-            if (endByte == file.length()) {
-                int offset = (int) (file.length() - startByte - 1);
-                while (buffer.get(offset) != '\n') // final new line
-                    offset--;
-                offset--;
-                while (offset > 0 && buffer.get(offset) != '\n') // end of second last line
-                    offset--;
-                maxDone = offset;
-                if (offset > 0)
-                    offset++;
-                // copy into a 8n sized buffer to avoid reading off end
-                int roundedSize = (int) (file.length() - startByte) - offset;
-                roundedSize = (roundedSize + 7) / 8 * 8;
-                byte[] end = new byte[roundedSize];
-                for (int i = offset; i < (int) (file.length() - startByte); i++)
-                    end[i - offset] = buffer.get(i);
-                Stat station = parseStation(0, ByteBuffer.wrap(end), stations);
-                processTemperature(offset + station.name.length + 1, buffer, station);
-            }
+        List<List<Stat>> stations = new ArrayList<>(MAX_STATIONS);
+        for (int i = 0; i < MAX_STATIONS; i++)
+            stations.add(null);
 
-            int lineStart = done;
-            while (lineStart < maxDone) {
-                Stat station = parseStation(lineStart, buffer, stations);
-                lineStart = processTemperature(lineStart + station.name.length + 1, buffer, station);
-            }
-            return stations;
+        // Handle reading the very last line in the file
+        // this allows us to not worry about reading a long beyond the end
+        // in the inner loop (reducing branches)
+        // We only need to read one because the min record size is 6 bytes
+        // so 2nd last record must be > 8 from end
+        if (endByte == buffer.byteSize()) {
+            endByte -= 2; // skip final new line
+            while (endByte > 0 && buffer.get(JAVA_BYTE, endByte) != '\n')
+                endByte--;
+
+            if (endByte > 0)
+                endByte++;
+            // copy into a 8n sized buffer to avoid reading off end
+            MemorySegment end = Arena.global().allocate(MAX_LINE_LENGTH + 4);
+            for (long i = endByte; i < buffer.byteSize(); i++)
+                end.set(JAVA_BYTE, i - endByte, buffer.get(JAVA_BYTE, i));
+            Stat station = parseStation(0, end, stations);
+            processTemperature(station.name.length + 1, end, station);
         }
-        catch (IOException e) {
-            throw new RuntimeException(e);
+
+        while (startByte < endByte) {
+            Stat station = parseStation(startByte, buffer, stations);
+            startByte = processTemperature(startByte + station.name.length + 1, buffer, station);
         }
+        return stations;
     }
 
     public static class Stat {
@@ -227,9 +298,12 @@ public static class Stat {
         int count = 0;
         short min = Short.MAX_VALUE, max = Short.MIN_VALUE;
         long total = 0;
+        final long first8, second8;
 
-        public Stat(byte[] name) {
+        public Stat(byte[] name, long first8, long second8) {
             this.name = name;
+            this.first8 = first8;
+            this.second8 = second8;
         }
 
         public void add(short value) {
@@ -263,4 +337,4 @@ public String toString() {
             return round((double) min) + "/" + round(((double) total) / count) + "/" + round((double) max);
         }
     }
-}
+}
\ No newline at end of file

From ec27a47ce1226a653ab205db846e2ca84171bf5f Mon Sep 17 00:00:00 2001
From: Roman Musin <995612+roman-r-m@users.noreply.github.com>
Date: Fri, 19 Jan 2024 16:20:57 +0000
Subject: [PATCH 061/268] Version 4 - roman-r-m (#484)

* Version 3

* trying to optimize memory access (-0.2s)

- use smaller segments confined to thread
- unload in parallel

* Only call MemorySegment.address() once (~200ms)
---
 .../onebrc/CalculateAverage_roman_r_m.java    | 64 +++++++++++--------
 1 file changed, 38 insertions(+), 26 deletions(-)

diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_roman_r_m.java b/src/main/java/dev/morling/onebrc/CalculateAverage_roman_r_m.java
index 2efb46120..5c4382458 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_roman_r_m.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_roman_r_m.java
@@ -71,10 +71,15 @@ static class Worker {
         private final long end;
         private long offset;
 
-        public Worker(MemorySegment ms, long start, long end) {
-            this.ms = ms.asSlice(start, end - start);
-            this.offset = 0;
-            this.end = end - start;
+        public Worker(FileChannel channel, long start, long end) {
+            try {
+                this.ms = channel.map(FileChannel.MapMode.READ_ONLY, start, end - start, Arena.ofConfined());
+                this.offset = ms.address();
+                this.end = ms.address() + end - start;
+            }
+            catch (Exception e) {
+                throw new RuntimeException(e);
+            }
         }
 
         private void parseName(ByteString station) {
@@ -82,7 +87,7 @@ private void parseName(ByteString station) {
             long pos = -1;
 
             while (end - offset > 8) {
-                long next = UNSAFE.getLong(ms.address() + offset);
+                long next = UNSAFE.getLong(offset);
                 pos = find(next, SEMICOLON_MASK);
                 if (pos >= 0) {
                     offset += pos;
@@ -93,7 +98,7 @@ private void parseName(ByteString station) {
                 }
             }
             if (pos < 0) {
-                while (UNSAFE.getByte(ms.address() + offset++) != ';') {
+                while (UNSAFE.getByte(offset++) != ';') {
                 }
                 offset--;
             }
@@ -107,7 +112,7 @@ private void parseName(ByteString station) {
         }
 
         long parseNumberFast() {
-            long encodedVal = UNSAFE.getLong(ms.address() + offset);
+            long encodedVal = UNSAFE.getLong(offset);
 
             var len = find(encodedVal, LINE_END_MASK);
             offset += len + 1;
@@ -127,12 +132,12 @@ long parseNumberFast() {
         }
 
         long parseNumberSlow() {
-            long val = UNSAFE.getByte(ms.address() + offset++) - '0';
+            long val = UNSAFE.getByte(offset++) - '0';
             byte b;
-            while ((b = UNSAFE.getByte(ms.address() + offset++)) != '.') {
+            while ((b = UNSAFE.getByte(offset++)) != '.') {
                 val = val * 10 + (b - '0');
             }
-            b = UNSAFE.getByte(ms.address() + offset);
+            b = UNSAFE.getByte(offset);
             val = val * 10 + (b - '0');
             offset += 2;
             return val;
@@ -140,7 +145,7 @@ long parseNumberSlow() {
 
         long parseNumber() {
             long val;
-            int neg = 1 - Integer.bitCount(UNSAFE.getByte(ms.address() + offset) & 0x10);
+            int neg = 1 - Integer.bitCount(UNSAFE.getByte(offset) & 0x10);
             offset += neg;
 
             if (end - offset > 8) {
@@ -178,18 +183,27 @@ public static void main(String[] args) throws Exception {
         long fileSize = new File(FILE).length();
 
         var channel = FileChannel.open(Paths.get(FILE));
-        MemorySegment ms = channel.map(FileChannel.MapMode.READ_ONLY, 0, fileSize, Arena.ofAuto());
+        MemorySegment ms = channel.map(FileChannel.MapMode.READ_ONLY, 0, fileSize, Arena.ofConfined());
 
         int numThreads = fileSize > Integer.MAX_VALUE ? Runtime.getRuntime().availableProcessors() : 1;
         long chunk = fileSize / numThreads;
 
+        var bounds = IntStream.range(0, numThreads).mapToLong(i -> {
+            boolean lastChunk = i == numThreads - 1;
+            return lastChunk ? fileSize : nextNewline((i + 1) * chunk, ms);
+        }).toArray();
+
+        ms.unload();
+
         var result = IntStream.range(0, numThreads)
                 .parallel()
                 .mapToObj(i -> {
-                    boolean lastChunk = i == numThreads - 1;
-                    long chunkStart = i == 0 ? 0 : nextNewline(i * chunk, ms) + 1;
-                    long chunkEnd = lastChunk ? fileSize : nextNewline((i + 1) * chunk, ms);
-                    return new Worker(ms, chunkStart, chunkEnd).run();
+                    long start = i == 0 ? 0 : bounds[i - 1] + 1;
+                    long end = bounds[i];
+                    Worker worker = new Worker(channel, start, end);
+                    var res = worker.run();
+                    worker.ms.unload();
+                    return res;
                 }).reduce((m1, m2) -> {
                     m2.forEach((k, v) -> m1.merge(k, v, ResultRow::merge));
                     return m1;
@@ -212,7 +226,7 @@ static final class ByteString {
         @Override
         public String toString() {
             var bytes = new byte[len];
-            UNSAFE.copyMemory(null, ms.address() + offset, bytes, Unsafe.ARRAY_BYTE_BASE_OFFSET, len);
+            UNSAFE.copyMemory(null, offset, bytes, Unsafe.ARRAY_BYTE_BASE_OFFSET, len);
             return new String(bytes, 0, len);
         }
 
@@ -238,23 +252,21 @@ public boolean equals(Object o) {
 
             int i = 0;
 
-            long base1 = ms.address() + offset;
-            long base2 = ms.address() + that.offset;
             for (; i + 7 < len; i += 8) {
-                long l1 = UNSAFE.getLong(base1 + i);
-                long l2 = UNSAFE.getLong(base2 + i);
+                long l1 = UNSAFE.getLong(offset + i);
+                long l2 = UNSAFE.getLong(that.offset + i);
                 if (l1 != l2) {
                     return false;
                 }
             }
             if (len >= 8) {
-                long l1 = UNSAFE.getLong(base1 + len - 8);
-                long l2 = UNSAFE.getLong(base2 + len - 8);
+                long l1 = UNSAFE.getLong(offset + len - 8);
+                long l2 = UNSAFE.getLong(that.offset + len - 8);
                 return l1 == l2;
             }
             for (; i < len; i++) {
-                byte i1 = UNSAFE.getByte(base1 + i);
-                byte i2 = UNSAFE.getByte(base2 + i);
+                byte i1 = UNSAFE.getByte(offset + i);
+                byte i2 = UNSAFE.getByte(that.offset + i);
                 if (i1 != i2) {
                     return false;
                 }
@@ -265,7 +277,7 @@ public boolean equals(Object o) {
         @Override
         public int hashCode() {
             if (hash == 0) {
-                long h = UNSAFE.getLong(ms.address() + offset);
+                long h = UNSAFE.getLong(offset);
                 h = Long.reverseBytes(h) >>> (8 * Math.max(0, 8 - len));
                 hash = (int) (h ^ (h >>> 32));
             }

From f5c97506119475e67b1fdd790d8a1eb834137da4 Mon Sep 17 00:00:00 2001
From: Gunnar Morling <gunnar.morling@googlemail.com>
Date: Fri, 19 Jan 2024 17:21:37 +0100
Subject: [PATCH 062/268] Leaderboard update

---
 README.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index b86823432..3de3ef7e7 100644
--- a/README.md
+++ b/README.md
@@ -50,6 +50,7 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:03.258 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_merykitty.java)| 21.0.1-open | [Quan Anh Mai](https://github.com/merykitty) |  |
 |   | 00:03.376 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_mtopolnik.java)| 21.0.1-graal | [Marko Topolnik](https://github.com/mtopolnik) |  |
 |   | 00:03.714 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_hundredwatt.java)| 21.0.1-graal | [Jason Nochlin](https://github.com/hundredwatt) |  |
+|   | 00:03.718 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_zerninv.java)| 21.0.1-graal | [zerninv](https://github.com/zerninv) |  |
 |   | 00:03.959 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_gonix.java)| 21.0.1-open | [gonix](https://github.com/gonix) |  |
 |   | 00:04.066 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_JesseVanRooy.java)| 21.0.1-open | [JesseVanRooy](https://github.com/JesseVanRooy) |  |
 |   | 00:04.154 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_iziamos.java)| 21.0.1-open | [John Ziamos](https://github.com/iziamos) |  |
@@ -59,11 +60,10 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:04.959 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_yavuztas.java)| 21.0.1-graal | [Yavuz Tas](https://github.com/yavuztas) |  |
 |   | 00:05.089 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_vemana.java)| 21.0.1-graal | [Subrahmanyam](https://github.com/vemana) |  |
 |   | 00:05.142 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_arjenw.java)| 21.0.1-open | [Arjen Wisse](https://github.com/arjenw) |  |
-|   | 00:05.175 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_zerninv.java)| 21.0.1-graal | [zerninv](https://github.com/zerninv) |  |
 |   | 00:05.235 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_unbounded.java)| 21.0.1-open | [unbounded](https://github.com/unbounded) |  |
+|   | 00:05.283 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_roman-r-m.java)| 21.0.1-graal | [Roman Musin](https://github.com/roman-r-m) |  |
 |   | 00:05.336 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_plevart.java)| 21.0.1-tem | [Peter Levart](https://github.com/plevart) |  |
 |   | 00:05.478 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_obourgain.java)| 21.0.1-open | [Olivier Bourgain](https://github.com/obourgain) |  |
-|   | 00:05.764 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_roman-r-m.java)| 21.0.1-graal | [Roman Musin](https://github.com/roman-r-m) |  |
 |   | 00:05.887 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_charlibot.java)| 21.0.1-graal | [Charlie Evans](https://github.com/charlibot) |  |
 |   | 00:05.960 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_vaidhy.java)| 21.0.1-graal | [Vaidhy Mayilrangam](https://github.com/vaidhy) |  |
 |   | 00:05.979 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_spullara.java)| 21.0.1-graal | [Sam Pullara](https://github.com/spullara) |  |
@@ -110,13 +110,14 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:12.568 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_MeanderingProgrammer.java)| 21.0.1-graal | [Vlad](https://github.com/MeanderingProgrammer) |  |
 |   | 00:12.582 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_karthikeyan97.java)| 21.0.1-open | [karthikeyan97](https://github.com/karthikeyan97) |  |
 |   | 00:13.013 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_thanhtrinity.java)| 21.0.1-graal | [Thanh Duong](https://github.com/thanhtrinity) |  |
-|   | 00:13.763 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ianopolous.java)| 21.0.1-open | [Dr Ian Preston](https://github.com/ianopolous) |  |
+|   | 00:13.071 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ianopolous.java)| 21.0.1-open | [Dr Ian Preston](https://github.com/ianopolous) |  |
 |   | 00:13.817 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_entangled90.java)| 21.0.1-open | [Carlo](https://github.com/entangled90) |  |
 |   | 00:14.225 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_netrunnereve.java)| 21.0.1-open | [Eve](https://github.com/netrunnereve) |  |
 |   | 00:14.502 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_eriklumme.java)| 21.0.1-graal | [eriklumme](https://github.com/eriklumme) |  |
 |   | 00:14.772 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_kevinmcmurtrie.java)| 21.0.1-open | [Kevin McMurtrie](https://github.com/kevinmcmurtrie) |  |
 |   | 00:14.867 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_berry120.java)| 21.0.1-open | [Michael Berry](https://github.com/berry120) |  |
 |   | 00:15.662 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_semotpan.java)| 21.0.1-open | [Serghei Motpan](https://github.com/semotpan) |  |
+|   | 00:17.179 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jgrateron.java)| 21.0.1-open | [Jairo Graterón](https://github.com/jgrateron) |  |
 |   | 00:17.490 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_kgeri.java)| 21.0.1-open | [Gergely Kiss](https://github.com/kgeri) |  |
 |   | 00:17.255 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_tkosachev.java)| 21.0.1-open | [tkosachev](https://github.com/tkosachev) |  |
 |   | 00:17.520 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_faridtmammadov.java)| 21.0.1-open | [Farid](https://github.com/faridtmammadov) |  |
@@ -124,7 +125,6 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:17.815 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_hallvard.java)| 21.0.1-open | [Hallvard Trætteberg](https://github.com/hallvard) |  |
 |   | 00:17.932 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_plbpietrz.java)| 21.0.1-open | [Bartłomiej Pietrzyk](https://github.com/plbpietrz) |  |
 |   | 00:18.251 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_seijikun.java)| 21.0.1-graal | [Markus Ebner](https://github.com/seijikun) |  |
-|   | 00:18.313 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jgrateron.java)| 21.0.1-open | [Jairo Graterón](https://github.com/jgrateron) |  |
 |   | 00:18.448 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_moysesb.java)| 21.0.1-open | [Moysés Borges Furtado](https://github.com/moysesb) |  |
 |   | 00:18.771 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_davecom.java)| 21.0.1-graal | [David Kopec](https://github.com/davecom) |  |
 |   | 00:18.902 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_maximz101.java)| 21.0.1-graal | [Maxime](https://github.com/maximz101) |  |

From 836f0805ad40956f704d80caa068c460fc30da5b Mon Sep 17 00:00:00 2001
From: Antonio Goncalves <antonio.goncalves@gmail.com>
Date: Fri, 19 Jan 2024 21:26:12 +0100
Subject: [PATCH 063/268] GitHub Copilot Chat with the help of agoncal (#460)

* v1 - Initial prompt

* Introduce Records

* v1 - Initial prompt

* v2 - Introduce Records

* v3 - Improves code

* v4 - Improves JVM parameter

* GitHub Copilot Chat with the help of agoncal

* Format

* Pass measurements-rounding

* Added prepare script
---
 calculate_average_agoncal.sh                  |  21 +++
 prepare_agoncal.sh                            |  19 +++
 .../onebrc/CalculateAverage_agoncal.java      | 153 ++++++++++++++++++
 3 files changed, 193 insertions(+)
 create mode 100755 calculate_average_agoncal.sh
 create mode 100755 prepare_agoncal.sh
 create mode 100644 src/main/java/dev/morling/onebrc/CalculateAverage_agoncal.java

diff --git a/calculate_average_agoncal.sh b/calculate_average_agoncal.sh
new file mode 100755
index 000000000..9a295fc46
--- /dev/null
+++ b/calculate_average_agoncal.sh
@@ -0,0 +1,21 @@
+#!/bin/sh
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+# sdk use java 21.0.1-tem
+
+JAVA_OPTS="--enable-preview -XX:+UseShenandoahGC -XX:+UseStringDeduplication -da"
+java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_agoncal
\ No newline at end of file
diff --git a/prepare_agoncal.sh b/prepare_agoncal.sh
new file mode 100755
index 000000000..d2a3c6ba1
--- /dev/null
+++ b/prepare_agoncal.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+source "$HOME/.sdkman/bin/sdkman-init.sh"
+sdk use java 21.0.1-tem 1>&2
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_agoncal.java b/src/main/java/dev/morling/onebrc/CalculateAverage_agoncal.java
new file mode 100644
index 000000000..fe6a0a64d
--- /dev/null
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_agoncal.java
@@ -0,0 +1,153 @@
+/*
+ *  Copyright 2023 The original authors
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package dev.morling.onebrc;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Paths;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.TreeMap;
+import java.util.concurrent.ConcurrentHashMap;
+
+/**
+ * This is the solution from GitHub Copilot Chat with the help of Antonio Goncalves (prompting and guiding, but trying not to change code directly on my own, always using Copilot).
+ * <p>
+ * List of prompts that have been used:
+ * <p>
+ * =============
+ * =============
+ * =============
+ * v1 - 73603 ms
+ * You are entering The One Billion Row Challenge (1BRC) which is an exploration of how far modern Java can be pushed for aggregating one billion rows from a text file. Grab all the (virtual) threads, reach out to SIMD, optimize the GC, or pull any other trick, and create the fastest implementation for solving this task!
+ * The text file contains temperature values for a range of weather stations. Each row is one measurement in the format <string: station name>;<double: measurement>, with the measurement value having exactly one fractional digit. The following delimited with --- shows ten rows as an example:
+ * ---
+ * Hamburg;12.0
+ * Bulawayo;8.9
+ * Palembang;38.8
+ * St. John's;15.2
+ * Cracow;12.6
+ * Bridgetown;26.9
+ * Istanbul;6.2
+ * Roseau;34.4
+ * Conakry;31.2
+ * Istanbul;23.0
+ * ---
+ * You have to write a Java program which reads the file, calculates the min, mean, and max temperature value per weather station, and emits the results on stdout like the result below delimited by --- (i.e. sorted alphabetically by station name, and the result values per station in the format <min>/<mean>/<max>, rounded to one fractional digit). Notice the curly braces:
+ * ---
+ * {Abha=-23.0/18.0/59.2, Abidjan=-16.2/26.0/67.3, Abéché=-10.0/29.4/69.0, Accra=-10.1/26.4/66.4, Addis Ababa=-23.7/16.0/67.0, Adelaide=-27.8/17.3/58.5, ...}
+ * ---
+ * You must use Java 21.
+ * Create an algorithm in any way you see fit including parallelizing the computation, using the (incubating) Vector API, memory-mapping different sections of the file concurrently, using AppCDS, GraalVM, CRaC, etc. for speeding up the application start-up, choosing and tuning the garbage collector, and much more.
+ * No external library dependencies may be used.
+ * =============
+ * =============
+ * =============
+ * (Here I had to chat with Copilot about formatting the output: commas were missing, and the curly brackets were missing as well)
+ * =============
+ * =============
+ * =============
+ * v2 - 71831 ms
+ * Being written in Java 21, please use records instead of classes for Measurement.
+ * =============
+ * =============
+ * =============
+ * v3 - 69333 ms
+ * If the temperatures are small numbers, why use double? Can't you use another datatype ?
+ * <p>
+ * The profiler mentions that this line of code has very bad performance. Can you refactor it so it has better performance:
+ * ---
+ * String[] parts = line.split(";")
+ * ---
+ * <p>
+ * There is a maximum of 10000 unique station names. Can you optimize the code taking this into account?
+ * =============
+ * =============
+ * =============
+ * v4 - 56417 ms
+ * Which parameters can I pass to the JVM to make it run faster ?
+ * Which GC can I use and what is the most optimized to run CalculateAverage ?
+ */
+public class CalculateAverage_agoncal {
+
+    private static final String FILE = "./measurements.txt";
+
+    record Measurement(String station, double temperature) {
+    }
+
+    static class StationStats {
+        double min;
+        double max;
+        double sum;
+        int count;
+
+        public StationStats(double temperature) {
+            this.min = temperature;
+            this.max = temperature;
+            this.sum = 0;
+            this.count = 0;
+        }
+
+        synchronized void update(double temperature) {
+            min = Math.min(min, temperature);
+            max = Math.max(max, temperature);
+            sum += temperature;
+            count++;
+        }
+
+        double getAverage() {
+            return round(sum) / count;
+        }
+
+        @Override
+        public String toString() {
+            return String.format("%.1f/%.1f/%.1f", round(min), round(getAverage()), round(max));
+        }
+    }
+
+    public static void main(String[] args) throws IOException {
+        Map<String, StationStats> stats = new ConcurrentHashMap<>(10_000);
+        try (BufferedReader reader = Files.newBufferedReader(Paths.get(FILE))) {
+            reader.lines().parallel().forEach(line -> {
+                int separatorIndex = line.indexOf(';');
+                String station = line.substring(0, separatorIndex);
+                String temperature = line.substring(separatorIndex + 1);
+                Measurement m = new Measurement(station, Double.parseDouble(temperature));
+                stats.computeIfAbsent(m.station, k -> new StationStats(m.temperature)).update(m.temperature);
+            });
+        }
+
+        TreeMap<String, StationStats> sortedStats = new TreeMap<>(stats);
+        Iterator<Map.Entry<String, StationStats>> iterator = sortedStats.entrySet().iterator();
+        System.out.print("{");
+        while (iterator.hasNext()) {
+            Map.Entry<String, StationStats> entry = iterator.next();
+            StationStats s = entry.getValue();
+            if (iterator.hasNext()) {
+                System.out.printf("%s=%s, ", entry.getKey(), s.toString());
+            }
+            else {
+                System.out.printf("%s=%s", entry.getKey(), s.toString());
+            }
+        }
+        System.out.println("}");
+    }
+
+    private static double round(double value) {
+        return Math.round(value * 10.0) / 10.0;
+    }
+}

From f6bcaae4b99bca976e5facefb20649ea085a458d Mon Sep 17 00:00:00 2001
From: kumarsaurav123 <kumar.saurav@eko.co.in>
Date: Sat, 20 Jan 2024 02:05:25 +0530
Subject: [PATCH 064/268] kumarsaurav123  # Attempt 3 (#470)

* Use Memory Segment

* Reduce Number of threads
---
 calculate_average_kumarsaurav123.sh           |   2 +-
 .../CalculateAverage_kumarsaurav123.java      | 277 +++++++++---------
 2 files changed, 138 insertions(+), 141 deletions(-)

diff --git a/calculate_average_kumarsaurav123.sh b/calculate_average_kumarsaurav123.sh
index 1c823e5bd..4567dcf28 100755
--- a/calculate_average_kumarsaurav123.sh
+++ b/calculate_average_kumarsaurav123.sh
@@ -16,6 +16,6 @@
 #
 
 
-JAVA_OPTS="-Xms6G -Xmx16G"
+JAVA_OPTS="-Xms16G -Xmx32G --enable-preview"
 
 java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_kumarsaurav123
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_kumarsaurav123.java b/src/main/java/dev/morling/onebrc/CalculateAverage_kumarsaurav123.java
index 5b59d057c..f991f9f8b 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_kumarsaurav123.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_kumarsaurav123.java
@@ -15,18 +15,20 @@
  */
 package dev.morling.onebrc;
 
+import java.io.IOException;
 import java.io.RandomAccessFile;
-import java.nio.ByteBuffer;
-import java.nio.ByteOrder;
+import java.lang.foreign.Arena;
+import java.lang.foreign.MemorySegment;
+import java.lang.foreign.ValueLayout;
+import java.nio.channels.FileChannel;
 import java.nio.charset.StandardCharsets;
-import java.nio.file.Paths;
 import java.util.*;
 import java.util.concurrent.ConcurrentSkipListMap;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Executors;
 import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicInteger;
 import java.util.stream.Collector;
-import java.util.stream.IntStream;
 
 import static java.util.stream.Collectors.groupingBy;
 
@@ -40,7 +42,10 @@ private Measurement(String[] parts) {
         }
     }
 
-    private static record ResultRow(String station,double min, double mean, double max,double sum,double count) {
+    private static record Pair(long start, int size) {
+    }
+
+    private static record ResultRow(String station, double min, double mean, double max, double sum, double count) {
         public String toString() {
             return round(min) + "/" + round(mean) + "/" + round(max);
         }
@@ -61,18 +66,13 @@ private static class MeasurementAggregator {
         private String station;
     }
 
-    public static void main(String[] args) {
-        HashMap<Byte, Integer> map = new HashMap<>();
-        map.put((byte) 48, 0);
-        map.put((byte) 49, 1);
-        map.put((byte) 50, 2);
-        map.put((byte) 51, 3);
-        map.put((byte) 52, 4);
-        map.put((byte) 53, 5);
-        map.put((byte) 54, 6);
-        map.put((byte) 55, 7);
-        map.put((byte) 56, 8);
-        map.put((byte) 57, 9);
+    public static void main(String[] args) throws IOException {
+        long start = System.currentTimeMillis();
+        System.out.println(run(FILE));
+        // System.out.println(System.currentTimeMillis() - start);
+    }
+
+    public static String run(String filePath) throws IOException {
         Collector<ResultRow, MeasurementAggregator, ResultRow> collector2 = Collector.of(
                 MeasurementAggregator::new,
                 (a, m) -> {
@@ -91,7 +91,7 @@ public static void main(String[] args) {
                     return res;
                 },
                 agg -> {
-                    return new ResultRow(agg.station, agg.min, agg.sum / agg.count, agg.max, agg.sum, agg.count);
+                    return new ResultRow(agg.station, agg.min, (Math.round(agg.sum * 10.0) / 10.0) / agg.count, agg.max, agg.sum, agg.count);
                 });
         Collector<Measurement, MeasurementAggregator, ResultRow> collector = Collector.of(
                 MeasurementAggregator::new,
@@ -114,143 +114,140 @@ public static void main(String[] args) {
                 agg -> {
                     return new ResultRow(agg.station, agg.min, agg.sum / agg.count, agg.max, agg.sum, agg.count);
                 });
-
-        long start = System.currentTimeMillis();
-        long len = Paths.get(FILE).toFile().length();
-        Map<Integer, List<byte[]>> leftOutsMap = new ConcurrentSkipListMap<>();
-        int chunkSize = 1_0000_00;
-        long proc = Math.max(1, (len / chunkSize));
-        ExecutorService executor = Executors.newFixedThreadPool(Runtime.getRuntime().availableProcessors() * 2 * 2 * 2);
+        ExecutorService executorService = Executors.newFixedThreadPool(Runtime.getRuntime().availableProcessors() * 2);
         List<ResultRow> measurements = Collections.synchronizedList(new ArrayList<ResultRow>());
-        IntStream.range(0, (int) proc)
-                .mapToObj(i -> {
-                    return new Runnable() {
-                        @Override
-                        public void run() {
-                            try {
-                                RandomAccessFile file = new RandomAccessFile(FILE, "r");
-                                byte[] allBytes2 = new byte[chunkSize];
-                                file.seek((long) i * (long) chunkSize);
-                                int l = file.read(allBytes2);
-                                byte[] eol = "\n".getBytes(StandardCharsets.UTF_8);
-                                byte[] sep = ";".getBytes(StandardCharsets.UTF_8);
-
-                                List<Measurement> mst = new ArrayList<>();
-                                int st = 0;
-                                int cnt = 0;
-                                ArrayList<byte[]> local = new ArrayList<>();
-
-                                for (int i = 0; i < l; i++) {
-                                    if (allBytes2[i] == eol[0]) {
-                                        if (i != 0) {
-                                            byte[] s2 = new byte[i - st];
-                                            System.arraycopy(allBytes2, st, s2, 0, s2.length);
-                                            if (cnt != 0) {
-                                                for (int j = 0; j < s2.length; j++) {
-                                                    if (s2[j] == sep[0]) {
-                                                        byte[] city = new byte[j];
-                                                        byte[] value = new byte[s2.length - j - 1];
-                                                        System.arraycopy(s2, 0, city, 0, city.length);
-                                                        System.arraycopy(s2, city.length + 1, value, 0, value.length);
-                                                        double d = 0.0;
-                                                        int s = -1;
-                                                        for (int k = value.length - 1; k >= 0; k--) {
-                                                            if (value[k] == 45) {
-                                                                d = d * -1;
-                                                            }
-                                                            else if (value[k] == 46) {
-                                                            }
-                                                            else {
-                                                                d = d + map.get(value[k]).intValue() * Math.pow(10, s);
-                                                                s++;
-                                                            }
-                                                        }
-                                                        mst.add(new Measurement(new String(city), d));
-
-                                                    }
-                                                }
-
-                                            }
-                                            else {
-                                                local.add(s2);
-                                            }
+        int chunkSize = 1_0000_00;
+        Map<Integer, List<byte[]>> leftOutsMap = new ConcurrentSkipListMap<>();
+        RandomAccessFile file = new RandomAccessFile(filePath, "r");
+        long filelength = file.length();
+        AtomicInteger kk = new AtomicInteger();
+        MemorySegment memorySegment = file.getChannel().map(FileChannel.MapMode.READ_ONLY, 0, filelength, Arena.global());
+        int nChunks = 1000;
+
+        int pChunkSize = Math.min(Integer.MAX_VALUE, (int) (memorySegment.byteSize() / (1000 * 20)));
+        if (pChunkSize < 100) {
+            pChunkSize = (int) memorySegment.byteSize();
+            nChunks = 1;
+        }
+        ArrayList<Pair> chunks = createStartAndEnd(pChunkSize, nChunks, memorySegment);
+        chunks.stream()
+                .map(p -> {
 
-                                        }
-                                        cnt++;
-                                        st = i + 1;
-                                    }
-                                }
-                                if (st < l) {
-                                    byte[] s2 = new byte[allBytes2.length - st];
-                                    System.arraycopy(allBytes2, st, s2, 0, s2.length);
-                                    local.add(s2);
-                                }
-                                leftOutsMap.put(i, local);
-                                allBytes2 = null;
-                                measurements.addAll(mst.stream()
-                                        .collect(groupingBy(Measurement::station, collector))
-                                        .values());
-                                // System.out.println(measurements.size());
-                            }
-                            catch (Exception e) {
-                                // throw new RuntimeException(e);
-                                System.out.println("");
-                            }
-                        }
-                    };
+                    return createRunnable(memorySegment, p, collector, measurements, kk.getAndIncrement());
                 })
-                .forEach(executor::submit);
-        executor.shutdown();
-
+                .forEach(executorService::submit);
+        executorService.shutdown();
         try {
-            executor.awaitTermination(10, TimeUnit.MINUTES);
+            executorService.awaitTermination(10, TimeUnit.MINUTES);
         }
         catch (InterruptedException e) {
             throw new RuntimeException(e);
         }
-        Collection<Measurement> lMeasure = new ArrayList<>();
-        List<byte[]> leftOuts = leftOutsMap.values()
-                .stream()
-                .flatMap(List::stream)
-                .toList();
-        int size = 0;
-        for (int i = 0; i < leftOuts.size(); i++) {
-            size = size + leftOuts.get(i).length;
-        }
-        byte[] allBytes = new byte[size];
-        int pos = 0;
-        for (int i = 0; i < leftOuts.size(); i++) {
-            System.arraycopy(leftOuts.get(i), 0, allBytes, pos, leftOuts.get(i).length);
-            pos = pos + leftOuts.get(i).length;
-        }
-        List<String> l = Arrays.asList(new String(allBytes).split(";"));
-        List<Measurement> measurements1 = new ArrayList<>();
-        String city = l.get(0);
-        for (int i = 0; i < l.size() - 1; i++) {
-            int sIndex = l.get(i + 1).indexOf('.') + 2;
-
-            String tempp = l.get(i + 1).substring(0, sIndex);
 
-            measurements1.add(new Measurement(city, Double.parseDouble(tempp)));
-            city = l.get(i + 1).substring(sIndex);
-        }
-        measurements.addAll(measurements1.stream()
-                .collect(groupingBy(Measurement::station, collector))
-                .values());
         Map<String, ResultRow> measurements2 = new TreeMap<>(measurements
                 .stream()
                 .parallel()
                 .collect(groupingBy(ResultRow::station, collector2)));
+        return measurements2.toString();
+    }
 
-        // Read from bytes 1000 to 2000
-        // Something like this
+    private static ArrayList<Pair> createStartAndEnd(int chunksize, int nChunks, MemorySegment memorySegment) {
+        ArrayList<Pair> startSizePairs = new ArrayList<>();
+        byte eol = "\n".getBytes(StandardCharsets.UTF_8)[0];
+        long start = 0;
+        long end = -1;
+        if (nChunks == 1) {
+            startSizePairs.add(new Pair(0, chunksize));
+            return startSizePairs;
+        }
+        else {
+            while (start < memorySegment.byteSize()) {
+                start = end + 1;
+                end = Math.min(memorySegment.byteSize() - 1, start + chunksize - 1);
+                while (memorySegment.get(ValueLayout.JAVA_BYTE, end) != eol) {
+                    end--;
+
+                }
+                startSizePairs.add(new Pair(start, (int) (end - start + 1)));
+            }
+        }
+        return startSizePairs;
+    }
 
-        //
-        // Map<String, ResultRow> measurements = new TreeMap<>(Files.lines(Paths.get(FILE))
-        // .map(l -> new Measurement(l.split(";")))
-        // .collect(groupingBy(m -> m.station(), collector)));
+    public static Runnable createRunnable(MemorySegment memorySegment, Pair p, Collector<Measurement, MeasurementAggregator, ResultRow> collector,
+                                          List<ResultRow> measurements, int kk) {
+        return new Runnable() {
+            @Override
+            public void run() {
+                try {
+                    long start = System.currentTimeMillis();
+
+                    byte[] allBytes2 = new byte[p.size];
+                    MemorySegment lMemory = memorySegment.asSlice(p.start, p.size);
+                    lMemory.asByteBuffer().get(allBytes2);
+                    HashMap<Byte, Integer> map = new HashMap<>();
+                    // Runtime runtime = Runtime.getRuntime();
+                    // long memoryMax = runtime.maxMemory();
+                    // long memoryUsed = runtime.totalMemory() - runtime.freeMemory();
+                    // double memoryUsedPercent = (memoryUsed * 100.0) / memoryMax;
+                    // System.out.println("memoryUsedPercent: " + memoryUsedPercent);
+                    map.put((byte) 48, 0);
+                    map.put((byte) 49, 1);
+                    map.put((byte) 50, 2);
+                    map.put((byte) 51, 3);
+                    map.put((byte) 52, 4);
+                    map.put((byte) 53, 5);
+                    map.put((byte) 54, 6);
+                    map.put((byte) 55, 7);
+                    map.put((byte) 56, 8);
+                    map.put((byte) 57, 9);
+                    byte[] eol = "\n".getBytes(StandardCharsets.UTF_8);
+                    byte[] sep = ";".getBytes(StandardCharsets.UTF_8);
+
+                    List<Measurement> mst = new ArrayList<>();
+                    int st = 0;
+
+                    for (int i = 0; i < allBytes2.length; i++) {
+                        if (allBytes2[i] == eol[0]) {
+                            byte[] s2 = new byte[i - st];
+                            System.arraycopy(allBytes2, st, s2, 0, s2.length);
+                            for (int j = 0; j < s2.length; j++) {
+                                if (s2[j] == sep[0]) {
+                                    byte[] city = new byte[j];
+                                    byte[] value = new byte[s2.length - j - 1];
+                                    System.arraycopy(s2, 0, city, 0, city.length);
+                                    System.arraycopy(s2, city.length + 1, value, 0, value.length);
+                                    double d = 0.0;
+                                    int s = -1;
+                                    for (int k = value.length - 1; k >= 0; k--) {
+                                        if (value[k] == 45) {
+                                            d = d * -1;
+                                        }
+                                        else if (value[k] == 46) {
+                                        }
+                                        else {
+                                            d = d + map.get(value[k]).intValue() * Math.pow(10, s);
+                                            s++;
+                                        }
+                                    }
+                                    mst.add(new Measurement(new String(city), d));
 
-        System.out.println(measurements2);
-        // System.out.println(System.currentTimeMillis() - start);
+                                }
+                            }
+                            st = i + 1;
+                        }
+                    }
+                    // System.out.println("Task " + kk + "Completed in " + (System.currentTimeMillis() - start));
+                    measurements.addAll(mst.stream()
+                            .collect(groupingBy(Measurement::station, collector))
+                            .values());
+
+                }
+                catch (Exception e) {
+                    // throw new RuntimeException(e);
+                    System.out.println("");
+                }
+            }
+        };
     }
 }

From ce8fe41bd4b0c8c15b95bccd79dc652171d5505f Mon Sep 17 00:00:00 2001
From: Jin Cong Ho <jincongho@gmail.com>
Date: Fri, 19 Jan 2024 20:40:05 +0000
Subject: [PATCH 065/268] Submission #3: jincongho (#482)

---
 .../onebrc/CalculateAverage_jincongho.java    | 71 +++++++++++++------
 1 file changed, 48 insertions(+), 23 deletions(-)

diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_jincongho.java b/src/main/java/dev/morling/onebrc/CalculateAverage_jincongho.java
index d2a7e6609..0758703bc 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_jincongho.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_jincongho.java
@@ -31,7 +31,6 @@
 import java.nio.file.Path;
 import java.nio.file.StandardOpenOption;
 import java.util.*;
-import java.util.concurrent.ConcurrentHashMap;
 
 /**
  * Changelog (based on Macbook Pro Intel i7 6-cores 2.6GHz):
@@ -123,13 +122,31 @@ public static int findDelimiter(MemorySegment data, long offset) {
         // }
 
         // scalar implementation
+        // public static int hashCode(final MemorySegment array, final long offset, final short length) {
+        // final long limit = offset + length;
+        // int h = 1;
+        // for (long i = offset; i < limit; i++) {
+        // h = 31 * h + UNSAFE.getByte(array.address() + i);
+        // }
+        // return h;
+        // }
+
+        // fxhash
         public static int hashCode(final MemorySegment array, final long offset, final short length) {
-            final long limit = offset + length;
-            int h = 1;
-            for (long i = offset; i < limit; i++) {
-                h = 31 * h + UNSAFE.getByte(array.address() + i);
+            final int seed = 0x9E3779B9;
+            final int rotate = 5;
+
+            int x, y;
+            if (length >= Integer.BYTES) {
+                x = UNSAFE.getInt(array.address() + offset);
+                y = UNSAFE.getInt(array.address() + offset + length - Integer.BYTES);
             }
-            return h;
+            else {
+                x = UNSAFE.getByte(array.address() + offset);
+                y = UNSAFE.getByte(array.address() + offset + length - Byte.BYTES);
+            }
+
+            return (Integer.rotateLeft(x * seed, rotate) ^ y) * seed;
         }
 
         /** Vectorized Key Comparison **/
@@ -209,7 +226,7 @@ public void update(MemorySegment key, long keyStart, short keyLength, int keyHas
                 }
                 else {
                     index = (index + 1) & KEY_MASK;
-                    keyOffset += KEY_SIZE;
+                    keyOffset = KEYS.address() + (index * KEY_SIZE);
                 }
             }
 
@@ -254,7 +271,7 @@ public void mergeTo(ResultAggr result) {
      * Measurement Aggregation (for all partitions)
      * Simple Concurrent Hash Table so all partitions can merge concurrently
      */
-    protected static class ResultAggr extends ConcurrentHashMap<ResultAggr.ByteKey, ResultAggr.Measurement> {
+    protected static class ResultAggr extends HashMap<ResultAggr.ByteKey, ResultAggr.Measurement> {
 
         public static class ByteKey implements Comparable<ByteKey> {
             private final MemorySegment data;
@@ -270,10 +287,8 @@ public ByteKey(MemorySegment data, long offset, short length) {
 
             @Override
             public boolean equals(Object other) {
-                if (length != ((ByteKey) other).length)
-                    return false;
-
-                return !VectorUtils.notEquals(data, offset, ((ByteKey) other).data, ((ByteKey) other).offset, length, VectorUtils.BYTE_SPECIES);
+                return (length == ((ByteKey) other).length)
+                        && !VectorUtils.notEquals(data, offset, ((ByteKey) other).data, ((ByteKey) other).offset, length, VectorUtils.BYTE_SPECIES);
             }
 
             @Override
@@ -311,8 +326,8 @@ public String toString() {
 
         }
 
-        public ResultAggr(int initialCapacity, float loadFactor, int concurrencyLevel) {
-            super(initialCapacity, loadFactor, concurrencyLevel);
+        public ResultAggr(int initialCapacity, float loadFactor) {
+            super(initialCapacity, loadFactor);
         }
 
         public Map toSorted() {
@@ -326,9 +341,9 @@ protected static class Partition implements Runnable {
         private final MemorySegment data;
         private long offset;
         private final long limit;
-        private final ResultAggr result;
+        private final PartitionAggr result;
 
-        public Partition(MemorySegment data, long offset, long limit, ResultAggr result) {
+        public Partition(MemorySegment data, long offset, long limit, PartitionAggr result) {
             this.data = data;
             this.offset = offset;
             this.limit = limit;
@@ -338,7 +353,7 @@ public Partition(MemorySegment data, long offset, long limit, ResultAggr result)
         @Override
         public void run() {
             // measurement parsing
-            PartitionAggr aggr = new PartitionAggr();
+            final PartitionAggr aggr = this.result;
 
             // main loop (vectorized)
             final long loopLimit = limit - (VectorUtils.BYTE_SPECIES.length() * Math.ceilDiv(100, VectorUtils.BYTE_SPECIES.length()) + Long.BYTES);
@@ -402,7 +417,7 @@ public void run() {
             }
 
             // measurement result collection
-            aggr.mergeTo(result);
+            // aggr.mergeTo(result);
         }
 
     }
@@ -435,15 +450,25 @@ public static void main(String[] args) throws IOException, InterruptedException
 
             // partition aggregation
             var threadList = new Thread[processors];
-            ResultAggr result = new ResultAggr(1 << 14, 1, processors);
+            PartitionAggr[] partAggrs = new PartitionAggr[processors];
             for (int i = 0; i < processors; i++) {
-                threadList[i] = new Thread(new Partition(data, partition[i], partition[i + 1], result));
+                if (partition[i] == data.byteSize())
+                    break;
+
+                partAggrs[i] = new PartitionAggr();
+                threadList[i] = new Thread(new Partition(data, partition[i], partition[i + 1], partAggrs[i]));
                 threadList[i].start();
             }
-            for (var thread : threadList) {
-                thread.join();
-            }
 
+            // result
+            ResultAggr result = new ResultAggr(1 << 14, 1);
+            for (int i = 0; i < processors; i++) {
+                if (partition[i] == data.byteSize())
+                    break;
+
+                threadList[i].join();
+                partAggrs[i].mergeTo(result);
+            }
             System.out.println(result.toSorted());
         }
 

From 144a6af1645d8ae9b302463f3ad472a5b8a50d62 Mon Sep 17 00:00:00 2001
From: Eve <139727413+netrunnereve@users.noreply.github.com>
Date: Fri, 19 Jan 2024 20:44:22 +0000
Subject: [PATCH 066/268] netrunnereve: more optimizations (#485)

---
 .../onebrc/CalculateAverage_netrunnereve.java | 120 ++++++++++--------
 1 file changed, 67 insertions(+), 53 deletions(-)

diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_netrunnereve.java b/src/main/java/dev/morling/onebrc/CalculateAverage_netrunnereve.java
index e323a32ad..13919cfb6 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_netrunnereve.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_netrunnereve.java
@@ -21,15 +21,18 @@
 import java.nio.MappedByteBuffer;
 import java.nio.channels.FileChannel;
 import java.nio.charset.StandardCharsets;
-import java.lang.Math;
 import java.util.Map;
 import java.util.TreeMap;
+import java.util.concurrent.CountDownLatch;
+import java.lang.Math;
 
 public class CalculateAverage_netrunnereve {
 
     private static final String FILE = "./measurements.txt";
     private static final int NUM_THREADS = 8; // test machine
     private static final int LEN_EXTEND = 200; // guarantees a newline
+    private static final int HASHT_SIZE = 16384; // size of hash table, adjust tradeoff between colisions and cache utilization
+    private static final int DJB2_INIT = 5831;
 
     private static class MeasurementAggregator { // min, max, sum stored as 0.1/unit
         private MeasurementAggregator next = null; // linked list of entries for handling hash colisions
@@ -48,11 +51,11 @@ private static class ThreadCalcs {
 
     // djb2 hash
     private static int calc_hash(byte[] input, int len) {
-        int hash = 5831;
+        int hash = DJB2_INIT;
         for (int i = 0; i < len; i++) {
             hash = ((hash << 5) + hash) + Byte.toUnsignedInt(input[i]);
         }
-        return Math.abs(hash % 16384);
+        return Math.abs(hash % HASHT_SIZE);
     }
 
     private static class ThreadedParser extends Thread {
@@ -60,65 +63,39 @@ private static class ThreadedParser extends Thread {
         private int mbs;
         private ThreadCalcs[] threadOut;
         private int threadID;
+        private CountDownLatch tpLatch;
 
-        private ThreadedParser(MappedByteBuffer mbuf, int mbs, ThreadCalcs[] threadOut, int threadID) {
+        private ThreadedParser(MappedByteBuffer mbuf, int mbs, ThreadCalcs[] threadOut, int threadID, CountDownLatch tpLatch) {
             this.mbuf = mbuf;
             this.mbs = mbs;
             this.threadOut = threadOut;
             this.threadID = threadID;
+            this.tpLatch = tpLatch;
         }
 
         public void run() {
-            MeasurementAggregator[] hashSpace = new MeasurementAggregator[16384]; // 14-bit hash
+            MeasurementAggregator[] hashSpace = new MeasurementAggregator[HASHT_SIZE]; // hash table
             byte[] scratch = new byte[100]; // <= 100 characters in station name
             String[] staArr = new String[10000]; // max 10000 station names
             MeasurementAggregator ma = null;
 
             int numStations = 0;
-            boolean state = false; // 0 for station pickup, 1 for measurement pickup
             int negMul = 1;
             int head = 0;
             int tempCnt = -1; // 0 if 1 digit measurement, 1 if 2 digit
+            int hash = DJB2_INIT; // do calc_hash manually in loop
 
-            for (int i = 0; i < mbs; i++) {
+            int i = 0; // byte by byte iterator
+            while (true) {
                 byte cur = mbuf.get(i);
-                if (state == true) {
-                    if (cur == 46) { // .
-                        int tempa = mbuf.get(i + 1) - 48;
-                        tempa += (scratch[0] - 48) * (10 + 90 * tempCnt) + (scratch[1] - 48) * (10 * tempCnt); // branchless
-                        tempa *= negMul;
-
-                        if (tempa < ma.min) {
-                            ma.min = tempa;
-                        }
-                        if (tempa > ma.max) {
-                            ma.max = tempa;
-                        }
-                        ma.sum += tempa;
-                        ma.count++;
-
-                        i += 2; // go to start of new line
-                        state = false;
-                        negMul = 1;
-                        head = i + 1;
-                        tempCnt = -1;
-                    }
-                    else if (cur == 45) { // ascii -
-                        negMul = -1;
-                    }
-                    else {
-                        scratch[tempCnt + 1] = cur;
-                        tempCnt++;
-                    }
-                }
-                else if (cur == 59) { // ;
-                    int len = i - head;
+                if (cur == 59) { // ;
+                    hash = Math.abs(hash % HASHT_SIZE);
 
                     // this is faster than filling scratch immediately after each byte is read
+                    int len = i - head;
                     mbuf.position(head);
                     mbuf.get(scratch, 0, len);
 
-                    int hash = calc_hash(scratch, len);
                     ma = hashSpace[hash];
                     MeasurementAggregator prev = null;
 
@@ -146,14 +123,53 @@ else if ((len != ma.station.length) || (Arrays.compare(scratch, 0, len, ma.stati
                             break;
                         }
                     }
-                    state = true;
-                    head = i + 1;
+
+                    i++;
+                    while (true) {
+                        cur = mbuf.get(i);
+                        if (cur == 46) { // .
+                            int tempa = (negMul) * ((10 + 90 * tempCnt) * (scratch[0] - 48) + (10 * tempCnt) * (scratch[1] - 48) + (mbuf.get(i + 1) - 48)); // branchless
+
+                            if (tempa < ma.min) {
+                                ma.min = tempa;
+                            }
+                            if (tempa > ma.max) {
+                                ma.max = tempa;
+                            }
+                            ma.sum += tempa;
+                            ma.count++;
+
+                            // this line is finished!
+                            i += 2; // newline char
+                            hash = DJB2_INIT;
+                            negMul = 1;
+                            head = i + 1; // start of next line
+                            tempCnt = -1;
+                            break;
+                        }
+                        else if (cur == 45) { // ascii -
+                            negMul = -1;
+                        }
+                        else {
+                            scratch[tempCnt + 1] = cur;
+                            tempCnt++;
+                        }
+                        i++;
+                    }
+                    if (head >= mbs) {
+                        break;
+                    }
                 }
+                else {
+                    hash = ((hash << 5) + hash) + Byte.toUnsignedInt(cur);
+                }
+                i++;
             }
             threadOut[threadID] = new ThreadCalcs();
             threadOut[threadID].hashSpace = hashSpace;
             threadOut[threadID].staArr = staArr;
             threadOut[threadID].numStations = numStations;
+            tpLatch.countDown();
         }
     }
 
@@ -175,8 +191,8 @@ public static void main(String[] args) {
                 bufSize = Integer.MAX_VALUE;
             }
 
-            ThreadedParser[] myThreads = new ThreadedParser[(int) threadNum];
             ThreadCalcs[] threadOut = new ThreadCalcs[(int) threadNum];
+            CountDownLatch tpLatch = new CountDownLatch((int) threadNum);
             int threadID = 0;
 
             long h = 0;
@@ -206,27 +222,25 @@ public static void main(String[] args) {
                     }
                 }
 
-                myThreads[threadID] = new ThreadedParser(mbuf, mbs, threadOut, threadID);
-                myThreads[threadID].start();
+                ThreadedParser tpThr = new ThreadedParser(mbuf, mbs, threadOut, threadID, tpLatch);
+                tpThr.start();
 
                 h += mbs;
                 threadID++;
             }
 
-            for (int i = 0; i < threadID; i++) {
-                try {
-                    myThreads[i].join();
-                }
-                catch (InterruptedException ex) {
-                    System.exit(1);
-                }
+            try {
+                tpLatch.await();
+            }
+            catch (InterruptedException ex) {
+                System.exit(1);
             }
 
             // use treemap to sort and uniquify
-            Map<String, Integer> staMap = new TreeMap<>();
+            Map<String, Boolean> staMap = new TreeMap<>();
             for (int i = 0; i < threadID; i++) {
                 for (int j = 0; j < threadOut[i].numStations; j++) {
-                    staMap.put(threadOut[i].staArr[j], 0);
+                    staMap.put(threadOut[i].staArr[j], false);
                 }
             }
 

From 6e3893c6a60ba8c514601e41f61b1d8240e2b8b5 Mon Sep 17 00:00:00 2001
From: Vemana <vemana.github@gmail.com>
Date: Sat, 20 Jan 2024 02:17:55 +0530
Subject: [PATCH 067/268] Reduce variance by (1) Using common chunks at the end
 (2) Busy looping (#486)

on automatic closing of ByteBuffers. Previously, a straggler could hold
up closing the ByteBuffers.

Also
- Improve Tracing code
- Parametrize additional options to aid in tuning

Our previous PR was surprising; parallelizing the munmap() calls did not
yield anywhere near the performance gain I expected. The local machine had
a 10% gain while the testing machine only showed a 2% gain. I am still not
clear why that happened; the two best theories I have are:
1) Variance due to stragglers (which this change addresses)
2) munmap() is either too fast or too slow relative to the other
   instructions compared to our local machine. I don't know which. We'll
   have to use adaptive tuning, but that's in a different change.
---
 .../onebrc/CalculateAverage_vemana.java       | 257 ++++++++++++------
 1 file changed, 169 insertions(+), 88 deletions(-)

diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_vemana.java b/src/main/java/dev/morling/onebrc/CalculateAverage_vemana.java
index 8f690e349..3e64ac905 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_vemana.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_vemana.java
@@ -171,7 +171,7 @@ public static void main(String[] args) throws Exception {
         int chunkSizeBits = 20;
 
         // For the last commonChunkFraction fraction of total work, use smaller chunk sizes
-        double commonChunkFraction = 0;
+        double commonChunkFraction = 0.03;
 
         // Use commonChunkSizeBits for the small-chunk size
         int commonChunkSizeBits = 18;
@@ -181,11 +181,17 @@ public static void main(String[] args) throws Exception {
 
         int minReservedBytesAtFileTail = 9;
 
+        int nThreads = -1;
+
         String inputFile = "measurements.txt";
 
+        double munmapFraction = 0.03;
+
+        boolean fakeAdvance = false;
+
         for (String arg : args) {
-            String key = arg.substring(0, arg.indexOf('='));
-            String value = arg.substring(key.length() + 1);
+            String key = arg.substring(0, arg.indexOf('=')).trim();
+            String value = arg.substring(key.length() + 1).trim();
             switch (key) {
                 case "chunkSizeBits":
                     chunkSizeBits = Integer.parseInt(value);
@@ -202,6 +208,15 @@ public static void main(String[] args) throws Exception {
                 case "inputfile":
                     inputFile = value;
                     break;
+                case "munmapFraction":
+                    munmapFraction = Double.parseDouble(value);
+                    break;
+                case "fakeAdvance":
+                    fakeAdvance = Boolean.parseBoolean(value);
+                    break;
+                case "nThreads":
+                    nThreads = Integer.parseInt(value);
+                    break;
                 default:
                     throw new IllegalArgumentException("Unknown argument: " + arg);
             }
@@ -218,14 +233,17 @@ public static void main(String[] args) throws Exception {
         System.out.println(
                 new Runner(
                         Path.of(inputFile),
+                        nThreads,
                         chunkSizeBits,
                         commonChunkFraction,
                         commonChunkSizeBits,
                         hashtableSizeBits,
-                        minReservedBytesAtFileTail)
+                        minReservedBytesAtFileTail,
+                        munmapFraction,
+                        fakeAdvance)
                                 .getSummaryStatistics());
 
-        Tracing.recordEvent("After printing result");
+        Tracing.recordEvent("Final result printed");
     }
 
   public record AggregateResult(Map<String, Stat> tempStats) {
@@ -286,8 +304,8 @@ public ByteRange(RandomAccessFile raf, long maxEndPos) {
             bufferEnd = bufferStart = -1;
         }
 
-        public void close(int shardIdx) {
-            Tracing.recordWorkStart("cleaner", shardIdx);
+        public void close(String closerId, int shardIdx) {
+            Tracing.recordWorkStart(closerId, shardIdx);
             if (byteBuffer != null) {
                 unclosedBuffers.add(byteBuffer);
             }
@@ -297,7 +315,7 @@ public void close(int shardIdx) {
             unclosedBuffers.clear();
             bufferEnd = bufferStart = -1;
             byteBuffer = null;
-            Tracing.recordWorkEnd("cleaner", shardIdx);
+            Tracing.recordWorkEnd(closerId, shardIdx);
         }
 
         public void setRange(long rangeStart, long rangeEnd) {
@@ -383,7 +401,7 @@ private Checks() {
 
     public interface LazyShardQueue {
 
-        void close(int shardIdx);
+        void close(String closerId, int shardIdx);
 
         Optional<ByteRange> fileTailEndWork(int idx);
 
@@ -415,37 +433,48 @@ public static class Runner {
 
         private final double commonChunkFraction;
         private final int commonChunkSizeBits;
+        private final boolean fakeAdvance;
         private final int hashtableSizeBits;
         private final Path inputFile;
         private final int minReservedBytesAtFileTail;
+        private final double munmapFraction;
+        private final int nThreads;
         private final int shardSizeBits;
 
         public Runner(
                       Path inputFile,
+                      int nThreads,
                       int chunkSizeBits,
                       double commonChunkFraction,
                       int commonChunkSizeBits,
                       int hashtableSizeBits,
-                      int minReservedBytesAtFileTail) {
+                      int minReservedBytesAtFileTail,
+                      double munmapFraction,
+                      boolean fakeAdvance) {
             this.inputFile = inputFile;
+            this.nThreads = nThreads;
             this.shardSizeBits = chunkSizeBits;
             this.commonChunkFraction = commonChunkFraction;
             this.commonChunkSizeBits = commonChunkSizeBits;
             this.hashtableSizeBits = hashtableSizeBits;
             this.minReservedBytesAtFileTail = minReservedBytesAtFileTail;
+            this.munmapFraction = munmapFraction;
+            this.fakeAdvance = fakeAdvance;
         }
 
         AggregateResult getSummaryStatistics() throws Exception {
-            int nThreads = Runtime.getRuntime().availableProcessors();
+            int nThreads = this.nThreads < 0 ? Runtime.getRuntime().availableProcessors() : this.nThreads;
+
             LazyShardQueue shardQueue = new SerialLazyShardQueue(
                     1L << shardSizeBits,
                     inputFile,
                     nThreads,
                     commonChunkFraction,
                     commonChunkSizeBits,
-                    minReservedBytesAtFileTail);
+                    minReservedBytesAtFileTail,
+                    munmapFraction,
+                    fakeAdvance);
 
-            List<Future<AggregateResult>> results = new ArrayList<>();
             ExecutorService executorService = Executors.newFixedThreadPool(
                     nThreads,
                     runnable -> {
@@ -454,42 +483,56 @@ AggregateResult getSummaryStatistics() throws Exception {
                         return thread;
                     });
 
+            List<Future<AggregateResult>> results = new ArrayList<>();
             for (int i = 0; i < nThreads; i++) {
                 final int shardIdx = i;
                 final Callable<AggregateResult> callable = () -> {
-                    Tracing.recordWorkStart("shard", shardIdx);
+                    Tracing.recordWorkStart("Shard", shardIdx);
                     AggregateResult result = new ShardProcessor(shardQueue, hashtableSizeBits, shardIdx).processShard();
-                    Tracing.recordWorkEnd("shard", shardIdx);
+                    Tracing.recordWorkEnd("Shard", shardIdx);
                     return result;
                 };
                 results.add(executorService.submit(callable));
             }
             Tracing.recordEvent("Basic push time");
 
-            AggregateResult result = executorService.submit(() -> merge(results)).get();
+            // This particular sequence of Futures is so that both merge and munmap() can work as shards
+            // finish their computation without blocking on the entire set of shards to complete. In
+            // particular, munmap() doesn't need to wait on merge.
+            // First, submit a task to merge the results and then submit a task to cleanup bytebuffers
+            // from completed shards.
+            Future<AggregateResult> resultFutures = executorService.submit(() -> merge(results));
+            // Note that munmap() is serial and not parallel and hence we use just one thread.
+            executorService.submit(() -> closeByteBuffers(results, shardQueue));
 
+            AggregateResult result = resultFutures.get();
             Tracing.recordEvent("Merge results received");
 
-            // Note that munmap() is serial and not parallel
-            executorService.submit(
-                    () -> {
-                        for (int i = 0; i < nThreads; i++) {
-                            shardQueue.close(i);
-                        }
-                    });
-
-            Tracing.recordEvent("Waiting for executor shutdown");
-
+            Tracing.recordEvent("About to shutdown executor and wait");
             executorService.shutdown();
             executorService.awaitTermination(Long.MAX_VALUE, TimeUnit.MILLISECONDS);
-
             Tracing.recordEvent("Executor terminated");
-            Tracing.analyzeWorkThreads("cleaner", nThreads);
-            Tracing.recordEvent("After cleaner finish printed");
 
+            Tracing.analyzeWorkThreads(nThreads);
             return result;
         }
 
+        private void closeByteBuffers(
+                                      List<Future<AggregateResult>> results, LazyShardQueue shardQueue) {
+            int n = results.size();
+            boolean[] isDone = new boolean[n];
+            int remaining = results.size();
+            while (remaining > 0) {
+                for (int i = 0; i < n; i++) {
+                    if (!isDone[i] && results.get(i).isDone()) {
+                        remaining--;
+                        isDone[i] = true;
+                        shardQueue.close("Ending Cleaner", i);
+                    }
+                }
+            }
+        }
+
         private AggregateResult merge(List<Future<AggregateResult>> results)
                 throws ExecutionException, InterruptedException {
             Tracing.recordEvent("Merge start time");
@@ -516,7 +559,6 @@ private AggregateResult merge(List<Future<AggregateResult>> results)
                 }
             }
             Tracing.recordEvent("Merge end time");
-            Tracing.analyzeWorkThreads("shard", results.size());
             return new AggregateResult(output);
         }
     }
@@ -532,6 +574,7 @@ private static long roundToNearestLowerMultipleOf(long divisor, long value) {
         private final long commonChunkSize;
         private final AtomicLong commonPool;
         private final long effectiveFileSize;
+        private final boolean fakeAdvance;
         private final long fileSize;
         private final long[] perThreadData;
         private final RandomAccessFile raf;
@@ -543,8 +586,11 @@ public SerialLazyShardQueue(
                                     int shards,
                                     double commonChunkFraction,
                                     int commonChunkSizeBits,
-                                    int fileTailReservedBytes)
+                                    int fileTailReservedBytes,
+                                    double munmapFraction,
+                                    boolean fakeAdvance)
                 throws IOException {
+            this.fakeAdvance = fakeAdvance;
             Checks.checkArg(commonChunkFraction < 0.9 && commonChunkFraction >= 0);
             Checks.checkArg(fileTailReservedBytes >= 0);
             this.raf = new RandomAccessFile(filePath.toFile(), "r");
@@ -580,8 +626,8 @@ public SerialLazyShardQueue(
                 // its work, where R = relative speed of unmap compared to the computation.
                 // For our problem, R ~ 75 because unmap unmaps 30GB/sec (but, it is serial) while
                 // cores go through data at the rate of 400MB/sec.
-                perThreadData[pos + 3] = (long) (currentChunks * (0.03 * (shards - i)));
-                perThreadData[pos + 4] = 1;
+                perThreadData[pos + 3] = (long) (currentChunks * (munmapFraction * (shards - i)));
+                perThreadData[pos + 4] = 1; // true iff munmap() hasn't been triggered yet
                 currentStart += currentChunks * chunkSize;
                 remainingChunks -= currentChunks;
             }
@@ -596,8 +642,8 @@ public SerialLazyShardQueue(
         }
 
         @Override
-        public void close(int shardIdx) {
-            byteRanges[shardIdx << 4].close(shardIdx);
+        public void close(String closerId, int shardIdx) {
+            byteRanges[shardIdx << 4].close(closerId, shardIdx);
         }
 
         @Override
@@ -616,14 +662,18 @@ public Optional<ByteRange> fileTailEndWork(int idx) {
         public ByteRange take(int shardIdx) {
             // Try for thread local range
             final int pos = shardIdx << 4;
-            long rangeStart = perThreadData[pos];
-            final long chunkEnd = perThreadData[pos + 1];
+            final long rangeStart;
             final long rangeEnd;
 
-            if (rangeStart < chunkEnd) {
+            if (perThreadData[pos + 2] >= 1) {
+                rangeStart = perThreadData[pos];
                 rangeEnd = rangeStart + chunkSize;
-                perThreadData[pos] = rangeEnd;
+                // Don't do this in the if-check; it causes negative values that trigger intermediate
+                // cleanup
                 perThreadData[pos + 2]--;
+                if (!fakeAdvance) {
+                    perThreadData[pos] = rangeEnd;
+                }
             }
             else {
                 rangeStart = commonPool.getAndAdd(commonChunkSize);
@@ -634,8 +684,8 @@ public ByteRange take(int shardIdx) {
                 rangeEnd = rangeStart + commonChunkSize;
             }
 
-            if (perThreadData[pos + 2] <= perThreadData[pos + 3] && perThreadData[pos + 4] > 0) {
-                if (attemptClose(shardIdx)) {
+            if (perThreadData[pos + 2] < perThreadData[pos + 3] && perThreadData[pos + 4] > 0) {
+                if (attemptIntermediateClose(shardIdx)) {
                     perThreadData[pos + 4]--;
                 }
             }
@@ -645,9 +695,9 @@ public ByteRange take(int shardIdx) {
             return chunk;
         }
 
-        private boolean attemptClose(int shardIdx) {
+        private boolean attemptIntermediateClose(int shardIdx) {
             if (seqLock.acquire()) {
-                byteRanges[shardIdx << 4].close(shardIdx);
+                close("Intermediate Cleaner", shardIdx);
                 seqLock.release();
                 return true;
             }
@@ -964,12 +1014,22 @@ public String toString() {
 
     static class Tracing {
 
-        private static final long[] cleanerTimes = new long[1 << 6 << 1];
-        private static final long[] threadTimes = new long[1 << 6 << 1];
+        private static final Map<String, ThreadTimingsArray> knownWorkThreadEvents;
         private static long startTime;
 
-        static void analyzeWorkThreads(String id, int nThreads) {
-            printTimingsAnalysis(id + " Stats", nThreads, timingsArray(id));
+        static {
+            // Maintain the ordering to be chronological in execution
+            // Map.of(..) screws up ordering
+            knownWorkThreadEvents = new LinkedHashMap<>();
+            for (String id : List.of("Shard", "Intermediate Cleaner", "Ending Cleaner")) {
+                knownWorkThreadEvents.put(id, new ThreadTimingsArray(id, 1 << 6 << 1));
+            }
+        }
+
+        static void analyzeWorkThreads(int nThreads) {
+            for (ThreadTimingsArray array : knownWorkThreadEvents.values()) {
+                errPrint(array.analyze(nThreads));
+            }
         }
 
         static void recordAppStart() {
@@ -981,11 +1041,11 @@ static void recordEvent(String event) {
         }
 
         static void recordWorkEnd(String id, int threadId) {
-            timingsArray(id)[2 * threadId + 1] = System.nanoTime();
+            knownWorkThreadEvents.get(id).recordEnd(threadId);
         }
 
         static void recordWorkStart(String id, int threadId) {
-            timingsArray(id)[2 * threadId] = System.nanoTime();
+            knownWorkThreadEvents.get(id).recordStart(threadId);
         }
 
         /////////////////////////////////////////////////////////////////////////////////////////////////
@@ -998,57 +1058,78 @@ private static void printEvent(String message, long nanoTime) {
       errPrint(STR."\{message} = \{(nanoTime - startTime) / 1_000_000}ms");
     }
 
-    private static void printTimingsAnalysis(String header, int nThreads, long[] timestamps) {
-      long minDuration = Long.MAX_VALUE, maxDuration = Long.MIN_VALUE;
-      long minBegin = Long.MAX_VALUE, maxCompletion = Long.MIN_VALUE;
-      long maxBegin = Long.MIN_VALUE, minCompletion = Long.MAX_VALUE;
+        public static class ThreadTimingsArray {
 
-      long[] durationsMs = new long[nThreads];
-      long[] completionsMs = new long[nThreads];
-      long[] beginMs = new long[nThreads];
-      for (int i = 0; i < nThreads; i++) {
-        long durationNs = timestamps[2 * i + 1] - timestamps[2 * i];
-        durationsMs[i] = durationNs / 1_000_000;
-        completionsMs[i] = (timestamps[2 * i + 1] - startTime) / 1_000_000;
-        beginMs[i] = (timestamps[2 * i] - startTime) / 1_000_000;
+            private static String toString(long[] array) {
+                return Arrays.stream(array)
+                        .map(x -> x < 0 ? -1 : x)
+                        .mapToObj(x -> String.format("%6d", x))
+                        .collect(Collectors.joining(", ", "[ ", " ]"));
+            }
 
-        minDuration = Math.min(minDuration, durationNs);
-        maxDuration = Math.max(maxDuration, durationNs);
+            private final String id;
+            private final long[] timestamps;
+            private boolean hasData = false;
 
-        minBegin = Math.min(minBegin, timestamps[2 * i]);
-        maxBegin = Math.max(maxBegin, timestamps[2 * i]);
+            public ThreadTimingsArray(String id, int maxSize) {
+                this.timestamps = new long[maxSize];
+                this.id = id;
+            }
 
-        maxCompletion = Math.max(maxCompletion, timestamps[2 * i + 1]);
-        minCompletion = Math.min(minCompletion, timestamps[2 * i + 1]);
-      }
-      errPrint(
-          STR."""
+      public String analyze(int nThreads) {
+        if (!hasData) {
+          return "%s has no thread timings data".formatted(id);
+        }
+        Checks.checkArg(nThreads <= timestamps.length);
+        long minDuration = Long.MAX_VALUE, maxDuration = Long.MIN_VALUE;
+        long minBegin = Long.MAX_VALUE, maxCompletion = Long.MIN_VALUE;
+        long maxBegin = Long.MIN_VALUE, minCompletion = Long.MAX_VALUE;
+
+        long[] durationsMs = new long[nThreads];
+        long[] completionsMs = new long[nThreads];
+        long[] beginMs = new long[nThreads];
+        for (int i = 0; i < nThreads; i++) {
+          long durationNs = timestamps[2 * i + 1] - timestamps[2 * i];
+          durationsMs[i] = durationNs / 1_000_000;
+          completionsMs[i] = (timestamps[2 * i + 1] - startTime) / 1_000_000;
+          beginMs[i] = (timestamps[2 * i] - startTime) / 1_000_000;
+
+          minDuration = Math.min(minDuration, durationNs);
+          maxDuration = Math.max(maxDuration, durationNs);
+
+          minBegin = Math.min(minBegin, timestamps[2 * i] - startTime);
+          maxBegin = Math.max(maxBegin, timestamps[2 * i] - startTime);
+
+          maxCompletion = Math.max(maxCompletion, timestamps[2 * i + 1] - startTime);
+          minCompletion = Math.min(minCompletion, timestamps[2 * i + 1] - startTime);
+        }
+        return STR."""
         -------------------------------------------------------------------------------------------
-                                       \{header}
+                                       \{id} Stats
         -------------------------------------------------------------------------------------------
         Max duration                              = \{maxDuration / 1_000_000} ms
         Min duration                              = \{minDuration / 1_000_000} ms
-        Timespan[max(end)-min(start)]             = \{(maxCompletion - minBegin) / 1_000_000} ms
+        Timespan[max(end)-min(start)]             = \{(maxCompletion - minBegin) / 1_000_000} ms [\{maxCompletion / 1_000_000} - \{minBegin / 1_000_000} ]
         Completion Timespan[max(end)-min(end)]    = \{(maxCompletion - minCompletion) / 1_000_000} ms
         Begin Timespan[max(begin)-min(begin)]     = \{(maxBegin - minBegin) / 1_000_000} ms
-        Durations                                 = \{toString(durationsMs)} in ms
-        Begin Timestamps                          = \{toString(beginMs)} in ms
-        Completion Timestamps                     = \{toString(completionsMs)} in ms
-        """);
-    }
+        Average Duration                          = \{Arrays.stream(durationsMs)
+                                                            .average()
+                                                            .getAsDouble()} ms
+        Durations                                 = \{toString(durationsMs)} ms
+        Begin Timestamps                          = \{toString(beginMs)} ms
+        Completion Timestamps                     = \{toString(completionsMs)} ms
+        """;
+      }
 
-        private static long[] timingsArray(String id) {
-            return switch (id) {
-                case "cleaner" -> cleanerTimes;
-                case "shard" -> threadTimes;
-                default -> throw new RuntimeException("");
-            };
-        }
+            public void recordEnd(int idx) {
+                timestamps[2 * idx + 1] = System.nanoTime();
+                hasData = true;
+            }
 
-        private static String toString(long[] array) {
-            return Arrays.stream(array)
-                    .mapToObj(x -> String.format("%6d", x))
-                    .collect(Collectors.joining(", ", "[ ", " ]"));
+            public void recordStart(int idx) {
+                timestamps[2 * idx] = System.nanoTime();
+                hasData = true;
+            }
         }
     }
 }

From 586def36200772d11523c2a697701f9371896a44 Mon Sep 17 00:00:00 2001
From: Artsiom Korzun <72259616+artsiomkorzun@users.noreply.github.com>
Date: Fri, 19 Jan 2024 21:52:55 +0100
Subject: [PATCH 068/268] plain old io (#492)

plain old io
---
 calculate_average_artsiomkorzun.sh            |   4 +-
 .../CalculateAverage_artsiomkorzun.java       | 110 +++++++++---------
 2 files changed, 58 insertions(+), 56 deletions(-)

diff --git a/calculate_average_artsiomkorzun.sh b/calculate_average_artsiomkorzun.sh
index d9c18284e..977b6e320 100755
--- a/calculate_average_artsiomkorzun.sh
+++ b/calculate_average_artsiomkorzun.sh
@@ -17,9 +17,9 @@
 
 if [ -f target/CalculateAverage_artsiomkorzun_image ]; then
     echo "Picking up existing native image 'target/CalculateAverage_artsiomkorzun_image', delete the file to select JVM mode." 1>&2
-    target/CalculateAverage_artsiomkorzun_image
+    target/CalculateAverage_artsiomkorzun_image -XX:MaxDirectMemorySize=4294967296
 else
-    JAVA_OPTS="--enable-preview -Xmx128m -XX:+UseSerialGC -XX:-TieredCompilation"
+    JAVA_OPTS="--enable-preview -Xmx128m -XX:+UseSerialGC -XX:-TieredCompilation -XX:MaxDirectMemorySize=4294967296"
     echo "Chosing to run the app in JVM mode as no native image was found, use prepare_artsiomkorzun.sh to generate." 1>&2
     java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_artsiomkorzun
 fi
\ No newline at end of file
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java b/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java
index c3c39ab3f..13731546f 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java
@@ -17,12 +17,12 @@
 
 import sun.misc.Unsafe;
 
-import java.lang.foreign.Arena;
-import java.lang.foreign.MemorySegment;
 import java.lang.reflect.Field;
+import java.nio.Buffer;
+import java.nio.ByteBuffer;
 import java.nio.channels.FileChannel;
+import java.nio.file.Files;
 import java.nio.file.Path;
-import java.nio.file.StandardOpenOption;
 import java.util.Map;
 import java.util.TreeMap;
 import java.util.concurrent.atomic.AtomicInteger;
@@ -31,19 +31,21 @@
 public class CalculateAverage_artsiomkorzun {
 
     private static final Path FILE = Path.of("./measurements.txt");
-    private static final long SEGMENT_SIZE = 32 * 1024 * 1024;
-    private static final long SEGMENT_OVERLAP = 1024;
+    private static final int SEGMENT_SIZE = 4 * 1024 * 1024;
+    private static final int SEGMENT_OVERLAP = 128;
     private static final long COMMA_PATTERN = 0x3B3B3B3B3B3B3B3BL;
     private static final long DOT_BITS = 0x10101000;
     private static final long MAGIC_MULTIPLIER = (100 * 0x1000000 + 10 * 0x10000 + 1);
 
     private static final Unsafe UNSAFE;
+    private static final long ADDRESS_OFFSET;
 
     static {
         try {
             Field unsafe = Unsafe.class.getDeclaredField("theUnsafe");
             unsafe.setAccessible(true);
             UNSAFE = (Unsafe) unsafe.get(Unsafe.class);
+            ADDRESS_OFFSET = UNSAFE.objectFieldOffset(Buffer.class.getDeclaredField("address"));
         }
         catch (Throwable e) {
             throw new RuntimeException(e);
@@ -62,9 +64,7 @@ public static void main(String[] args) throws Exception {
     }
 
     private static void execute() throws Exception {
-        MemorySegment fileMemory = map(FILE);
-        long fileAddress = fileMemory.address();
-        long fileSize = fileMemory.byteSize();
+        long fileSize = Files.size(FILE);
         int segmentCount = (int) ((fileSize + SEGMENT_SIZE - 1) / SEGMENT_SIZE);
 
         AtomicInteger counter = new AtomicInteger();
@@ -74,7 +74,7 @@ private static void execute() throws Exception {
         Aggregator[] aggregators = new Aggregator[parallelism];
 
         for (int i = 0; i < aggregators.length; i++) {
-            aggregators[i] = new Aggregator(counter, result, fileAddress, fileSize, segmentCount);
+            aggregators[i] = new Aggregator(counter, result, segmentCount);
             aggregators[i].start();
         }
 
@@ -86,14 +86,16 @@ private static void execute() throws Exception {
         System.out.println(text(aggregates));
     }
 
-    private static MemorySegment map(Path file) {
-        try (FileChannel channel = FileChannel.open(file, StandardOpenOption.READ)) {
-            long size = channel.size();
-            return channel.map(FileChannel.MapMode.READ_ONLY, 0, size, Arena.global());
-        }
-        catch (Throwable e) {
-            throw new RuntimeException(e);
-        }
+    private static long address(ByteBuffer buffer) {
+        return UNSAFE.getLong(buffer, ADDRESS_OFFSET);
+    }
+
+    private static ByteBuffer allocate(int size) {
+        ByteBuffer buffer = ByteBuffer.allocateDirect(size + 4096);
+        long address = address(buffer);
+        long aligned = (address + 4095) & (~4095);
+        int padding = (int) (aligned - address);
+        return buffer.position(padding).limit(padding + size).slice();
     }
 
     private static long word(long address) {
@@ -139,13 +141,8 @@ private static class Aggregates {
         private static final int ENTRIES = 64 * 1024;
         private static final int SIZE = 128 * ENTRIES;
 
-        private final long pointer;
-
-        public Aggregates() {
-            long address = UNSAFE.allocateMemory(SIZE + 8096);
-            pointer = (address + 4095) & (~4095);
-            UNSAFE.setMemory(pointer, SIZE, (byte) 0);
-        }
+        private final ByteBuffer buffer = allocate(SIZE);
+        private final long pointer = address(buffer);
 
         public long find(long word, int hash) {
             long address = pointer + offset(hash);
@@ -206,14 +203,8 @@ public void merge(Aggregates rights) {
 
                 for (int offset = offset(hash);; offset = next(offset)) {
                     long address = pointer + offset;
-                    int len = UNSAFE.getInt(address);
-
-                    if (len == 0) {
-                        UNSAFE.copyMemory(rightAddress, address, 24 + length);
-                        break;
-                    }
 
-                    if (len == length && equal(address + 24, rightAddress + 24, length)) {
+                    if (equal(address + 24, rightAddress + 24, length)) {
                         long sum = UNSAFE.getLong(address + 8) + UNSAFE.getLong(rightAddress + 8);
                         int cnt = UNSAFE.getInt(address + 16) + UNSAFE.getInt(rightAddress + 16);
                         short min = (short) Math.min(UNSAFE.getShort(address + 20), UNSAFE.getShort(rightAddress + 20));
@@ -225,6 +216,13 @@ public void merge(Aggregates rights) {
                         UNSAFE.putShort(address + 22, max);
                         break;
                     }
+
+                    int len = UNSAFE.getInt(address);
+
+                    if (len == 0) {
+                        UNSAFE.copyMemory(rightAddress, address, length + 24);
+                        break;
+                    }
                 }
             }
         }
@@ -237,8 +235,8 @@ public Map<String, Aggregate> aggregate() {
                 int length = UNSAFE.getInt(address);
 
                 if (length != 0) {
-                    byte[] array = new byte[length];
-                    UNSAFE.copyMemory(null, address + 24, array, Unsafe.ARRAY_BYTE_BASE_OFFSET, length);
+                    byte[] array = new byte[length - 1];
+                    UNSAFE.copyMemory(null, address + 24, array, Unsafe.ARRAY_BYTE_BASE_OFFSET, array.length);
                     String key = new String(array);
 
                     long sum = UNSAFE.getLong(address + 8);
@@ -271,7 +269,7 @@ private static int next(int prev) {
         }
 
         private static boolean equal(long leftAddress, long leftWord, long rightAddress, int length) {
-            while (length >= 8) {
+            while (length > 8) {
                 long left = UNSAFE.getLong(leftAddress);
                 long right = UNSAFE.getLong(rightAddress);
 
@@ -309,35 +307,39 @@ private static class Aggregator extends Thread {
 
         private final AtomicInteger counter;
         private final AtomicReference<Aggregates> result;
-        private final long fileAddress;
-        private final long fileSize;
-        private final int segmentCount;
+        private final int segments;
 
-        public Aggregator(AtomicInteger counter, AtomicReference<Aggregates> result,
-                          long fileAddress, long fileSize, int segmentCount) {
+        public Aggregator(AtomicInteger counter, AtomicReference<Aggregates> result, int segments) {
             super("aggregator");
             this.counter = counter;
             this.result = result;
-            this.fileAddress = fileAddress;
-            this.fileSize = fileSize;
-            this.segmentCount = segmentCount;
+            this.segments = segments;
         }
 
         @Override
         public void run() {
             Aggregates aggregates = new Aggregates();
+            ByteBuffer buffer = allocate(SEGMENT_SIZE + SEGMENT_OVERLAP);
 
-            for (int segment; (segment = counter.getAndIncrement()) < segmentCount;) {
-                long position = SEGMENT_SIZE * segment;
-                long size = Math.min(SEGMENT_SIZE + SEGMENT_OVERLAP, fileSize - position);
-                long address = fileAddress + position;
-                long limit = address + Math.min(SEGMENT_SIZE, size - 1);
+            try (FileChannel channel = FileChannel.open(FILE)) {
+                for (int segment; (segment = counter.getAndIncrement()) < segments;) {
+                    buffer.clear();
 
-                if (segment > 0) {
-                    address = next(address);
-                }
+                    long position = (long) SEGMENT_SIZE * segment;
+                    int size = channel.read(buffer, position);
+
+                    long address = address(buffer);
+                    long limit = address + Math.min(SEGMENT_SIZE, size - 1);
+
+                    if (segment > 0) {
+                        address = next(address);
+                    }
 
-                aggregate(aggregates, address, limit);
+                    aggregate(aggregates, address, limit);
+                }
+            }
+            catch (Throwable e) {
+                throw new RuntimeException(e);
             }
 
             while (!result.compareAndSet(null, aggregates)) {
@@ -406,7 +408,7 @@ private static void aggregate(Aggregates aggregates, long position, long limit)
                     ptr = aggregates.put(position, word, length, hash);
                 }
 
-                position = update(ptr, position + length + 1);
+                position = update(ptr, position + length);
             }
         }
 
@@ -431,12 +433,12 @@ private static long separator(long word) {
         }
 
         private static long mask(long word, long separator) {
-            long mask = ((separator - 1) ^ separator) >>> 8;
+            long mask = separator ^ (separator - 1);
             return word & mask;
         }
 
         private static int length(long separator) {
-            return Long.numberOfTrailingZeros(separator) >>> 3;
+            return (Long.numberOfTrailingZeros(separator) >>> 3) + 1;
         }
 
         private static long next(long position) {

From e67920f4af13215710cbc43098a051ef517c3fb0 Mon Sep 17 00:00:00 2001
From: Van Phu DO <abeobk@gmail.com>
Date: Sat, 20 Jan 2024 06:03:51 +0900
Subject: [PATCH 069/268] low collision + fast mixer, more optimization, fewer
 ifs because branches are slow (#474)

---
 .../onebrc/CalculateAverage_abeobk.java       | 102 ++++++++----------
 1 file changed, 44 insertions(+), 58 deletions(-)

diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java b/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java
index ec6c9e5ba..cdc2c1e38 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java
@@ -60,14 +60,15 @@ private static Unsafe initUnsafe() {
 
     static class Node {
         long addr;
+        long word0;
         long tail;
+        int keylen;
         int min, max;
         int count;
         long sum;
 
         String key() {
             byte[] sbuf = new byte[MAX_STR_LEN];
-            int keylen = (int) (tail >>> 56);
             UNSAFE.copyMemory(null, addr, sbuf, Unsafe.ARRAY_BYTE_BASE_OFFSET, keylen);
             return new String(sbuf, 0, keylen, StandardCharsets.UTF_8);
         }
@@ -76,18 +77,29 @@ public String toString() {
             return String.format("%.1f/%.1f/%.1f", min * 0.1, sum * 0.1 / count, max * 0.1);
         }
 
-        Node(long a, long t, int val) {
+        Node(long a, long t, int val, int kl) {
             addr = a;
             tail = t;
+            keylen = kl;
             sum = min = max = val;
             count = 1;
         }
 
+        Node(long a, long t, int val, int kl, long w0) {
+            this(a, t, val, kl);
+            word0 = w0;
+        }
+
         void add(int val) {
-            min = Math.min(min, val);
-            max = Math.max(max, val);
             sum += val;
             count++;
+            if (val >= max) {
+                max = val;
+                return;
+            }
+            if (val < min) {
+                min = val;
+            }
         }
 
         void merge(Node other) {
@@ -102,7 +114,7 @@ boolean contentEquals(long other_addr, long other_tail) {
                 return false;
             // this is faster than comparision if key is short
             long xsum = 0;
-            int n = ((int) (tail >>> 56)) & 0xF8;
+            int n = keylen & 0xF8;
             for (int i = 0; i < n; i += 8) {
                 xsum |= (UNSAFE.getLong(addr + i) ^ UNSAFE.getLong(other_addr + i));
             }
@@ -130,21 +142,13 @@ static final long getSemiPosCode(final long word) {
         return (xor_semi - 0x0101010101010101L) & (~xor_semi & 0x8080808080808080L);
     }
 
-    // very low collision mixer
-    // idea from https://github.com/Cyan4973/xxHash/tree/dev
-    // zero collision on test data
+    // speed/collision balance
     static final int xxh32(long hash) {
         final int p1 = 0x85EBCA77; // prime
-        final int p2 = 0x165667B1; // prime
         int low = (int) hash;
-        int high = (int) (hash >>> 31);
-        int h = low + high;
-        h ^= h >> 15;
-        h *= p1;
-        h ^= h >> 13;
-        h *= p2;
-        h ^= h >> 11;
-        return h;
+        int high = (int) (hash >>> 33);
+        int h = (low * p1) ^ high;
+        return h ^ (h >>> 17);
     }
 
     // great idea from merykitty (Quan Anh Mai)
@@ -172,26 +176,23 @@ static final Node[] parse(int thread_id, long start, long end, int[] cls) {
             int val = 0;
             int bucket = 0;
 
-            long word = UNSAFE.getLong(addr);
-            long semipos_code = getSemiPosCode(word);
+            long word0 = UNSAFE.getLong(addr);
+            long semipos_code = getSemiPosCode(word0);
 
             // about 50% chance key < 8 chars
             if (semipos_code != 0) {
                 int semi_pos = Long.numberOfTrailingZeros(semipos_code) >>> 3;
                 addr += semi_pos;
-                tail = (word & HASH_MASKS[semi_pos]);
+                tail = (word0 & HASH_MASKS[semi_pos]);
                 bucket = xxh32(tail) & BUCKET_MASK;
-                long keylen = (addr - row_addr);
-                tail |= (keylen << 56);
                 long num_word = UNSAFE.getLong(++addr);
                 int dot_pos = Long.numberOfTrailingZeros(~num_word & 0x10101000);
                 val = parseNum(num_word, dot_pos);
                 addr += (dot_pos >>> 3) + 3;
-
                 while (true) {
                     var node = map[bucket];
                     if (node == null) {
-                        map[bucket] = new Node(row_addr, tail, val);
+                        map[bucket] = new Node(row_addr, tail, val, semi_pos);
                         break;
                     }
                     if (node.tail == tail) {
@@ -205,26 +206,30 @@ static final Node[] parse(int thread_id, long start, long end, int[] cls) {
                 continue;
             }
 
-            hash ^= word;
+            hash ^= word0;
             addr += 8;
-            word = UNSAFE.getLong(addr);
+            long word = UNSAFE.getLong(addr);
             semipos_code = getSemiPosCode(word);
-            // frist byte semicolon ~13%
-            if (semipos_code == 0x80) {
+            // 43% chance
+            if (semipos_code != 0) {
+                int semi_pos = Long.numberOfTrailingZeros(semipos_code) >>> 3;
+                addr += semi_pos;
+                tail = (word & HASH_MASKS[semi_pos]);
+                hash ^= tail;
                 bucket = xxh32(hash) & BUCKET_MASK;
-                tail = 8L << 56;
-                long num_word = word >>> 8;
+                int keylen = (int) (addr - row_addr);
+                long num_word = UNSAFE.getLong(++addr);
                 int dot_pos = Long.numberOfTrailingZeros(~num_word & 0x10101000);
                 val = parseNum(num_word, dot_pos);
-                addr += (dot_pos >>> 3) + 4;
+                addr += (dot_pos >>> 3) + 3;
 
                 while (true) {
                     var node = map[bucket];
                     if (node == null) {
-                        map[bucket] = new Node(row_addr, tail, val);
+                        map[bucket] = new Node(row_addr, tail, val, keylen, word0);
                         break;
                     }
-                    if (UNSAFE.getLong(node.addr) == UNSAFE.getLong(row_addr)) {
+                    if (node.word0 == word0 && node.tail == tail) {
                         node.add(val);
                         break;
                     }
@@ -247,38 +252,18 @@ static final Node[] parse(int thread_id, long start, long end, int[] cls) {
             tail = (word & HASH_MASKS[semi_pos]);
             hash ^= tail;
             bucket = xxh32(hash) & BUCKET_MASK;
-            long keylen = (addr - row_addr);
-            tail |= (keylen << 56);
+            int keylen = (int) (addr - row_addr);
+
+            long num_word = UNSAFE.getLong(++addr);
 
-            ++addr;
-            long num_word = UNSAFE.getLong(addr);
             int dot_pos = Long.numberOfTrailingZeros(~num_word & 0x10101000);
             val = parseNum(num_word, dot_pos);
             addr += (dot_pos >>> 3) + 3;
 
-            if (keylen < 16) {
-                while (true) {
-                    var node = map[bucket];
-                    if (node == null) {
-                        map[bucket] = new Node(row_addr, tail, val);
-                        break;
-                    }
-                    if (node.tail == tail && (UNSAFE.getLong(node.addr) == UNSAFE.getLong(row_addr))) {
-                        node.add(val);
-                        break;
-                    }
-                    bucket++;
-                    if (SHOW_ANALYSIS)
-                        cls[thread_id]++;
-                }
-                continue;
-            }
-
-            // longer key
             while (true) {
                 var node = map[bucket];
                 if (node == null) {
-                    map[bucket] = new Node(row_addr, tail, val);
+                    map[bucket] = new Node(row_addr, tail, val, keylen);
                     break;
                 }
                 if (node.contentEquals(row_addr, tail)) {
@@ -335,7 +320,7 @@ public static void main(String[] args) throws InterruptedException, IOException
                     if (node == null)
                         continue;
                     if (SHOW_ANALYSIS) {
-                        int kl = (int) (node.tail >>> 56) & (lenhist.length - 1);
+                        int kl = node.keylen & (lenhist.length - 1);
                         lenhist[kl] += node.count;
                     }
                     var stat = ms.putIfAbsent(node.key(), node);
@@ -353,4 +338,5 @@ public static void main(String[] args) throws InterruptedException, IOException
                 System.out.println(ms);
         }
     }
+
 }
\ No newline at end of file

From f409fe0815c18e0d79bc161b1f8d3baeb2ad5771 Mon Sep 17 00:00:00 2001
From: Juan Parera <1420988+jparera@users.noreply.github.com>
Date: Fri, 19 Jan 2024 22:06:48 +0100
Subject: [PATCH 070/268] Change data storage to improve memory locality (#496)

---
 .../onebrc/CalculateAverage_jparera.java      | 231 ++++++++++--------
 1 file changed, 131 insertions(+), 100 deletions(-)

diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_jparera.java b/src/main/java/dev/morling/onebrc/CalculateAverage_jparera.java
index 13252550a..194dbccec 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_jparera.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_jparera.java
@@ -1,3 +1,5 @@
+//COMPILE_OPTIONS -source 21 --enable-preview --add-modules jdk.incubator.vector
+//RUNTIME_OPTIONS --enable-preview --add-modules jdk.incubator.vector
 /*
  *  Copyright 2023 The original authors
  *
@@ -19,6 +21,8 @@
 import java.lang.foreign.Arena;
 import java.lang.foreign.MemorySegment;
 import java.lang.foreign.ValueLayout;
+import java.lang.invoke.MethodHandles;
+import java.lang.invoke.VarHandle;
 import java.nio.ByteOrder;
 import java.nio.channels.FileChannel;
 import java.nio.channels.FileChannel.MapMode;
@@ -26,7 +30,6 @@
 import java.nio.file.Path;
 import java.nio.file.StandardOpenOption;
 import java.util.ArrayList;
-import java.util.Collection;
 import java.util.List;
 import java.util.TreeMap;
 import java.util.function.Function;
@@ -34,25 +37,41 @@
 
 import jdk.incubator.vector.ByteVector;
 import jdk.incubator.vector.VectorSpecies;
+import jdk.incubator.vector.VectorOperators;
 
 public class CalculateAverage_jparera {
     private static final String FILE = "./measurements.txt";
 
-    private static final VectorSpecies<Byte> BYTE_SPECIES = ByteVector.SPECIES_PREFERRED;
+    private static final VarHandle BYTE_HANDLE = MethodHandles
+            .memorySegmentViewVarHandle(ValueLayout.JAVA_BYTE);
+
+    private static final VarHandle INT_HANDLE = MethodHandles
+            .memorySegmentViewVarHandle(ValueLayout.JAVA_INT_UNALIGNED);
 
-    private static final int BYTE_SPECIES_SIZE = BYTE_SPECIES.vectorByteSize();
+    private static final VarHandle LONG_LE_HANDLE = MethodHandles
+            .memorySegmentViewVarHandle(ValueLayout.JAVA_LONG_UNALIGNED.withOrder(ByteOrder.LITTLE_ENDIAN));
+
+    private static final VectorSpecies<Byte> BYTE_SPECIES = ByteVector.SPECIES_PREFERRED;
 
     private static final int BYTE_SPECIES_LANES = BYTE_SPECIES.length();
 
-    private static final ValueLayout.OfLong LONG_U_LE = ValueLayout.JAVA_LONG_UNALIGNED
-            .withOrder(ByteOrder.LITTLE_ENDIAN);
+    private static final ByteOrder NATIVE_ORDER = ByteOrder.nativeOrder();
+
+    private static final byte LF = '\n';
 
-    public static void main(String[] args) throws IOException {
+    private static final byte SEPARATOR = ';';
+
+    private static final byte DECIMAL_SEPARATOR = '.';
+
+    private static final byte NEG = '-';
+
+    public static void main(String[] args) throws IOException, InterruptedException {
         try (var fc = FileChannel.open(Path.of(FILE), StandardOpenOption.READ)) {
             try (var arena = Arena.ofShared()) {
                 var fs = fc.map(MapMode.READ_ONLY, 0, fc.size(), arena);
-                var map = chunks(fs)
-                        .parallelStream()
+                var cpus = Runtime.getRuntime().availableProcessors();
+                var output = chunks(fs, cpus).stream()
+                        .parallel()
                         .map(Chunk::parse)
                         .flatMap(List::stream)
                         .collect(Collectors.toMap(
@@ -60,20 +79,19 @@ public static void main(String[] args) throws IOException {
                                 Function.identity(),
                                 Entry::merge,
                                 TreeMap::new));
-                System.out.println(map);
+                System.out.println(output);
             }
         }
     }
 
-    private static Collection<Chunk> chunks(MemorySegment ms) {
-        var cpus = Runtime.getRuntime().availableProcessors();
-        long expectedChunkSize = Math.ceilDiv(ms.byteSize(), cpus);
-        var chunks = new ArrayList<Chunk>();
+    private static List<Chunk> chunks(MemorySegment ms, int splits) {
         long fileSize = ms.byteSize();
+        long expectedChunkSize = Math.ceilDiv(fileSize, splits);
+        var chunks = new ArrayList<Chunk>();
         long offset = 0;
         while (offset < fileSize) {
             var end = Math.min(offset + expectedChunkSize, fileSize);
-            while (end < fileSize && ms.get(ValueLayout.JAVA_BYTE, end++) != '\n') {
+            while (end < fileSize && (byte) BYTE_HANDLE.get(ms, end++) != LF) {
             }
             long len = end - offset;
             chunks.add(new Chunk(ms.asSlice(offset, len)));
@@ -83,25 +101,27 @@ private static Collection<Chunk> chunks(MemorySegment ms) {
     }
 
     private static final class Chunk {
-        private static final byte SEPARATOR = ';';
+        private static final int KEY_LOG2_BYTES = 7;
 
-        private static final byte DECIMAL_SEPARATOR = '.';
+        private static final int KEY_BYTES = 1 << KEY_LOG2_BYTES;
 
-        private static final byte LF = '\n';
+        private static final int ENTRIES_LOG2_CAPACITY = 16;
 
-        private static final byte MINUS = '-';
+        private static final int ENTRIES_CAPACITY = 1 << ENTRIES_LOG2_CAPACITY;
 
-        private static final int KEY_LOG2_BYTES = 7;
+        private static final int ENTRIES_MASK = ENTRIES_CAPACITY - 1;
 
-        private static final int KEY_BYTES = 1 << KEY_LOG2_BYTES;
+        private final MemorySegment segment;
 
-        private static final int MAP_CAPACITY = 1 << 16;
+        private final long size;
 
-        private static final int BUCKET_MASK = MAP_CAPACITY - 1;
+        private final Entry[] entries = new Entry[ENTRIES_CAPACITY];
 
-        private final MemorySegment segment;
+        private final byte[] keys = new byte[ENTRIES_CAPACITY * KEY_BYTES];
+
+        private final MemorySegment kms = MemorySegment.ofArray(this.keys);
 
-        private final Entry[] entries = new Entry[MAP_CAPACITY];
+        private static final int KEYS_MASK = (ENTRIES_CAPACITY * KEY_BYTES) - 1;
 
         private long offset;
 
@@ -111,26 +131,23 @@ private static final class Chunk {
 
         Chunk(MemorySegment segment) {
             this.segment = segment;
+            this.size = segment.byteSize();
         }
 
         public List<Entry> parse() {
-            long size = this.segment.byteSize();
             long safe = size - KEY_BYTES;
             while (offset < safe) {
-                var e = vectorizedEntry();
-                int value = vectorizedValue();
-                e.add(value);
+                vectorizedEntry().add(vectorizedValue());
             }
             next();
             while (hasCurrent()) {
-                var e = entry();
-                int value = value();
-                e.add(value);
+                entry().add(value());
             }
             var output = new ArrayList<Entry>(entries.length);
-            for (int i = 0; i < entries.length; i++) {
+            for (int i = 0, o = 0; i < entries.length; i++, o += KEY_BYTES) {
                 var e = entries[i];
                 if (e != null) {
+                    e.setkey(keys, o);
                     output.add(e);
                 }
             }
@@ -138,29 +155,48 @@ public List<Entry> parse() {
         }
 
         private Entry vectorizedEntry() {
-            var start = this.offset;
-            var first = ByteVector.fromMemorySegment(BYTE_SPECIES, this.segment, start, ByteOrder.nativeOrder());
-            int equals = first.eq(SEPARATOR).firstTrue();
-            int len = equals;
-            for (int i = BYTE_SPECIES_SIZE; equals == BYTE_SPECIES_LANES; i += BYTE_SPECIES_SIZE) {
-                var next = ByteVector.fromMemorySegment(BYTE_SPECIES, this.segment, start + i, ByteOrder.nativeOrder());
-                equals = next.eq(SEPARATOR).firstTrue();
+            var separators = ByteVector.broadcast(BYTE_SPECIES, SEPARATOR);
+            int len = 0;
+            for (int i = 0;; i += BYTE_SPECIES_LANES) {
+                var block = ByteVector.fromMemorySegment(BYTE_SPECIES, this.segment, offset + i, NATIVE_ORDER);
+                int equals = block.compare(VectorOperators.EQ, separators).firstTrue();
                 len += equals;
+                if (equals != BYTE_SPECIES_LANES) {
+                    break;
+                }
             }
+            var start = this.offset;
             this.offset = start + len + 1;
-            int index = hash(this.segment, start, len);
+            int hash = hash(segment, start, len);
+            int index = (hash - (hash >>> -ENTRIES_LOG2_CAPACITY)) & ENTRIES_MASK;
+            int keyOffset = index << KEY_LOG2_BYTES;
             int count = 0;
-            while (count < BUCKET_MASK) {
-                index = index & BUCKET_MASK;
+            while (count < ENTRIES_MASK) {
+                index = index & ENTRIES_MASK;
+                keyOffset = keyOffset & KEYS_MASK;
                 var e = this.entries[index];
                 if (e == null) {
-                    return this.entries[index] = new Entry(len, this.segment.asSlice(start, KEY_BYTES));
+                    MemorySegment.copy(this.segment, start, kms, keyOffset, len);
+                    return this.entries[index] = new Entry(len, hash);
                 }
-                else if (e.keyLength() == len && vectorizedEquals(e, first, start, len)) {
-                    return e;
+                else if (e.hash == hash && e.keyLength == len) {
+                    int total = 0;
+                    for (int i = 0; i < KEY_BYTES; i += BYTE_SPECIES_LANES) {
+                        var ekey = ByteVector.fromArray(BYTE_SPECIES, keys, keyOffset + i);
+                        var okey = ByteVector.fromMemorySegment(BYTE_SPECIES, this.segment, start + i, NATIVE_ORDER);
+                        int equals = ekey.compare(VectorOperators.NE, okey).firstTrue();
+                        total += equals;
+                        if (equals != BYTE_SPECIES_LANES) {
+                            break;
+                        }
+                    }
+                    if (total >= len) {
+                        return e;
+                    }
                 }
-                index++;
                 count++;
+                index++;
+                keyOffset += KEY_BYTES;
             }
             throw new IllegalStateException("Map is full!");
         }
@@ -173,19 +209,33 @@ private Entry entry() {
                 next();
             }
             expect(SEPARATOR);
-            int index = hash(segment, start, len);
+            int hash = hash(segment, start, len);
+            int index = (hash - (hash >>> -ENTRIES_LOG2_CAPACITY)) & ENTRIES_MASK;
+            int keyOffset = index << KEY_LOG2_BYTES;
             int count = 0;
-            while (count < BUCKET_MASK) {
-                index = index & BUCKET_MASK;
+            while (count < ENTRIES_MASK) {
+                index = index & ENTRIES_MASK;
+                keyOffset = keyOffset & KEYS_MASK;
                 var e = this.entries[index];
                 if (e == null) {
-                    return this.entries[index] = new Entry(len, this.segment.asSlice(start, len));
+                    MemorySegment.copy(this.segment, start, kms, keyOffset, len);
+                    return this.entries[index] = new Entry(len, hash);
                 }
-                else if (e.keyLength() == len && equals(e, start, len)) {
-                    return e;
+                else if (e.hash == hash && e.keyLength == len) {
+                    int total = 0;
+                    for (int i = 0; i < len; i++) {
+                        if (((byte) BYTE_HANDLE.get(this.segment, start + i)) != this.keys[keyOffset + i]) {
+                            break;
+                        }
+                        total++;
+                    }
+                    if (total >= len) {
+                        return e;
+                    }
                 }
-                index++;
                 count++;
+                index++;
+                keyOffset += KEY_BYTES;
             }
             throw new IllegalStateException("Map is full!");
         }
@@ -193,9 +243,9 @@ else if (e.keyLength() == len && equals(e, start, len)) {
         private static final long MULTIPLY_ADD_DIGITS = 100 * (1L << 24) + 10 * (1L << 16) + 1;
 
         private int vectorizedValue() {
-            long dw = this.segment.get(LONG_U_LE, this.offset);
-            boolean negative = ((dw & 0xFF) ^ MINUS) == 0;
+            long dw = (long) LONG_LE_HANDLE.get(this.segment, this.offset);
             int zeros = Long.numberOfTrailingZeros(~dw & 0x10101000L);
+            boolean negative = ((dw & 0xFF) ^ NEG) == 0;
             dw = ((negative ? (dw & ~0xFF) : dw) << (28 - zeros)) & 0x0F000F0F00L;
             int value = (int) (((dw * MULTIPLY_ADD_DIGITS) >>> 32) & 0x3FF);
             this.offset += (zeros >>> 3) + 3;
@@ -205,7 +255,7 @@ private int vectorizedValue() {
         private int value() {
             int value = 0;
             var negative = false;
-            if (consume(MINUS)) {
+            if (consume(NEG)) {
                 negative = true;
             }
             while (hasCurrent()) {
@@ -224,41 +274,18 @@ else if (current != DECIMAL_SEPARATOR) {
             return negative ? -value : value;
         }
 
-        private boolean vectorizedEquals(Entry entry, ByteVector okey, long offset, int len) {
-            var ekey = ByteVector.fromMemorySegment(BYTE_SPECIES, entry.segment(), 0, ByteOrder.nativeOrder());
-            int equals = ekey.eq(okey).not().firstTrue();
-            if (equals != BYTE_SPECIES_LANES) {
-                return equals >= len;
-            }
-            long eo = BYTE_SPECIES_SIZE;
-            int total = BYTE_SPECIES_LANES;
-            while (equals == BYTE_SPECIES_LANES & eo < KEY_BYTES) {
-                offset += BYTE_SPECIES_SIZE;
-                ekey = ByteVector.fromMemorySegment(BYTE_SPECIES, entry.segment(), eo, ByteOrder.nativeOrder());
-                okey = ByteVector.fromMemorySegment(BYTE_SPECIES, segment, offset, ByteOrder.nativeOrder());
-                equals = ekey.eq(okey).not().firstTrue();
-                total += equals;
-                eo += BYTE_SPECIES_SIZE;
-            }
-            return total >= len;
-        }
-
-        private boolean equals(Entry entry, long offset, int len) {
-            return MemorySegment.mismatch(this.segment, offset, offset + len, entry.segment(), 0, len) == -1;
-        }
-
         private static final int GOLDEN_RATIO = 0x9E3779B9;
         private static final int HASH_LROTATE = 5;
 
         private static int hash(MemorySegment ms, long start, int len) {
             int x, y;
             if (len >= Integer.BYTES) {
-                x = ms.get(ValueLayout.JAVA_INT_UNALIGNED, start);
-                y = ms.get(ValueLayout.JAVA_INT_UNALIGNED, start + len - Integer.BYTES);
+                x = (int) INT_HANDLE.get(ms, start);
+                y = (int) INT_HANDLE.get(ms, start + len - Integer.BYTES);
             }
             else {
-                x = ms.get(ValueLayout.JAVA_BYTE, start);
-                y = ms.get(ValueLayout.JAVA_BYTE, start + len - Byte.BYTES);
+                x = (byte) BYTE_HANDLE.get(ms, start) & 0xFF;
+                y = (byte) BYTE_HANDLE.get(ms, start + len - Byte.BYTES) & 0xFF;
             }
             return (Integer.rotateLeft(x * GOLDEN_RATIO, HASH_LROTATE) ^ y) * GOLDEN_RATIO;
         }
@@ -282,8 +309,8 @@ private boolean hasCurrent() {
         }
 
         private void next() {
-            if (offset < segment.byteSize()) {
-                this.current = segment.get(ValueLayout.JAVA_BYTE, offset++);
+            if (offset < size) {
+                this.current = (byte) BYTE_HANDLE.get(segment, offset++);
             }
             else {
                 this.hasCurrent = false;
@@ -292,9 +319,9 @@ private void next() {
     }
 
     private static final class Entry {
-        private final int keyLength;
+        final int keyLength;
 
-        private final MemorySegment segment;
+        final int hash;
 
         private int min = Integer.MAX_VALUE;
 
@@ -304,21 +331,19 @@ private static final class Entry {
 
         private int count;
 
-        Entry(int keyLength, MemorySegment segment) {
-            this.keyLength = keyLength;
-            this.segment = segment;
-        }
+        private String key;
 
-        int keyLength() {
-            return keyLength;
+        Entry(int keyLength, int hash) {
+            this.keyLength = keyLength;
+            this.hash = hash;
         }
 
-        MemorySegment segment() {
-            return segment;
+        public String key() {
+            return key;
         }
 
-        public String key() {
-            return new String(segment.asSlice(0, keyLength).toArray(ValueLayout.JAVA_BYTE), StandardCharsets.UTF_8);
+        void setkey(byte[] keys, int offset) {
+            this.key = new String(keys, offset, keyLength, StandardCharsets.UTF_8);
         }
 
         public void add(int value) {
@@ -339,13 +364,19 @@ public Entry merge(Entry o) {
         @Override
         public String toString() {
             var average = Math.round(((sum / 10.0) / count) * 10.0);
-            return decimal(min) + "/" + decimal(average) + "/" + decimal(max);
+            return decimal(min) + '/' + decimal(average) + '/' + decimal(max);
         }
 
         private static String decimal(long value) {
-            boolean negative = value < 0;
+            var builder = new StringBuilder();
+            if (value < 0) {
+                builder.append((char) NEG);
+            }
             value = Math.abs(value);
-            return (negative ? "-" : "") + (value / 10) + "." + (value % 10);
+            builder.append(value / 10);
+            builder.append((char) DECIMAL_SEPARATOR);
+            builder.append(value % 10);
+            return builder.toString();
         }
     }
 }

From 7c983f3d66f3193948fc9e63c6d735685ca11a1d Mon Sep 17 00:00:00 2001
From: Roy van Rijn <roy.van.rijn@gmail.com>
Date: Fri, 19 Jan 2024 13:15:49 -0800
Subject: [PATCH 071/268] Added dedicated reader (#493)

Started running perf, perhaps this helps. No idea how to use it yet
---
 prepare_royvanrijn.sh                         |   2 +-
 .../onebrc/CalculateAverage_royvanrijn.java   | 668 ++++++++++++------
 2 files changed, 458 insertions(+), 212 deletions(-)

diff --git a/prepare_royvanrijn.sh b/prepare_royvanrijn.sh
index 2088b7b30..ba89535d7 100755
--- a/prepare_royvanrijn.sh
+++ b/prepare_royvanrijn.sh
@@ -22,7 +22,7 @@ sdk use java 21.0.1-graal 1>&2
 if [ ! -f target/CalculateAverage_royvanrijn_image ]; then
 
     JAVA_OPTS="--enable-preview -dsa"
-    NATIVE_IMAGE_OPTS="--gc=epsilon -Ob -O3 -march=native --strict-image-heap $JAVA_OPTS"
+    NATIVE_IMAGE_OPTS="--initialize-at-build-time=dev.morling.onebrc.CalculateAverage_royvanrijn --gc=epsilon -Ob -O3 -march=native --strict-image-heap $JAVA_OPTS"
 
     native-image $NATIVE_IMAGE_OPTS -cp target/average-1.0.0-SNAPSHOT.jar -o target/CalculateAverage_royvanrijn_image dev.morling.onebrc.CalculateAverage_royvanrijn
 fi
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_royvanrijn.java b/src/main/java/dev/morling/onebrc/CalculateAverage_royvanrijn.java
index 307833f70..b392e5801 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_royvanrijn.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_royvanrijn.java
@@ -15,17 +15,15 @@
  */
 package dev.morling.onebrc;
 
-import java.io.IOException;
 import java.lang.foreign.Arena;
 import java.lang.reflect.Field;
 import java.nio.channels.FileChannel;
 import java.nio.charset.StandardCharsets;
 import java.nio.file.Path;
 import java.nio.file.StandardOpenOption;
-import java.util.HashMap;
 import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
 import java.util.stream.Collectors;
-import java.util.stream.IntStream;
 
 import sun.misc.Unsafe;
 
@@ -55,216 +53,500 @@
  * Remove writing to buffer:         1335 ms
  * Optimized collecting at the end:  1310 ms
  * Adding a lot of comments:         priceless
+ * Changed to flyweight byte[]:      1290 ms (adds even more Unsafe, was initially slower, now faster)
+ * More LOC now parallel:            1260 ms (moved more to processMemoryArea, recombining in ConcurrentHashMap)
+ * Storing only the address:         1240 ms (this is now faster, tried before, was slower)
+ * Unrolling scan-loop:              1200 ms (seems to help, perhaps even more on target machine)
+ * Adding more readable reader:      1300 ms (scores got worse on target machine anyway)
  *
- * Big thanks to Francesco Nigro, Thomas Wuerthinger, Quan Anh Mai for ideas.
+ * I've ditched my M2 for an older x86-64 MacBook; this allows me to run `perf`, and I'm trying to get lower numbers by trial and error.
+ *
+ * Big thanks to Francesco Nigro, Thomas Wuerthinger, Quan Anh Mai and many others for ideas.
  *
  * Follow me at: @royvanrijn
  */
 public class CalculateAverage_royvanrijn {
 
     private static final String FILE = "./measurements.txt";
+    // private static final String FILE = "src/test/resources/samples/measurements-1.txt";
 
     private static final Unsafe UNSAFE = initUnsafe();
 
-    private static Unsafe initUnsafe() {
-        try {
-            final Field theUnsafe = Unsafe.class.getDeclaredField("theUnsafe");
-            theUnsafe.setAccessible(true);
-            return (Unsafe) theUnsafe.get(Unsafe.class);
-        }
-        catch (NoSuchFieldException | IllegalAccessException e) {
-            throw new RuntimeException(e);
-        }
-    }
+    // Number of segments/workers: one per available processor.
+    private static final int PROCESSORS = Runtime.getRuntime().availableProcessors();
+
+    /**
+     * Flyweight entry in a byte[], max 128 bytes.
+     *
+     * byte: length
+     * long: sum
+     * int:  min
+     * int:  max
+     * int:  count
+     * byte[]: cityname
+     */
+    // ------------------------------------------------------------------------
+    private static final int ENTRY_LENGTH = (Unsafe.ARRAY_BYTE_BASE_OFFSET);
+    private static final int ENTRY_SUM = (ENTRY_LENGTH + Byte.BYTES);
+    private static final int ENTRY_MIN = (ENTRY_SUM + Long.BYTES);
+    private static final int ENTRY_MAX = (ENTRY_MIN + Integer.BYTES);
+    private static final int ENTRY_COUNT = (ENTRY_MAX + Integer.BYTES);
+    private static final int ENTRY_NAME = (ENTRY_COUNT + Integer.BYTES);
+    private static final int ENTRY_NAME_8 = ENTRY_NAME + 8;
+    private static final int ENTRY_NAME_16 = ENTRY_NAME + 16;
+
+    private static final int ENTRY_BASESIZE_WHITESPACE = ENTRY_NAME + 7; // with enough empty bytes to fill a long
+    // ------------------------------------------------------------------------
+    private static final int PREMADE_MAX_SIZE = 1 << 5; // pre-initialize some entries in memory, keep them close
+    private static final int PREMADE_ENTRIES = 512; // amount of pre-created entries we should use
+    private static final int TABLE_SIZE = 1 << 19; // large enough for the contest.
+    private static final int TABLE_MASK = (TABLE_SIZE - 1);
 
     public static void main(String[] args) throws Exception {
+
         // Calculate input segments.
-        final int numberOfChunks = Runtime.getRuntime().availableProcessors();
-        final long[] chunks = getSegments(numberOfChunks);
-
-        final Map<String, Entry> measurements = HashMap.newHashMap(1 << 10);
-        IntStream.range(0, chunks.length - 1)
-                .mapToObj(chunkIndex -> processMemoryArea(chunks[chunkIndex], chunks[chunkIndex + 1]))
-                .parallel()
-                .forEachOrdered(repo -> { // make sure it's ordered, no concurrent map
-                    for (Entry entry : repo) {
-                        if (entry != null)
-                            measurements.merge(turnLongArrayIntoString(entry.data, entry.length), entry, Entry::mergeWith);
+        final FileChannel fileChannel = FileChannel.open(Path.of(FILE), StandardOpenOption.READ);
+        final long fileSize = fileChannel.size();
+        final long segmentSize = (fileSize + PROCESSORS - 1) / PROCESSORS;
+        final long mapAddress = fileChannel.map(FileChannel.MapMode.READ_ONLY, 0, fileSize, Arena.global()).address();
+
+        final Thread[] parallelThreads = new Thread[PROCESSORS - 1];
+
+        // This is where the entries will land:
+        final ConcurrentHashMap<String, byte[]> measurements = new ConcurrentHashMap(1 << 10);
+
+        // We create a separate thread for all but one of the processors; the current thread takes the final segment.
+        long lastAddress = mapAddress;
+        final long endOfFile = mapAddress + fileSize;
+        for (int i = 0; i < PROCESSORS - 1; ++i) {
+
+            final long fromAddress = lastAddress;
+            final long toAddress = Math.min(endOfFile, fromAddress + segmentSize);
+
+            final Thread thread = new Thread(() -> {
+                // The actual work is done here:
+                final byte[][] table = processMemoryArea(fromAddress, toAddress, fromAddress == mapAddress);
+
+                for (byte[] entry : table) {
+                    if (entry != null) {
+                        measurements.merge(entryToName(entry), entry, CalculateAverage_royvanrijn::mergeEntry);
                     }
-                });
+                }
+            });
+            thread.start(); // start a.s.a.p.
+            parallelThreads[i] = thread;
+            lastAddress = toAddress;
+        }
+
+        // Use the current thread for the remaining part of memory:
+        final byte[][] table = processMemoryArea(lastAddress, mapAddress + fileSize, false);
 
+        for (byte[] entry : table) {
+            if (entry != null) {
+                measurements.merge(entryToName(entry), entry, CalculateAverage_royvanrijn::mergeEntry);
+            }
+        }
+        // Wait for all threads to finish:
+        for (Thread thread : parallelThreads) {
+            // Can we implement work-stealing? Not sure how...
+            thread.join();
+        }
+
+        // Print all merged entries, sorted by name:
         System.out.print("{" +
-                measurements.entrySet().stream().sorted(Map.Entry.comparingByKey()).map(Object::toString).collect(Collectors.joining(", ")));
+                measurements.entrySet().stream().sorted(Map.Entry.comparingByKey())
+                        .map(entry -> entry.getKey() + '=' + entryValuesToString(entry.getValue()))
+                        .collect(Collectors.joining(", ")));
         System.out.println("}");
+
+        // System.out.println(measurements.entrySet().stream().mapToLong(e -> UNSAFE.getInt(e.getValue(), ENTRY_COUNT + Unsafe.ARRAY_BYTE_BASE_OFFSET)).sum());
     }
 
-    /**
-     * Simpler way to get the segments and launch parallel processing by thomaswue
-     */
-    private static long[] getSegments(final int numberOfChunks) throws IOException {
-        try (var fileChannel = FileChannel.open(Path.of(FILE), StandardOpenOption.READ)) {
-            final long fileSize = fileChannel.size();
-            final long segmentSize = (fileSize + numberOfChunks - 1) / numberOfChunks;
-            final long[] chunks = new long[numberOfChunks + 1];
-            final long mappedAddress = fileChannel.map(FileChannel.MapMode.READ_ONLY, 0, fileSize, Arena.global()).address();
-            chunks[0] = mappedAddress;
-            final long endAddress = mappedAddress + fileSize;
-            for (int i = 1; i < numberOfChunks; ++i) {
-                long chunkAddress = mappedAddress + i * segmentSize;
-                // Align to first row start.
-                while (chunkAddress < endAddress && UNSAFE.getByte(chunkAddress++) != '\n') {
-                    // nop
+    private static byte[] fillEntry(final byte[] entry, final long fromAddress, final int length, final int temp) {
+        UNSAFE.putLong(entry, ENTRY_SUM, temp);
+        UNSAFE.putInt(entry, ENTRY_MIN, temp);
+        UNSAFE.putInt(entry, ENTRY_MAX, temp);
+        UNSAFE.putInt(entry, ENTRY_COUNT, 1);
+        UNSAFE.putByte(entry, ENTRY_LENGTH, (byte) length);
+        UNSAFE.copyMemory(null, fromAddress, entry, ENTRY_NAME, length);
+        return entry;
+    }
+
+    public static void updateEntry(final byte[] entry, final int temp) {
+
+        int entryMin = UNSAFE.getInt(entry, ENTRY_MIN);
+        int entryMax = UNSAFE.getInt(entry, ENTRY_MAX);
+
+        entryMin = Math.min(temp, entryMin);
+        entryMax = Math.max(temp, entryMax);
+
+        long entrySum = UNSAFE.getLong(entry, ENTRY_SUM) + temp;
+        int entryCount = UNSAFE.getInt(entry, ENTRY_COUNT) + 1;
+
+        UNSAFE.putInt(entry, ENTRY_MIN, entryMin);
+        UNSAFE.putInt(entry, ENTRY_MAX, entryMax);
+        UNSAFE.putInt(entry, ENTRY_COUNT, entryCount);
+        UNSAFE.putLong(entry, ENTRY_SUM, entrySum);
+    }
+
+    public static byte[] mergeEntry(final byte[] entry, final byte[] merge) {
+
+        long sum = UNSAFE.getLong(merge, ENTRY_SUM);
+        final int mergeMin = UNSAFE.getInt(merge, ENTRY_MIN);
+        final int mergeMax = UNSAFE.getInt(merge, ENTRY_MAX);
+        int count = UNSAFE.getInt(merge, ENTRY_COUNT);
+
+        sum += UNSAFE.getLong(entry, ENTRY_SUM);
+        int entryMin = UNSAFE.getInt(entry, ENTRY_MIN);
+        int entryMax = UNSAFE.getInt(entry, ENTRY_MAX);
+        count += UNSAFE.getInt(entry, ENTRY_COUNT);
+
+        entryMin = Math.min(entryMin, mergeMin);
+        entryMax = Math.max(entryMax, mergeMax);
+
+        UNSAFE.putLong(entry, ENTRY_SUM, sum);
+        UNSAFE.putInt(entry, ENTRY_MIN, entryMin);
+        UNSAFE.putInt(entry, ENTRY_MAX, entryMax);
+        UNSAFE.putInt(entry, ENTRY_COUNT, count);
+        return entry;
+    }
+
+    private static String entryToName(final byte[] entry) {
+        // Get the length from memory:
+        int length = UNSAFE.getByte(entry, ENTRY_LENGTH);
+
+        byte[] name = new byte[length];
+        UNSAFE.copyMemory(entry, ENTRY_NAME, name, Unsafe.ARRAY_BYTE_BASE_OFFSET, length);
+
+        // Create a new String with the existing byte[]:
+        return new String(name, StandardCharsets.UTF_8);
+    }
+
+    private static String entryValuesToString(final byte[] entry) {
+        return round(UNSAFE.getInt(entry, ENTRY_MIN))
+                + "/" +
+                round((1.0 * UNSAFE.getLong(entry, ENTRY_SUM)) /
+                        UNSAFE.getInt(entry, ENTRY_COUNT))
+                + "/" +
+                round(UNSAFE.getInt(entry, ENTRY_MAX));
+    }
+
+    // Print a piece of memory:
+    // For debug.
+    private static String printMemory(final Object target, final long address, int length) {
+        String result = "";
+        for (int i = 0; i < length; i++) {
+            result += (char) UNSAFE.getByte(target, address + i);
+        }
+        return result;
+    }
+
+    // Print a piece of memory:
+    // For debug.
+    private static String printMemory(final long value, int length) {
+        String result = "";
+        for (int i = 0; i < length; i++) {
+            result += (char) ((value >> (i << 3)) & 0xFF);
+        }
+        return result;
+    }
+
+    private static double round(final double value) {
+        return Math.round(value) / 10.0;
+    }
+
+    private static final class Reader {
+
+        private long ptr;
+        private long delimiterMask;
+        private long lastRead;
+        private long lastReadMinOne;
+
+        private long hash;
+        private long entryStart;
+        private long entryDelimiter;
+
+        private final long endAddress;
+
+        Reader(final long startAddress, final long endAddress, final boolean isFileStart) {
+
+            this.ptr = startAddress;
+            this.endAddress = endAddress;
+
+            // Skip ahead to the first line that begins at or after startAddress:
+            if (!isFileStart) {
+                ptr--;
+                while (ptr < endAddress) {
+                    if (UNSAFE.getByte(ptr++) == '\n') {
+                        break;
+                    }
                 }
-                chunks[i] = Math.min(chunkAddress, endAddress);
             }
-            chunks[numberOfChunks] = endAddress;
-            return chunks;
         }
-    }
 
-    // This is where I store the hashtable entry data in the "hot loop"
-    // The long[] contains the name in bytes (yeah, confusing)
-    // I've tried flyweight-ing, carrying all the data in a single byte[],
-    // where you offset type-indices: min:int,max:int,count:int,etc.
-    //
-    // The performance was just a little worse than this simple class.
-    static final class Entry {
-
-        private int min, max, count;
-        private byte length;
-        private long sum;
-        private final long[] data;
-
-        Entry(final long[] data, byte length, int temp) {
-            this.data = data;
-            this.length = length;
-            this.min = temp;
-            this.max = temp;
-            this.sum = temp;
-            this.count = 1;
+        private void processStart() {
+            hash = 0;
+            entryStart = ptr;
         }
 
-        public void updateWith(int measurement) {
-            min = Math.min(min, measurement);
-            max = Math.max(max, measurement);
-            sum += measurement;
-            count++;
+        private boolean hasNext() {
+            return (ptr < endAddress);
         }
 
-        public Entry mergeWith(Entry entry) {
-            min = Math.min(min, entry.min);
-            max = Math.max(max, entry.max);
-            sum += entry.sum;
-            count += entry.count;
-            return this;
+        private static final long DELIMITER_MASK = 0x3B3B3B3B3B3B3B3BL;
+
+        private boolean readFirst() {
+            lastRead = UNSAFE.getLong(ptr);
+
+            final long match = lastRead ^ DELIMITER_MASK;
+            delimiterMask = (match - 0x0101010101010101L) & (~match & 0x8080808080808080L);
+
+            return delimiterMask == 0;
         }
 
-        public String toString() {
-            return round(min) + "/" + round((1.0 * sum) / count) + "/" + round(max);
+        private boolean readNext() {
+            lastReadMinOne = lastRead;
+            return readFirst();
         }
 
-        private static double round(double value) {
-            return Math.round(value) / 10.0;
+        private void processName() {
+            hash ^= lastRead;
+            ptr += 8;
         }
-    }
 
-    // Only parse the String at the final end, when we have only the needed entries left that we need to output:
-    private static String turnLongArrayIntoString(final long[] data, final int length) {
-        // Create our target byte[]
-        final byte[] bytes = new byte[length];
-        // The power of magic allows us to just copy the memory in there.
-        UNSAFE.copyMemory(data, Unsafe.ARRAY_LONG_BASE_OFFSET, bytes, Unsafe.ARRAY_BYTE_BASE_OFFSET, length);
-        // And construct a String()
-        return new String(bytes, StandardCharsets.UTF_8);
-    }
+        private int processEndAndGetTemperature() {
+            processFinalBytes();
 
-    private static Entry createNewEntry(final long fromAddress, final int lengthLongs, final byte lengthBytes, final int temp) {
-        // Make a copy of our working buffer, store this in a new Entry:
-        final long[] bufferCopy = new long[lengthLongs];
-        // Just copy everything over, bytes into the long[]
-        UNSAFE.copyMemory(null, fromAddress, bufferCopy, Unsafe.ARRAY_BYTE_BASE_OFFSET, lengthBytes);
-        return new Entry(bufferCopy, lengthBytes, temp);
-    }
+            finalizeHash();
+            finalizeDelimiter();
 
-    private static final int TABLE_SIZE = 1 << 19;
-    private static final int TABLE_MASK = (TABLE_SIZE - 1);
+            return readTemperature();
+        }
 
-    private static Entry[] processMemoryArea(final long fromAddress, final long toAddress) {
+        private void processFinalBytes() {
+            // Keep only the bytes before the first delimiter, zeroing the rest:
+            lastRead &= ((delimiterMask >>> 7) - 1);
+        }
 
-        int packedBytes;
-        long hash;
-        long ptr = fromAddress;
-        long word;
-        long mask;
+        private void finalizeHash() {
+            // Finalize hash:
+            hash ^= lastRead;
+            hash ^= hash >> 32;
+            hash ^= hash >> 17; // extra entropy
+        }
 
-        final Entry[] table = new Entry[TABLE_SIZE];
+        private void finalizeDelimiter() {
+            // Found delimiter:
+            entryDelimiter = ptr + (Long.numberOfTrailingZeros(delimiterMask) >> 3);
+        }
 
-        // Go from start to finish address through the bytes:
-        while (ptr < toAddress) {
+        private static final long DOT_BITS = 0x10101000;
+        private static final long MAGIC_MULTIPLIER = (100 * 0x1000000 + 10 * 0x10000 + 1);
 
-            final long startAddress = ptr;
+        // Awesome idea of merykitty:
+        private int readTemperature() {
+            // This is the number part: X.X, -X.X, XX.X or -XX.X
+            long numberBytes = UNSAFE.getLong(entryDelimiter + 1);
+            long invNumberBytes = ~numberBytes;
+
+            int dotPosition = Long.numberOfTrailingZeros(invNumberBytes & DOT_BITS);
+
+            // Update the pointer here, bit awkward, but we have all the data
+            ptr = entryDelimiter + (dotPosition >> 3) + 4;
+
+            int min28 = (28 - dotPosition);
+            // Calculates the sign
+            final long signed = (invNumberBytes << 59) >> 63;
+            final long minusFilter = ~(signed & 0xFF);
+            // Use the pre-calculated decimal position to adjust the values
+            long digits = ((numberBytes & minusFilter) << min28) & 0x0F000F0F00L;
+            // Multiply by a magic (100 * 0x1000000 + 10 * 0x10000 + 1), to get the result
+            final long absValue = ((digits * MAGIC_MULTIPLIER) >>> 32) & 0x3FF;
+            // And apply the sign: negate when 'signed' is -1 (two's complement trick)
+            return (int) ((absValue + signed) ^ signed); // non-patented method of doing the same trick
+        }
 
-            packedBytes = 1;
-            hash = 0;
-            word = UNSAFE.getLong(ptr);
-            mask = getDelimiterMask(word);
-
-            // Removed writing to a buffer here, why would we, we know the address and we'll need to check there anyway.
-            while (mask == 0) {
-                // If the mask is zero, we have no ';'
-                packedBytes++;
-                // So we continue building the hash:
-                hash ^= word;
-                ptr += 8;
-
-                // And getting a new value and mask:
-                word = UNSAFE.getLong(ptr);
-                mask = getDelimiterMask(word);
+        private boolean matchesEntryFull(final byte[] entry) {
+            int longs = (int) (entryDelimiter - entryStart) >> 3;
+            int step = 0;
+            for (int i = 0; i < longs - 2; i++) {
+                if (UNSAFE.getLong(entryStart + step) != UNSAFE.getLong(entry, ENTRY_NAME + step)) {
+                    return false;
+                }
+                step += 8;
             }
+            if (lastReadMinOne != UNSAFE.getLong(entry, (ENTRY_NAME_8) + step)) {
+                return false;
+            }
+            if (lastRead != UNSAFE.getLong(entry, (ENTRY_NAME_16) + step)) {
+                return false;
+            }
+            return true;
 
-            // Found delimiter:
-            final int delimiterByte = Long.numberOfTrailingZeros(mask);
-            final long delimiterAddress = ptr + (delimiterByte >> 3);
-
-            // Finish the masks and hash:
-            final long partialWord = word & ((mask >>> 7) - 1);
-            hash ^= partialWord;
-
-            // Read a long value from memory starting from the delimiter + 1, the number part:
-            final long numberBytes = UNSAFE.getLong(delimiterAddress + 1);
-            final long invNumberBytes = ~numberBytes;
-
-            // Adjust our pointer
-            final int decimalSepPos = Long.numberOfTrailingZeros(invNumberBytes & DOT_BITS);
-            ptr = delimiterAddress + (decimalSepPos >> 3) + 4;
-
-            // Calculate the final hash and index of the table:
-            int intHash = (int) (hash ^ (hash >> 32));
-            intHash = intHash ^ (intHash >> 17);
-            int index = intHash & TABLE_MASK;
-
-            // Find or insert the entry:
-            while (true) {
-                Entry tableEntry = table[index];
-                if (tableEntry == null) {
-                    final int temp = extractTemp(decimalSepPos, invNumberBytes, numberBytes);
-                    // Create a new entry:
-                    final byte length = (byte) (delimiterAddress - startAddress);
-                    table[index] = createNewEntry(startAddress, packedBytes, length, temp);
-                    break;
+        }
+
+        private boolean matchesEntryMedium(final byte[] entry) {
+            if (UNSAFE.getLong(entryStart) != UNSAFE.getLong(entry, ENTRY_NAME)) {
+                return false;
+            }
+            if (lastReadMinOne != UNSAFE.getLong(entry, ENTRY_NAME_8)) {
+                return false;
+            }
+            if (lastRead != UNSAFE.getLong(entry, ENTRY_NAME_16)) {
+                return false;
+            }
+            return true;
+        }
+
+        private boolean matchesEntryShort(final byte[] entry) {
+            if (lastReadMinOne != UNSAFE.getLong(entry, ENTRY_NAME)) {
+                return false;
+            }
+            if (lastRead != UNSAFE.getLong(entry, ENTRY_NAME_8)) {
+                return false;
+            }
+            return true;
+        }
+
+        private boolean matchesEnding(final byte[] entry) {
+            return lastRead == UNSAFE.getLong(entry, ENTRY_NAME);
+        }
+
+        private int length() {
+            return (int) (entryDelimiter - entryStart);
+
+        }
+
+    }
+
+    private static byte[][] processMemoryArea(final long startAddress, final long endAddress, boolean isFileStart) {
+
+        final byte[][] table = new byte[TABLE_SIZE][];
+        final byte[][] preConstructedEntries = new byte[PREMADE_ENTRIES][ENTRY_BASESIZE_WHITESPACE + PREMADE_MAX_SIZE];
+
+        final Reader reader = new Reader(startAddress, endAddress, isFileStart);
+
+        byte[] entry;
+        int entryCount = 0;
+
+        // Process every entry until the end of this memory area is reached
+        while (reader.hasNext()) {
+
+            reader.processStart();
+
+            if (!reader.readFirst()) {
+                int temperature = reader.processEndAndGetTemperature();
+
+                // Find or insert the entry:
+                int index = (int) (reader.hash & TABLE_MASK);
+                while (true) {
+                    entry = table[index];
+                    if (entry == null) {
+                        int length = reader.length();
+                        byte[] entryBytes = (entryCount < PREMADE_ENTRIES) ? preConstructedEntries[entryCount++]
+                                : new byte[ENTRY_BASESIZE_WHITESPACE + length];
+                        table[index] = fillEntry(entryBytes, reader.entryStart, length, temperature);
+                        break;
+                    }
+                    else if (reader.matchesEnding(entry)) {
+                        updateEntry(entry, temperature);
+                        break;
+                    }
+                    else {
+                        // Move to the next index
+                        index = (index + 1) & TABLE_MASK;
+                    }
                 }
-                // Don't bother re-checking things here like hash or length.
-                // we'll need to check the content anyway if it's a hit, which is most times
-                else if (memoryEqualsEntry(startAddress, tableEntry.data, partialWord, packedBytes)) {
-                    // temperature, you're not temporary my friend
-                    final int temp = extractTemp(decimalSepPos, invNumberBytes, numberBytes);
-                    // No differences, same entry:
-                    tableEntry.updateWith(temp);
-                    break;
+            }
+            else {
+                reader.processName();
+
+                if (!reader.readNext()) {
+
+                    int temperature = reader.processEndAndGetTemperature();
+
+                    // Find or insert the entry:
+                    int index = (int) (reader.hash & TABLE_MASK);
+                    while (true) {
+                        entry = table[index];
+                        if (entry == null) {
+                            int length = reader.length();
+                            byte[] entryBytes = (entryCount < PREMADE_ENTRIES) ? preConstructedEntries[entryCount++]
+                                    : new byte[ENTRY_BASESIZE_WHITESPACE + length];
+                            table[index] = fillEntry(entryBytes, reader.entryStart, length, temperature);
+                            break;
+                        }
+                        else if (reader.matchesEntryShort(entry)) {
+                            updateEntry(entry, temperature);
+                            break;
+                        }
+                        else {
+                            // Move to the next index
+                            index = (index + 1) & TABLE_MASK;
+                        }
+                    }
+                }
+                else {
+                    reader.processName();
+
+                    if (!reader.readNext()) {
+                        int temperature = reader.processEndAndGetTemperature();
+
+                        // Find or insert the entry:
+                        int index = (int) (reader.hash & TABLE_MASK);
+                        while (true) {
+                            entry = table[index];
+                            if (entry == null) {
+                                int length = reader.length();
+                                byte[] entryBytes = (entryCount < PREMADE_ENTRIES) ? preConstructedEntries[entryCount++]
+                                        : new byte[ENTRY_BASESIZE_WHITESPACE + length];
+                                table[index] = fillEntry(entryBytes, reader.entryStart, length, temperature);
+                                break;
+                            }
+                            else if (reader.matchesEntryMedium(entry)) {
+                                updateEntry(entry, temperature);
+                                break;
+                            }
+                            else {
+                                // Move to the next index
+                                index = (index + 1) & TABLE_MASK;
+                            }
+                        }
+
+                    }
+                    else {
+
+                        reader.processName();
+                        while (reader.readNext()) {
+                            reader.processName();
+                        }
+
+                        int temperature = reader.processEndAndGetTemperature();
+
+                        // Find or insert the entry:
+                        int index = (int) (reader.hash & TABLE_MASK);
+                        while (true) {
+                            entry = table[index];
+                            if (entry == null) {
+                                int length = reader.length();
+                                byte[] entryBytes = (length < PREMADE_MAX_SIZE && entryCount < PREMADE_ENTRIES) ? preConstructedEntries[entryCount++]
+                                        : new byte[ENTRY_BASESIZE_WHITESPACE + length]; // with enough room
+                                table[index] = fillEntry(entryBytes, reader.entryStart, length, temperature);
+                                break;
+                            }
+                            else if (reader.matchesEntryFull(entry)) {
+                                updateEntry(entry, temperature);
+                                break;
+                            }
+                            else {
+                                // Move to the next index
+                                index = (index + 1) & TABLE_MASK;
+                            }
+                        }
+                    }
                 }
-                // Move to the next in the table, linear probing:
-                index = (index + 1) & TABLE_MASK;
             }
+
         }
         return table;
     }
@@ -277,52 +559,16 @@ else if (memoryEqualsEntry(startAddress, tableEntry.data, partialWord, packedByt
      * ---------------- BETTER SOFTWARE, FASTER --
      *
      * https://www.openvalue.eu/
-     *
-     * Made you look.
-     *
      */
 
-    private static final long DOT_BITS = 0x10101000;
-    private static final long MAGIC_MULTIPLIER = (100 * 0x1000000 + 10 * 0x10000 + 1);
-
-    private static int extractTemp(final int decimalSepPos, final long invNumberBits, final long numberBits) {
-        // Awesome idea of merykitty:
-        int min28 = (28 - decimalSepPos);
-        // Calculates the sign
-        final long signed = (invNumberBits << 59) >> 63;
-        final long minusFilter = ~(signed & 0xFF);
-        // Use the pre-calculated decimal position to adjust the values
-        final long digits = ((numberBits & minusFilter) << min28) & 0x0F000F0F00L;
-        // Multiply by a magic (100 * 0x1000000 + 10 * 0x10000 + 1), to get the result
-        final long absValue = ((digits * MAGIC_MULTIPLIER) >>> 32) & 0x3FF;
-        // And perform abs()
-        final int temp = (int) ((absValue + signed) ^ signed); // non-patented method of doing the same trick
-        return temp;
-    }
-
-    private static final long SEPARATOR_PATTERN = 0x3B3B3B3B3B3B3B3BL;
-
-    // Takes a long and finds the bytes where this exact pattern is present.
-    // Cool bit manipulation technique: SWAR (SIMD as a Register).
-    private static long getDelimiterMask(final long word) {
-        final long match = word ^ SEPARATOR_PATTERN;
-        return (match - 0x0101010101010101L) & (~match & 0x8080808080808080L);
-        // I've put some brackets separating the first and second part, this is faster.
-        // Now they run simultaneous after 'match' is altered, instead of waiting on each other.
-    }
-
-    /**
-     * For case multiple hashes are equal (however unlikely) check the actual key (using longs)
-     */
-    private static boolean memoryEqualsEntry(final long startAddress, final long[] entry, final long finalBytes, final int amountLong) {
-        for (int i = 0; i < (amountLong - 1); i++) {
-            int step = i << 3; // step by 8 bytes
-            if (UNSAFE.getLong(startAddress + step) != entry[i])
-                return false;
+    private static Unsafe initUnsafe() { // Reflectively fetches the Unsafe instance stored in Unsafe's own "theUnsafe" field
+        try {
+            final Field theUnsafe = Unsafe.class.getDeclaredField("theUnsafe");
+            theUnsafe.setAccessible(true); // lift the access check so the field can be read from this class
+            return (Unsafe) theUnsafe.get(Unsafe.class); // theUnsafe is a static field, so the object argument passed to get() is ignored
+        }
+        catch (NoSuchFieldException | IllegalAccessException e) {
+            throw new RuntimeException(e); // rethrow reflection failures as unchecked, keeping the original cause
         }
-        // If all previous 'whole' 8-packed byte-long values are equal
-        // We still need to check the final bytes that don't fit.
-        // and we've already calculated them for the hash.
-        return finalBytes == entry[amountLong - 1];
     }
 }

From f435d64dffdc43a8d5b59c09461fd49cc2b9f1c0 Mon Sep 17 00:00:00 2001
From: Gunnar Morling <gunnar.morling@googlemail.com>
Date: Fri, 19 Jan 2024 22:17:54 +0100
Subject: [PATCH 072/268] Leaderboard update

---
 README.md | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index 3de3ef7e7..a9dd1ce15 100644
--- a/README.md
+++ b/README.md
@@ -41,24 +41,26 @@ These are the results from running all entries into the challenge on eight cores
 
 | # | Result (m:s.ms) | Implementation     | JDK | Submitter     | Notes     |
 |---|-----------------|--------------------|-----|---------------|-----------|
-| 1* | 00:02.552 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_thomaswue.java)| 21.0.1-graal | [Thomas Wuerthinger](https://github.com/thomaswue), [Quan Anh Mai](https://github.com/merykitty), [Alfonso² Peterssen](https://github.com/mukel) | GraalVM native binary |
-| 1* | 00:02.575 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_merykittyunsafe.java)| 21.0.1-open | [Quan Anh Mai](https://github.com/merykitty) | Quan Anh Mai's implementation, using `Unsafe` |
-| 3* | 00:02.602 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java)| 21.0.1-graal | [Artsiom Korzun](https://github.com/artsiomkorzun) | GraalVM native binary |
-|   | 00:02.692 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java)| 21.0.1-graal | [Van Phu DO](https://github.com/abeobk) | GraalVM native binary |
-|   | 00:02.855 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_royvanrijn.java)| 21.0.1-graal | [Roy van Rijn](https://github.com/royvanrijn) | GraalVM native binary |
+| 1* | 00:02.461 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java)| 21.0.1-graal | [Artsiom Korzun](https://github.com/artsiomkorzun) | GraalVM native binary |
+| 1* | 00:02.477 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java)| 21.0.1-graal | [Van Phu DO](https://github.com/abeobk) | GraalVM native binary |
+| 3* | 00:02.552 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_thomaswue.java)| 21.0.1-graal | [Thomas Wuerthinger](https://github.com/thomaswue), [Quan Anh Mai](https://github.com/merykitty), [Alfonso² Peterssen](https://github.com/mukel) | GraalVM native binary |
+| 3*  | 00:02.571 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_royvanrijn.java)| 21.0.1-graal | [Roy van Rijn](https://github.com/royvanrijn) | GraalVM native binary |
+| 3* | 00:02.575 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_merykittyunsafe.java)| 21.0.1-open | [Quan Anh Mai](https://github.com/merykitty) | Quan Anh Mai's implementation, using `Unsafe` |
 |   | 00:02.971 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jerrinot.java)| 21.0.1-graal | [Jaromir Hamala](https://github.com/jerrinot) |  |
 |   | 00:03.258 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_merykitty.java)| 21.0.1-open | [Quan Anh Mai](https://github.com/merykitty) |  |
 |   | 00:03.376 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_mtopolnik.java)| 21.0.1-graal | [Marko Topolnik](https://github.com/mtopolnik) |  |
 |   | 00:03.714 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_hundredwatt.java)| 21.0.1-graal | [Jason Nochlin](https://github.com/hundredwatt) |  |
 |   | 00:03.718 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_zerninv.java)| 21.0.1-graal | [zerninv](https://github.com/zerninv) |  |
+|   | 00:03.902 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jparera.java)| 21.0.1-open | [Juan Parera](https://github.com/jparera) |  |
 |   | 00:03.959 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_gonix.java)| 21.0.1-open | [gonix](https://github.com/gonix) |  |
+|   | 00:03.966 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jincongho.java)| 21.0.1-open | [Jin Cong Ho](https://github.com/jincongho) |  |
 |   | 00:04.066 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_JesseVanRooy.java)| 21.0.1-open | [JesseVanRooy](https://github.com/JesseVanRooy) |  |
 |   | 00:04.154 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_iziamos.java)| 21.0.1-open | [John Ziamos](https://github.com/iziamos) |  |
 |   | 00:04.726 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ebarlas.java)| 21.0.1-graal | [Elliot Barlas](https://github.com/ebarlas) |  | 
 |   | 00:04.741 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_cliffclick.java)| 21.0.1-open | [Cliff Click](https://github.com/cliffclick) |  |
 |   | 00:04.823 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_JamalMulla.java)| 21.0.1-graal | [Jamal Mulla](https://github.com/JamalMulla) |  |
+|   | 00:04.920 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_vemana.java)| 21.0.1-graal | [Subrahmanyam](https://github.com/vemana) |  |
 |   | 00:04.959 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_yavuztas.java)| 21.0.1-graal | [Yavuz Tas](https://github.com/yavuztas) |  |
-|   | 00:05.089 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_vemana.java)| 21.0.1-graal | [Subrahmanyam](https://github.com/vemana) |  |
 |   | 00:05.142 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_arjenw.java)| 21.0.1-open | [Arjen Wisse](https://github.com/arjenw) |  |
 |   | 00:05.235 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_unbounded.java)| 21.0.1-open | [unbounded](https://github.com/unbounded) |  |
 |   | 00:05.283 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_roman-r-m.java)| 21.0.1-graal | [Roman Musin](https://github.com/roman-r-m) |  |
@@ -73,12 +75,10 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:06.654 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jbachorik.java)| 21.0.1-graal | [Jaroslav Bachorik](https://github.com/jbachorik) |  |
 |   | 00:06.576 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_as-com.java)| 21.0.1-open | [Andrew Sun](https://github.com/as-com) |  |
 |   | 00:06.715 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_algirdasrascius.java)| 21.0.1-open | [Algirdas Raščius](https://github.com/algirdasrascius) |  |
-|   | 00:06.911 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jincongho.java)| 21.0.1-open | [Jin Cong Ho](https://github.com/jincongho) |  |
 |   | 00:07.730 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jotschi.java)| 21.0.1-open | [Johannes Schüth](https://github.com/jotschi) |  |
 |   | 00:07.925 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ricardopieper.java)| 21.0.1-graal | [Ricardo Pieper](https://github.com/ricardopieper) |  |
 |   | 00:07.913 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_parkertimmins.java)| 21.0.1-open | [parkertimmins](https://github.com/parkertimmins) |  |
 |   | 00:08.045 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ianopolousfast.java)| 21.0.1-open | [Dr Ian Preston](https://github.com/ianopolous) |  |
-|   | 00:08.166 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jparera.java)| 21.0.1-open | [Juan Parera](https://github.com/jparera) |  |
 |   | 00:08.167 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ddimtirov.java)| 21.0.1-tem | [Dimitar Dimitrov](https://github.com/ddimtirov) |  |
 |   | 00:08.214 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_deemkeen.java)| 21.0.1-open | [deemkeen](https://github.com/deemkeen) |  |
 |   | 00:08.398 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_artpar.java)| 21.0.1-open | [Parth Mudgal](https://github.com/artpar) |  |
@@ -96,6 +96,7 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:10.092 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_phd3.java)| 21.0.1-graal | [Pratham](https://github.com/phd3) |  |
 |   | 00:10.127 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_artpar.java)| 21.0.1-open | [Parth Mudgal](https://github.com/artpar) |  |
 |   | 00:10.553 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_C5H12O5.java)| 21.0.1-graal | [Xylitol](https://github.com/C5H12O5) |  |
+|   | 00:11.577 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_netrunnereve.java)| 21.0.1-open | [Eve](https://github.com/netrunnereve) |  |
 |   | 00:10.473 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_raipc.java)| 21.0.1-open | [Anton Rybochkin](https://github.com/raipc) |  |
 |   | 00:11.119 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_lawrey.java)| 21.0.1-open | [lawrey](https://github.com/lawrey) |  |
 |   | 00:11.167 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_palmr.java)| 21.0.1-open | [Nick Palmer](https://github.com/palmr) |  |
@@ -112,7 +113,6 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:13.013 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_thanhtrinity.java)| 21.0.1-graal | [Thanh Duong](https://github.com/thanhtrinity) |  |
 |   | 00:13.071 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ianopolous.java)| 21.0.1-open | [Dr Ian Preston](https://github.com/ianopolous) |  |
 |   | 00:13.817 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_entangled90.java)| 21.0.1-open | [Carlo](https://github.com/entangled90) |  |
-|   | 00:14.225 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_netrunnereve.java)| 21.0.1-open | [Eve](https://github.com/netrunnereve) |  |
 |   | 00:14.502 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_eriklumme.java)| 21.0.1-graal | [eriklumme](https://github.com/eriklumme) |  |
 |   | 00:14.772 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_kevinmcmurtrie.java)| 21.0.1-open | [Kevin McMurtrie](https://github.com/kevinmcmurtrie) |  |
 |   | 00:14.867 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_berry120.java)| 21.0.1-open | [Michael Berry](https://github.com/berry120) |  |
@@ -139,7 +139,7 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:34.388 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_twobiers.java)| 21.0.1-tem | [Tobi](https://github.com/twobiers) |  |
 |   | 00:35.875 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_MahmoudFawzyKhalil.java)| 21.0.1-open | [MahmoudFawzyKhalil](https://github.com/MahmoudFawzyKhalil) |  |
 |   | 00:36.180 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_hchiorean.java)| 21.0.1-open | [Horia Chiorean](https://github.com/hchiorean) |  |
-|   | 00:36.212 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_kumarsaurav123.java)| 21.0.1-open | [kumarsaurav123](https://github.com/kumarsaurav123) |  |
+|   | 00:36.991 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_kumarsaurav123.java)| 21.0.1-open | [kumarsaurav123](https://github.com/kumarsaurav123) |  |
 |   | 00:38.340 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_AbstractKamen.java)| 21.0.1-open | [AbstractKamen](https://github.com/AbstractKamen) |  |
 |   | 00:41.982 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_criccomini.java)| 21.0.1-open | [Chris Riccomini](https://github.com/criccomini) |  |
 |   | 00:42.893 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_javamak.java)| 21.0.1-open | [javamak](https://github.com/javamak) |  |
@@ -151,6 +151,7 @@ These are the results from running all entries into the challenge on eight cores
 |   | 01:07.014 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_pedestrianlove.java)| 21.0.1-open | [pedestrianlove](https://github.com/pedestrianlove) |  |
 |   | 01:08.811 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_alesj.java)| 21.0.1-open | [Aleš Justin](https://github.com/alesj) |  |
 |   | 01:08.908 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_itaske.java)| 21.0.1-open | [itaske](https://github.com/itaske) |  |
+|   | 01:09.595 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_agoncal.java)| 21.0.1-tem | [Antonio Goncalves](https://github.com/agoncal) |  |
 |   | 01:09.882 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_rprabhu.java)| 21.0.1-open | [Prabhu R](https://github.com/rprabhu) |  |
 |   | 01:14.815 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_anandmattikopp.java)| 21.0.1-open | [twohardthings](https://github.com/anandmattikopp) |  |
 |   | 01:25.801 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ivanklaric.java)| 21.0.1-open | [ivanklaric](https://github.com/ivanklaric) |  |

From eaf87689f5c9f949f5e710ae1a18c4e9ee5af4a8 Mon Sep 17 00:00:00 2001
From: Elliot Barlas <elliotbarlas@gmail.com>
Date: Sat, 20 Jan 2024 04:56:27 -0800
Subject: [PATCH 073/268] Use Arena MemorySegments rather than ByteBuffers.
 (#505)

---
 calculate_average_ebarlas.sh                  |   2 +-
 .../onebrc/CalculateAverage_ebarlas.java      | 257 ++++++++++--------
 2 files changed, 138 insertions(+), 121 deletions(-)

diff --git a/calculate_average_ebarlas.sh b/calculate_average_ebarlas.sh
index 422867d82..2bd59d4ba 100755
--- a/calculate_average_ebarlas.sh
+++ b/calculate_average_ebarlas.sh
@@ -15,5 +15,5 @@
 #  limitations under the License.
 #
 
-JAVA_OPTS=""
+JAVA_OPTS="--enable-preview"
 java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_ebarlas
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_ebarlas.java b/src/main/java/dev/morling/onebrc/CalculateAverage_ebarlas.java
index 7c24afd76..c1ca6faac 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_ebarlas.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_ebarlas.java
@@ -18,9 +18,9 @@
 import sun.misc.Unsafe;
 
 import java.io.IOException;
-import java.nio.BufferUnderflowException;
-import java.nio.ByteBuffer;
-import java.nio.ByteOrder;
+import java.lang.foreign.Arena;
+import java.lang.foreign.MemorySegment;
+import java.lang.foreign.ValueLayout;
 import java.nio.channels.FileChannel;
 import java.nio.charset.StandardCharsets;
 import java.nio.file.Paths;
@@ -30,9 +30,13 @@
 
 public class CalculateAverage_ebarlas {
 
+    private static final Arena ARENA = Arena.global(); // global arena: segments obtained through it have unbounded lifetime and may be accessed from any thread
+
     private static final int MAX_KEY_SIZE = 100;
+    private static final int MAX_VAL_SIZE = 5; // -dd.d
+    private static final int MAX_LINE_SIZE = MAX_KEY_SIZE + MAX_VAL_SIZE + 2; // key, semicolon, val, newline
     private static final int HASH_FACTOR = 433;
-    private static final int HASH_TBL_SIZE = 16_383; // range of allowed hash values, inclusive
+    private static final int HASH_TBL_SIZE = 32_767; // range of allowed hash values, inclusive
 
     private static final Unsafe UNSAFE = makeUnsafe();
 
@@ -50,7 +54,7 @@ private static Unsafe makeUnsafe() {
     public static void main(String[] args) throws IOException, InterruptedException {
         var path = Paths.get("measurements.txt");
         var channel = FileChannel.open(path, StandardOpenOption.READ);
-        var numPartitions = (int) Math.max((channel.size() / Integer.MAX_VALUE) + 1, Runtime.getRuntime().availableProcessors());
+        var numPartitions = Runtime.getRuntime().availableProcessors();
         var partitionSize = channel.size() / numPartitions;
         var partitions = new Partition[numPartitions];
         var threads = new Thread[numPartitions];
@@ -63,8 +67,8 @@ public static void main(String[] args) throws IOException, InterruptedException
             var pSize = pEnd - pStart;
             Runnable r = () -> {
                 try {
-                    var buffer = channel.map(FileChannel.MapMode.READ_ONLY, pStart, pSize).order(ByteOrder.LITTLE_ENDIAN);
-                    partitions[pIdx] = processBuffer(buffer, pIdx == 0);
+                    var ms = channel.map(FileChannel.MapMode.READ_ONLY, pStart, pSize, ARENA);
+                    partitions[pIdx] = processSegment(ms, pIdx == 0, pIdx == numPartitions - 1);
                 }
                 catch (IOException e) {
                     throw new RuntimeException(e);
@@ -142,7 +146,7 @@ private static void foldFootersAndHeaders(List<Partition> partitions) { // fold
             var merged = mergeFooterAndHeader(pPrev.footer, pNext.header);
             if (merged != null && merged.length != 0) {
                 if (merged[merged.length - 1] == '\n') { // fold into prev partition
-                    doProcessBuffer(ByteBuffer.wrap(merged).order(ByteOrder.LITTLE_ENDIAN), true, pPrev.stats);
+                    doProcessSegment(ARENA.allocateArray(ValueLayout.JAVA_BYTE, merged), 0, pPrev.stats, true);
                 }
                 else { // no newline appeared in partition, carry forward
                     pNext.footer = merged;
@@ -164,93 +168,121 @@ private static byte[] mergeFooterAndHeader(byte[] footer, byte[] header) {
         return merged;
     }
 
-    private static Partition processBuffer(ByteBuffer buffer, boolean first) {
-        return doProcessBuffer(buffer, first, new Stats[HASH_TBL_SIZE + 1]);
-    }
-
-    private static Partition doProcessBuffer(ByteBuffer buffer, boolean first, Stats[] stats) {
-        var header = first ? null : readHeader(buffer);
-        var keyStart = reallyDoProcessBuffer(buffer, stats);
-        var footer = keyStart < buffer.limit() ? readFooter(buffer, keyStart) : null;
-        return new Partition(header, footer, stats);
+    private static Partition processSegment(MemorySegment ms, boolean first, boolean last) { // scan one mapped partition of the input
+        Stats[] table = new Stats[HASH_TBL_SIZE + 1]; // slot indices cover [0, HASH_TBL_SIZE] inclusive
+        ByteArrayOffset lead = first ? null : readHeader(ms); // the first partition has no header to set aside
+        long resume = doProcessSegment(ms, lead != null ? lead.offset : 0, table, last); // only the last segment is scanned as complete
+        byte[] trailer = resume < ms.byteSize() ? readFooter(ms, resume) : null; // bytes from the returned offset to the segment end, if any
+        return new Partition(lead != null ? lead.data : null, trailer, table);
+    }
 
-    private static int reallyDoProcessBuffer(ByteBuffer buffer, Stats[] stats) {
-        long keyBaseAddr = UNSAFE.allocateMemory(MAX_KEY_SIZE);
-        int keyStart = 0; // start of key in buffer used for footer calc
-        try { // abort with exception to allow optimistic line processing
-            while (true) { // one line per iteration
-                keyStart = buffer.position(); // preserve line start
-                int keyHash = 0; // key hash code
-                long keyAddr = keyBaseAddr; // address for next int
-                int keyArrLen = 0; // number of key 4-byte ints
-                int keyLastBytes; // occupancy in last byte (1, 2, 3, or 4)
-                int val; // temperature value
-                while (true) {
-                    int n = buffer.getInt();
-                    byte b0 = (byte) (n & 0xFF);
-                    byte b1 = (byte) ((n >> 8) & 0xFF);
-                    byte b2 = (byte) ((n >> 16) & 0xFF);
-                    byte b3 = (byte) ((n >> 24) & 0xFF);
-                    if (b0 == ';') { // ...;1.1
-                        val = getVal(buffer, b1, b2, b3, buffer.get());
-                        keyLastBytes = 4;
-                        break;
-                    }
-                    else if (b1 == ';') { // ...a;1.1
-                        val = getVal(buffer, b2, b3, buffer.get(), buffer.get());
-                        UNSAFE.putInt(keyAddr, b0);
-                        keyLastBytes = 1;
-                        keyArrLen++;
-                        keyHash = HASH_FACTOR * keyHash + b0;
-                        break;
-                    }
-                    else if (b2 == ';') { // ...ab;1.1
-                        val = getVal(buffer, b3, buffer.get(), buffer.get(), buffer.get());
-                        UNSAFE.putInt(keyAddr, n & 0x0000FFFF);
-                        keyLastBytes = 2;
-                        keyArrLen++;
-                        keyHash = HASH_FACTOR * (HASH_FACTOR * keyHash + b0) + b1;
-                        break;
-                    }
-                    else if (b3 == ';') { // ...abc;1.1
-                        UNSAFE.putInt(keyAddr, n & 0x00FFFFFF);
-                        keyLastBytes = 3;
-                        keyArrLen++;
-                        keyHash = HASH_FACTOR * (HASH_FACTOR * (HASH_FACTOR * keyHash + b0) + b1) + b2;
-                        n = buffer.getInt();
-                        b0 = (byte) (n & 0xFF);
-                        b1 = (byte) ((n >> 8) & 0xFF);
-                        b2 = (byte) ((n >> 16) & 0xFF);
-                        b3 = (byte) ((n >> 24) & 0xFF);
-                        val = getVal(buffer, b0, b1, b2, b3);
-                        break;
-                    }
-                    else {
-                        UNSAFE.putInt(keyAddr, n);
-                        keyArrLen++;
-                        keyAddr += 4;
-                        keyHash = HASH_FACTOR * (HASH_FACTOR * (HASH_FACTOR * (HASH_FACTOR * keyHash + b0) + b1) + b2) + b3;
-                    }
+    private static long doProcessSegment(MemorySegment ms, long offset, Stats[] stats, boolean complete) { // aggregates lines from ms starting at offset; returns the offset of the first byte left unconsumed
+        long cursor = ms.address() + offset;
+        long keyBaseAddr = UNSAFE.allocateMemory(MAX_KEY_SIZE); // reusable target for current key data; NOTE(review): not freed in this method
+        // the footer must begin at the first unconsumed byte (cursor on exit); starting it at the last parsed line would aggregate that line twice
+        long limit = ms.address() + (complete ? ms.byteSize() : ms.byteSize() - MAX_LINE_SIZE); // stop short of longest line, sweep up at the end
+        while (cursor < limit) { // one line per iteration
+            // cursor sits on the first byte of a line (start of the station key)
+            int keyHash = 0; // key hash code
+            long keyAddr = keyBaseAddr; // address for next int
+            int keyArrLen = 0; // number of key 4-byte ints
+            int keyLastBytes; // bytes occupied in the last key int (1, 2, 3, or 4)
+            byte b0, b1, b2, b3;
+            while (true) {
+                int n = UNSAFE.getInt(cursor);
+                cursor += 4;
+                b0 = (byte) (n & 0xFF); // low-order byte is treated as the first byte in memory (little-endian layout)
+                b1 = (byte) ((n >> 8) & 0xFF);
+                b2 = (byte) ((n >> 16) & 0xFF);
+                b3 = (byte) ((n >> 24) & 0xFF);
+                if (b0 == ';') { // ...;1.1
+                    keyLastBytes = 4;
+                    b0 = b1;
+                    b1 = b2;
+                    b2 = b3;
+                    b3 = (byte) (UNSAFE.getByte(cursor++) & 0xFF);
+                    break;
+                }
+                else if (b1 == ';') { // ...a;1.1
+                    int k = n & 0xFF;
+                    UNSAFE.putInt(keyAddr, k);
+                    keyLastBytes = 1;
+                    keyArrLen++;
+                    keyHash = HASH_FACTOR * keyHash + b0;
+                    b0 = b2;
+                    b1 = b3;
+                    b2 = (byte) (UNSAFE.getByte(cursor++) & 0xFF);
+                    b3 = (byte) (UNSAFE.getByte(cursor++) & 0xFF);
+                    break;
                 }
-                var idx = keyHash & HASH_TBL_SIZE;
-                var st = stats[idx];
-                if (st == null) { // nothing in table, eagerly claim spot
-                    st = stats[idx] = newStats(keyBaseAddr, keyArrLen, keyLastBytes, keyHash);
+                else if (b2 == ';') { // ...ab;1.1
+                    int k = n & 0xFFFF;
+                    UNSAFE.putInt(keyAddr, k);
+                    keyLastBytes = 2;
+                    keyArrLen++;
+                    keyHash = HASH_FACTOR * (HASH_FACTOR * keyHash + b0) + b1;
+                    b0 = b3;
+                    b1 = (byte) (UNSAFE.getByte(cursor++) & 0xFF);
+                    b2 = (byte) (UNSAFE.getByte(cursor++) & 0xFF);
+                    b3 = (byte) (UNSAFE.getByte(cursor++) & 0xFF);
+                    break;
                 }
-                else if (!equals(st.keyAddr, st.keyLen, keyBaseAddr, keyArrLen)) {
-                    st = findInTable(stats, keyHash, keyBaseAddr, keyArrLen, keyLastBytes);
+                else if (b3 == ';') { // ...abc;1.1
+                    int k = n & 0xFFFFFF;
+                    UNSAFE.putInt(keyAddr, k);
+                    keyLastBytes = 3;
+                    keyArrLen++;
+                    keyHash = HASH_FACTOR * (HASH_FACTOR * (HASH_FACTOR * keyHash + b0) + b1) + b2;
+                    n = UNSAFE.getInt(cursor); // the first four value bytes follow the semicolon directly
+                    cursor += 4;
+                    b0 = (byte) (n & 0xFF);
+                    b1 = (byte) ((n >> 8) & 0xFF);
+                    b2 = (byte) ((n >> 16) & 0xFF);
+                    b3 = (byte) ((n >> 24) & 0xFF);
+                    break;
+                }
+                else { // no semicolon in these four bytes: all of them belong to the key
+                    UNSAFE.putInt(keyAddr, n);
+                    keyArrLen++;
+                    keyAddr += 4;
+                    keyHash = HASH_FACTOR * (HASH_FACTOR * (HASH_FACTOR * (HASH_FACTOR * keyHash + b0) + b1) + b2) + b3;
                 }
-                st.min = Math.min(st.min, val);
-                st.max = Math.max(st.max, val);
-                st.sum += val;
-                st.count++;
             }
+            var idx = keyHash & HASH_TBL_SIZE;
+            var st = stats[idx];
+            if (st == null) { // nothing in table, eagerly claim spot
+                st = stats[idx] = newStats(keyBaseAddr, keyArrLen, keyLastBytes, keyHash);
+            }
+            else if (!equals(st.keyAddr, st.keyLen, keyBaseAddr, keyArrLen)) {
+                st = findInTable(stats, keyHash, keyBaseAddr, keyArrLen, keyLastBytes);
+            }
+            int val; // temperature in tenths; b0..b3 hold the first four bytes after the semicolon
+            if (b0 == '-') {
+                if (b2 != '.') { // 6 bytes: -dd.dn
+                    var b = UNSAFE.getByte(cursor);
+                    cursor += 2; // adv beyond digit and newline
+                    val = -(((b1 - '0') * 10 + (b2 - '0')) * 10 + (b - '0'));
+                }
+                else { // 5 bytes: -d.dn
+                    cursor++; // newline
+                    val = -((b1 - '0') * 10 + (b3 - '0'));
+                }
+            }
+            else {
+                if (b1 != '.') { // 5 bytes: dd.dn
+                    cursor++; // newline
+                    val = ((b0 - '0') * 10 + (b1 - '0')) * 10 + (b3 - '0');
+                }
+                else { // 4 bytes: d.dn (the newline was already consumed as b3)
+                    val = (b0 - '0') * 10 + (b2 - '0');
+                }
+            }
+            st.min = Math.min(st.min, val);
+            st.max = Math.max(st.max, val);
+            st.sum += val;
+            st.count++;
         }
-        catch (BufferUnderflowException ignore) {
-
-        }
-        return keyStart;
+        return cursor - ms.address(); // first byte no iteration consumed, so a footer built from here never repeats an aggregated line
     }
 
     private static boolean equals(long key1, int len1, long key2, int len2) {
@@ -261,7 +293,7 @@ private static boolean equals(long key1, int len1, long key2, int len2) {
             return UNSAFE.getLong(key1) == UNSAFE.getLong(key2);
         }
         if (len1 == 3) {
-            return UNSAFE.getInt(key1) == UNSAFE.getInt(key2) && UNSAFE.getInt(key1 + 4) == UNSAFE.getInt(key2 + 4);
+            return UNSAFE.getLong(key1) == UNSAFE.getLong(key2) && UNSAFE.getInt(key1 + 8) == UNSAFE.getInt(key2 + 8);
         }
         if (len1 == 1) {
             return UNSAFE.getInt(key1) == UNSAFE.getInt(key2);
@@ -278,29 +310,6 @@ private static boolean equals(long key1, int len1, long key2, int len2) {
         return true;
     }
 
-    private static int getVal(ByteBuffer buffer, byte b0, byte b1, byte b2, byte b3) {
-        if (b0 == '-') {
-            if (b2 != '.') { // 6 bytes: -dd.dn
-                var b = buffer.get();
-                buffer.get(); // newline
-                return -(((b1 - '0') * 10 + (b2 - '0')) * 10 + (b - '0'));
-            }
-            else { // 5 bytes: -d.dn
-                buffer.get(); // newline
-                return -((b1 - '0') * 10 + (b3 - '0'));
-            }
-        }
-        else {
-            if (b1 != '.') { // 5 bytes: dd.dn
-                buffer.get(); // newline
-                return ((b0 - '0') * 10 + (b1 - '0')) * 10 + (b3 - '0');
-            }
-            else { // 4 bytes: d.dn
-                return (b0 - '0') * 10 + (b2 - '0');
-            }
-        }
-    }
-
     private static Stats findInTable(Stats[] stats, int hash, long keyAddr, int keyLen, int keyLastBytes) { // open-addressing scan
         var idx = hash & HASH_TBL_SIZE;
         var st = stats[idx];
@@ -321,18 +330,26 @@ private static Stats newStats(long keyAddr, int keyLen, int keyLastBytes, int ha
         return new Stats(k, keyLen, keyLastBytes, hash);
     }
 
-    private static byte[] readFooter(ByteBuffer buffer, int lineStart) { // read from line start to current pos (end-of-input)
-        var footer = new byte[buffer.limit() - lineStart];
-        buffer.get(lineStart, footer, 0, footer.length);
+    private static byte[] readFooter(MemorySegment ms, long offset) { // copy the bytes from offset through the end of the segment
+        var footer = new byte[(int) (ms.byteSize() - offset)]; // length is whatever remains after offset
+        for (int i = 0; i < footer.length; i++) { // byte-by-byte copy out of the segment
+            footer[i] = ms.get(ValueLayout.JAVA_BYTE, offset + i);
+        }
         return footer;
     }
 
-    private static byte[] readHeader(ByteBuffer buffer) { // read up to and including first newline (or end-of-input)
-        while (buffer.hasRemaining() && buffer.get() != '\n')
+    private static ByteArrayOffset readHeader(MemorySegment ms) { // read up to and including first newline (or end-of-input)
+        long offset = 0; // after the scan: number of header bytes, i.e. offset of the first byte past the header
+        while (offset < ms.byteSize() && ms.get(ValueLayout.JAVA_BYTE, offset++) != '\n') // post-increment keeps the newline inside the header
             ;
-        var header = new byte[buffer.position()];
-        buffer.get(0, header, 0, header.length);
-        return header;
+        var header = new byte[(int) offset];
+        for (int i = 0; i < offset; i++) { // copy the header bytes out of the segment
+            header[i] = ms.get(ValueLayout.JAVA_BYTE, i);
+        }
+        return new ByteArrayOffset(header, offset); // pairs the copied bytes with the offset just past them
+    }
+
+    record ByteArrayOffset(byte[] data, long offset) { // a byte array plus an offset into the segment it was read from
     }
 
     private static class Partition {

From 51f8ecfa434ea7a0fe9203b24b2f2e00f4abee14 Mon Sep 17 00:00:00 2001
From: Gunnar Morling <gunnar.morling@googlemail.com>
Date: Sat, 20 Jan 2024 14:01:46 +0100
Subject: [PATCH 074/268] Leaderboard update

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index a9dd1ce15..7589efbad 100644
--- a/README.md
+++ b/README.md
@@ -56,7 +56,7 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:03.966 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jincongho.java)| 21.0.1-open | [Jin Cong Ho](https://github.com/jincongho) |  |
 |   | 00:04.066 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_JesseVanRooy.java)| 21.0.1-open | [JesseVanRooy](https://github.com/JesseVanRooy) |  |
 |   | 00:04.154 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_iziamos.java)| 21.0.1-open | [John Ziamos](https://github.com/iziamos) |  |
-|   | 00:04.726 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ebarlas.java)| 21.0.1-graal | [Elliot Barlas](https://github.com/ebarlas) |  | 
+|   | 00:04.365 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ebarlas.java)| 21.0.1-graal | [Elliot Barlas](https://github.com/ebarlas) |  |
 |   | 00:04.741 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_cliffclick.java)| 21.0.1-open | [Cliff Click](https://github.com/cliffclick) |  |
 |   | 00:04.823 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_JamalMulla.java)| 21.0.1-graal | [Jamal Mulla](https://github.com/JamalMulla) |  |
 |   | 00:04.920 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_vemana.java)| 21.0.1-graal | [Subrahmanyam](https://github.com/vemana) |  |
@@ -75,10 +75,10 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:06.654 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jbachorik.java)| 21.0.1-graal | [Jaroslav Bachorik](https://github.com/jbachorik) |  |
 |   | 00:06.576 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_as-com.java)| 21.0.1-open | [Andrew Sun](https://github.com/as-com) |  |
 |   | 00:06.715 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_algirdasrascius.java)| 21.0.1-open | [Algirdas Raščius](https://github.com/algirdasrascius) |  |
+|   | 00:07.202 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ianopolousfast.java)| 21.0.1-open | [Dr Ian Preston](https://github.com/ianopolous) |  |
 |   | 00:07.730 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jotschi.java)| 21.0.1-open | [Johannes Schüth](https://github.com/jotschi) |  |
 |   | 00:07.925 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ricardopieper.java)| 21.0.1-graal | [Ricardo Pieper](https://github.com/ricardopieper) |  |
 |   | 00:07.913 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_parkertimmins.java)| 21.0.1-open | [parkertimmins](https://github.com/parkertimmins) |  |
-|   | 00:08.045 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ianopolousfast.java)| 21.0.1-open | [Dr Ian Preston](https://github.com/ianopolous) |  |
 |   | 00:08.167 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ddimtirov.java)| 21.0.1-tem | [Dimitar Dimitrov](https://github.com/ddimtirov) |  |
 |   | 00:08.214 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_deemkeen.java)| 21.0.1-open | [deemkeen](https://github.com/deemkeen) |  |
 |   | 00:08.398 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_artpar.java)| 21.0.1-open | [Parth Mudgal](https://github.com/artpar) |  |

From 8353a1cb3d8b834aa2faee92a124737fc6326cb1 Mon Sep 17 00:00:00 2001
From: Xylitol <jam.xylitol@gmail.com>
Date: Sat, 20 Jan 2024 21:04:19 +0800
Subject: [PATCH 075/268] Processing byte array backwards (#504)

---
 .../onebrc/CalculateAverage_C5H12O5.java      | 460 +++++++++++++-----
 1 file changed, 327 insertions(+), 133 deletions(-)

diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_C5H12O5.java b/src/main/java/dev/morling/onebrc/CalculateAverage_C5H12O5.java
index a7baf9baf..4c0351a28 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_C5H12O5.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_C5H12O5.java
@@ -15,136 +15,386 @@
  */
 package dev.morling.onebrc;
 
+import sun.misc.Unsafe;
+
 import java.io.IOException;
+import java.io.RandomAccessFile;
+import java.lang.reflect.Field;
 import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
 import java.nio.channels.AsynchronousFileChannel;
 import java.nio.channels.CompletionHandler;
 import java.nio.charset.StandardCharsets;
-import java.nio.file.Paths;
+import java.nio.file.Files;
+import java.nio.file.Path;
 import java.nio.file.StandardOpenOption;
+import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.HashMap;
+import java.util.List;
 import java.util.Map;
 import java.util.Set;
 import java.util.TreeMap;
-import java.util.concurrent.BlockingQueue;
 import java.util.concurrent.Callable;
-import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Executors;
 import java.util.concurrent.FutureTask;
-import java.util.concurrent.LinkedBlockingQueue;
+import java.util.concurrent.LinkedTransferQueue;
+import java.util.concurrent.TransferQueue;
 
 /**
- * Calculates the average using AIO and multiple threads.
+ * Results on Mac mini (Apple M2 with 8-core CPU / 8GB unified memory):
+ * <pre>
+ *   using AIO and multiple threads:
+ *     120.15s user 4.33s system 710% cpu 17.522 total
+ *
+ *   reduce the number of memory copies:
+ *      45.87s user 2.82s system 530% cpu  9.185 total
+ *
+ *   processing byte array backwards and using bitwise operation to find specific byte (inspired by thomaswue):
+ *      25.38s user 3.44s system 342% cpu  8.406 total
+ * </pre>
  *
  * @author Xylitol
  */
+@SuppressWarnings("unchecked")
 public class CalculateAverage_C5H12O5 {
-    private static final int BUFFER_CAPACITY = 1024 * 1024 * 10;
-    private static final int MAP_CAPACITY = 10000;
-    private static final int PROCESSORS = Runtime.getRuntime().availableProcessors();
-    private static final BlockingQueue<byte[]> BYTES_QUEUE = new LinkedBlockingQueue<>(PROCESSORS);
-    private static long readPosition;
+    private static final int AVAILABLE_PROCESSOR_NUM = Runtime.getRuntime().availableProcessors(); // upper bound on the number of file fragments
+    private static final int TRANSFER_QUEUE_CAPACITY = 1024 / 16 / AVAILABLE_PROCESSOR_NUM; // queued chunks per Calculator: 1024MB / 16MB per chunk / processors (1GB memory max overall)
+    private static final int BYTE_BUFFER_CAPACITY = 1024 * 1024 * 16; // 16MB per asynchronous read
+    private static final int EXPECTED_MAPPINGS_NUM = 10000; // sizing hint passed to HashMap.newHashMap for the result maps
+
+    /**
+     * Fragment the file into chunks; returns the newline-aligned (exclusive) end position of each chunk.
+     */
+    private static long[] fragment(Path path) throws IOException {
+        long size = Files.size(path);
+        long chunk = size / AVAILABLE_PROCESSOR_NUM;
+        List<Long> positions = new ArrayList<>();
+        try (RandomAccessFile file = new RandomAccessFile(path.toFile(), "r")) {
+            long position = chunk;
+            for (int i = 0; i < AVAILABLE_PROCESSOR_NUM - 1; i++) {
+                if (position >= size) {
+                    break;
+                }
+                file.seek(position);
+                // move the position to the next newline byte; read() returns -1 at EOF, so stop there instead of looping forever
+                for (int b = file.read(); b != '\n' && b != -1; b = file.read()) {
+                    position++;
+                }
+                positions.add(Math.min(++position, size)); // clamp: without a newline before EOF the boundary is the file end
+                position += chunk;
+            }
+        }
+        if (positions.isEmpty() || positions.getLast() < size) {
+            positions.add(size);
+        }
+        return positions.stream().mapToLong(Long::longValue).toArray();
+    }
 
     public static void main(String[] args) throws Exception {
-        System.out.println(calc("./measurements.txt"));
+        // fragment the input file into newline-aligned ranges
+        Path path = Path.of("./measurements.txt");
+        long[] positions = fragment(path);
+
+        // start one calculation task (on its own thread) per range
+        FutureTask<Map<Station, MeasurementData>>[] tasks = new FutureTask[positions.length];
+        for (int i = 0; i < positions.length; i++) {
+            tasks[i] = new FutureTask<>(new Calculator(path, (i == 0 ? 0 : positions[i - 1]), positions[i])); // range is [previous end, this end)
+            new Thread(tasks[i]).start();
+        }
+
+        // wait for the results and merge the per-task maps
+        Map<Station, MeasurementData> result = HashMap.newHashMap(EXPECTED_MAPPINGS_NUM);
+        for (FutureTask<Map<Station, MeasurementData>> task : tasks) {
+            task.get().forEach((k, v) -> result.merge(k, v, MeasurementData::merge));
+        }
+
+        // sort by station name (decoded as UTF-8) and print the results
+        TreeMap<String, MeasurementData> sorted = new TreeMap<>();
+        for (Map.Entry<Station, MeasurementData> entry : result.entrySet()) {
+            sorted.put(new String(entry.getKey().bytes, StandardCharsets.UTF_8), entry.getValue());
+        }
+        System.out.println(sorted);
     }
 
     /**
-     * Calculate the average.
+     * The calculation task.
      */
-    public static String calc(String path) throws IOException, ExecutionException, InterruptedException {
-        readPosition = 0;
-        Map<String, MeasurementData> result = HashMap.newHashMap(MAP_CAPACITY);
-        // read and offer to queue
-        try (AsynchronousFileChannel channel = AsynchronousFileChannel.open(
-                Paths.get(path), Set.of(StandardOpenOption.READ), Executors.newVirtualThreadPerTaskExecutor())) {
-            ByteBuffer buffer = ByteBuffer.allocateDirect(BUFFER_CAPACITY);
-            channel.read(buffer, readPosition, buffer, new CompletionHandler<>() {
+    private static class Calculator implements Callable<Map<Station, MeasurementData>> { // reads [position, limit) of the file asynchronously and aggregates it in call()
+        private final TransferQueue<byte[]> transfer = new LinkedTransferQueue<>(); // chunks handed from the read callback to process(); an empty array is the stop signal
+        private final AsynchronousFileChannel asyncChannel;
+        private final long limit; // exclusive end offset of this task's range
+        private long position; // file offset of the next read
+
+        public Calculator(Path file, long position, long limit) throws IOException {
+            ExecutorService executor = Executors.newVirtualThreadPerTaskExecutor(); // NOTE(review): neither this executor nor the channel is closed within this class
+            this.asyncChannel = AsynchronousFileChannel.open(file, Set.of(StandardOpenOption.READ), executor);
+            this.position = position;
+            this.limit = limit;
+        }
+
+        @Override
+        public Map<Station, MeasurementData> call() throws InterruptedException {
+            ByteBuffer buffer = ByteBuffer.allocateDirect(BYTE_BUFFER_CAPACITY); // one buffer, reused for every read of this task
+            asyncChannel.read(buffer, position, buffer, new CompletionHandler<>() {
                 @Override
-                public void completed(Integer bytesRead, ByteBuffer buffer) {
-                    try {
-                        if (bytesRead > 0) {
-                            for (int i = buffer.position() - 1; i >= 0; i--) {
-                                if (buffer.get(i) == '\n') {
-                                    buffer.limit(i + 1);
-                                    break;
-                                }
-                            }
-                            buffer.flip();
-                            byte[] bytes = new byte[buffer.remaining()];
-                            buffer.get(bytes);
-                            readPosition += buffer.limit();
-                            BYTES_QUEUE.put(bytes);
-                            buffer.clear();
-                            channel.read(buffer, readPosition, buffer, this);
-                        }
-                        else {
-                            for (int i = 0; i < PROCESSORS; i++) {
-                                BYTES_QUEUE.put(new byte[0]);
+                public void completed(Integer readSize, ByteBuffer buffer) {
+                    if (position + readSize >= limit) { // this read reaches the end of the assigned range
+                        buffer.limit(readSize - (int) (position + readSize - limit)); // drop the bytes that lie beyond limit
+                    }
+                    else {
+                        for (int i = buffer.position() - 1; i >= 0; i--) {
+                            if (buffer.get(i) == '\n') {
+                                // truncate the buffer to the last newline byte so only whole lines are handed over
+                                buffer.limit(i + 1);
+                                break;
                             }
                         }
                     }
-                    catch (InterruptedException e) {
-                        Thread.currentThread().interrupt();
+                    buffer.flip();
+                    byte[] bytes = new byte[buffer.limit() + 1];
+                    // add a newline byte at the beginning: it terminates the backward scan of the first line in process()
+                    bytes[0] = '\n';
+                    buffer.get(bytes, 1, buffer.limit());
+                    transfer(bytes);
+                    if ((position += buffer.limit()) < limit) { // advance by the bytes kept; anything truncated is read again
+                        buffer.clear();
+                        asyncChannel.read(buffer, position, buffer, this);
+                    }
+                    else {
+                        // stop signal: an empty array ends the loop in process()
+                        transfer(new byte[0]);
                     }
                 }
 
                 @Override
                 public void failed(Throwable exc, ByteBuffer buffer) {
-                    // ignore
+                    transfer(new byte[0]); // a failed read just sends the stop signal; NOTE(review): exc is dropped, so the result may silently be partial
                 }
             });
+            return process(); // consume the queued chunks on the thread that invoked call()
+        }
 
-            @SuppressWarnings("unchecked")
-            FutureTask<Map<MeasurementName, MeasurementData>>[] tasks = new FutureTask[PROCESSORS];
-            for (int i = 0; i < PROCESSORS; i++) {
-                tasks[i] = new FutureTask<>(new Task());
-                new Thread(tasks[i]).start();
+        /**
+         * Hand the bytes to the queue: wait for a consumer via transfer() once the queue holds TRANSFER_QUEUE_CAPACITY entries, otherwise just enqueue.
+         */
+        private void transfer(byte[] bytes) {
+            try {
+                if (transfer.size() >= TRANSFER_QUEUE_CAPACITY) {
+                    transfer.transfer(bytes);
+                }
+                else {
+                    transfer.put(bytes);
+                }
             }
-            for (FutureTask<Map<MeasurementName, MeasurementData>> task : tasks) {
-                task.get().forEach((k, v) -> result.merge(k.toString(), v, MeasurementData::merge));
+            catch (InterruptedException e) {
+                throw new RuntimeException(e); // NOTE(review): the interrupt flag is not restored before rethrowing
             }
         }
-        return new TreeMap<>(result).toString();
+
+        /**
+         * Take chunks from the queue until the empty stop signal arrives and aggregate every line of each chunk.
+         */
+        private Map<Station, MeasurementData> process() throws InterruptedException {
+            Map<Station, MeasurementData> result = HashMap.newHashMap(EXPECTED_MAPPINGS_NUM);
+            for (byte[] bytes = transfer.take(); bytes.length > 0; bytes = transfer.take()) {
+                Station station = new Station(bytes);
+                // read the bytes backwards; assumes the chunk ends with '\n', so length - 2 is the last digit of the last line
+                for (int position = bytes.length - 2; position >= 1; position--) {
+
+                    // calculate the temperature value in tenths: fraction digit + units digit * 10 (the '.' between them is skipped)
+                    int temperature = bytes[position] - '0' + (bytes[position -= 2] - '0') * 10;
+                    byte unknownByte = bytes[--position]; // either ';', '-', or the tens digit
+                    int semicolon = switch (unknownByte) {
+                        case ';' -> position;
+                        case '-' -> {
+                            temperature = -temperature;
+                            yield --position;
+                        }
+                        default -> {
+                            temperature += (unknownByte - '0') * 100;
+                            if (bytes[--position] == '-') {
+                                temperature = -temperature;
+                                --position;
+                            }
+                            yield position;
+                        }
+                    };
+
+                    // calculate the station name hash, walking left in 8-byte words until the preceding newline
+                    int hash = 1;
+                    while (true) {
+                        long temp = LineFinder.previousLong(bytes, position);
+                        int distance = LineFinder.NATIVE.fromRight(temp);
+                        if (distance == 0) {
+                            // current byte is '\n'
+                            break;
+                        }
+                        position -= distance;
+                        if (distance == 8) {
+                            // can't find '\n' in previous 8 bytes
+                            hash = 31 * hash + (int) (temp ^ (temp >>> 32));
+                            continue;
+                        }
+                        // clear the redundant bytes
+                        temp = LineFinder.NATIVE.clearLeft(temp, distance);
+                        hash = 31 * hash + (int) (temp ^ (temp >>> 32));
+                    }
+
+                    // merge data to the result map; position + 1 is the first name byte, semicolon the index just past the name
+                    MeasurementData data = result.get(station.slice(hash, position + 1, semicolon));
+                    if (data == null) {
+                        result.put(station.copy(), new MeasurementData(temperature));
+                    } else {
+                        data.merge(temperature);
+                    }
+                }
+            }
+            return result;
+        }
+    }
 
     /**
-     * The measurement name.
+     * To find the nearest newline byte position in a long.
      */
-    private record MeasurementName(byte[] bytes, int length) {
+    private interface LineFinder {
+        // choose the implementation according to the native byte order
+        LineFinder NATIVE = ByteOrder.nativeOrder() == ByteOrder.LITTLE_ENDIAN ? LELineFinder.INST : BELineFinder.INST;
 
-        @Override
-        public boolean equals(Object name) {
-            MeasurementName other = (MeasurementName) name;
-            if (other.length != length) {
-                return false;
+        Unsafe UNSAFE = initUnsafe();
+        int BYTE_ARRAY_BASE_OFFSET = UNSAFE.arrayBaseOffset(byte[].class);
+        int LONG_BYTES = Long.SIZE / Byte.SIZE;
+
+        static Unsafe initUnsafe() {
+            try {
+                Field theUnsafe = Unsafe.class.getDeclaredField("theUnsafe");
+                theUnsafe.setAccessible(true);
+                return (Unsafe) theUnsafe.get(Unsafe.class);
+            }
+            catch (NoSuchFieldException | IllegalAccessException e) {
+                throw new RuntimeException(e);
             }
-            return Arrays.compare(bytes, 0, length, other.bytes, 0, length) == 0;
         }
 
-        @Override
-        public int hashCode() {
-            int result = 1;
-            for (int i = 0; i < length; i++) {
-                result = 31 * result + bytes[i];
+        static long previousLong(byte[] bytes, long offset) {
+            return UNSAFE.getLong(bytes, BYTE_ARRAY_BASE_OFFSET + offset + 1 - LONG_BYTES);
+        }
+
+        /**
+         * Sets the high bit (0x80) of each byte lane of {@code longBytes} that equals newline (0x0A). NOTE(review): the borrow of the subtraction can also flag a 0x0B lane directly above a newline lane - confirm callers tolerate that.
+         */
+        static long markHighestBit(long longBytes) {
+            long temp = longBytes ^ 0x0A0A0A0A0A0A0A0AL; // newline lanes become 0x00
+            return (temp - 0x0101010101010101L) & ~temp & 0x8080808080808080L; // flag the lanes that were zero
+        }
+
+        /**
+         * Find the nearest newline byte position from right to left.
+         */
+        int fromRight(long longBytes);
+
+        /**
+         * Clear the left bytes out of the range.
+         */
+        long clearLeft(long longBytes, int keepNum);
+
+        enum LELineFinder implements LineFinder {
+            INST;
+
+            private static final long[] MASKS = new long[8];
+
+            static {
+                for (int i = 1; i <= 7; i++) {
+                    MASKS[i] = 0xFFFFFFFFFFFFFFFFL << ((8 - i) << 3);
+                }
             }
-            return result;
+
+            @Override
+            public int fromRight(long longBytes) {
+                return Long.numberOfLeadingZeros(markHighestBit(longBytes)) >>> 3;
+            }
+
+            @Override
+            public long clearLeft(long longBytes, int keepNum) {
+                return longBytes & MASKS[keepNum];
+            }
+        }
+
+        enum BELineFinder implements LineFinder {
+            INST;
+
+            private static final long[] MASKS = new long[8];
+
+            static {
+                for (int i = 1; i <= 7; i++) {
+                    MASKS[i] = 0xFFFFFFFFFFFFFFFFL >>> ((8 - i) << 3);
+                }
+            }
+
+            @Override
+            public int fromRight(long longBytes) {
+                return Long.numberOfTrailingZeros(markHighestBit(longBytes)) >>> 3;
+            }
+
+            @Override
+            public long clearLeft(long longBytes, int keepNum) {
+                return longBytes & MASKS[keepNum];
+            }
+        }
+    }
+
+    /**
+     * The station name wrapper ( bytes[from, to) ).
+     */
+    private static class Station {
+        private final byte[] bytes;
+        private int from;
+        private int to;
+        private int hash;
+
+        public Station(byte[] bytes) {
+            this(bytes, 0, 0, 0);
+        }
+
+        public Station(byte[] bytes, int hash, int from, int to) {
+            this.bytes = bytes;
+            this.slice(hash, from, to);
+        }
+
+        public Station slice(int hash, int from, int to) {
+            this.hash = hash;
+            this.from = from;
+            this.to = to;
+            return this;
+        }
+
+        public Station copy() {
+            int length = to - from;
+            byte[] newBytes = new byte[length];
+            System.arraycopy(bytes, from, newBytes, 0, length);
+            return new Station(newBytes, hash, 0, length);
         }
 
         @Override
-        public String toString() {
-            return new String(bytes, 0, length, StandardCharsets.UTF_8);
+        public boolean equals(Object station) {
+            Station other = (Station) station;
+            return Arrays.equals(bytes, from, to, other.bytes, other.from, other.to);
+        }
+
+        @Override
+        public int hashCode() {
+            return hash;
         }
     }
 
     /**
-     * The measurement data.
+     * The measurement data wrapper ( temperature * 10 ).
      */
     private static class MeasurementData {
         private int min;
         private int max;
-        private int sum;
+        private long sum;
         private int count;
 
         public MeasurementData(int value) {
@@ -154,11 +404,15 @@ public MeasurementData(int value) {
             this.count = 1;
         }
 
-        public MeasurementData merge(MeasurementData data) {
-            return merge(data.min, data.max, data.sum, data.count);
+        public MeasurementData merge(int value) {
+            return merge(value, value, value, 1);
+        }
+
+        public MeasurementData merge(MeasurementData other) {
+            return merge(other.min, other.max, other.sum, other.count);
         }
 
-        public MeasurementData merge(int min, int max, int sum, int count) {
+        public MeasurementData merge(int min, int max, long sum, int count) {
             this.min = Math.min(this.min, min);
             this.max = Math.max(this.max, max);
             this.sum += sum;
@@ -168,67 +422,7 @@ public MeasurementData merge(int min, int max, int sum, int count) {
 
         @Override
         public String toString() {
-            return (min / 10.0) + "/" + (Math.round((double) sum / count) / 10.0) + "/" + (max / 10.0);
-        }
-    }
-
-    /**
-     * The task to calculate.
-     */
-    private static class Task implements Callable<Map<MeasurementName, MeasurementData>> {
-
-        @Override
-        public Map<MeasurementName, MeasurementData> call() throws InterruptedException {
-            // poll from queue and calculate
-            Map<MeasurementName, MeasurementData> result = HashMap.newHashMap(MAP_CAPACITY);
-            for (byte[] bytes = BYTES_QUEUE.take(); true; bytes = BYTES_QUEUE.take()) {
-                if (bytes.length == 0) {
-                    break;
-                }
-                int start = 0;
-                for (int end = 0; end < bytes.length; end++) {
-                    if (bytes[end] == '\n') {
-                        byte[] newBytes = new byte[end - start];
-                        System.arraycopy(bytes, start, newBytes, 0, newBytes.length);
-                        int semicolon = newBytes.length - 4;
-                        for (; semicolon >= 0; semicolon--) {
-                            if (newBytes[semicolon] == ';') {
-                                break;
-                            }
-                        }
-                        MeasurementName station = new MeasurementName(newBytes, semicolon);
-                        int value = toInt(newBytes, semicolon + 1);
-                        MeasurementData data = result.get(station);
-                        if (data != null) {
-                            data.merge(value, value, value, 1);
-                        }
-                        else {
-                            result.put(station, new MeasurementData(value));
-                        }
-                        start = end + 1;
-                    }
-                }
-            }
-            return result;
-        }
-
-        /**
-         * Convert the byte array to int.
-         */
-        private static int toInt(byte[] bytes, int start) {
-            boolean negative = false;
-            int result = 0;
-            for (int i = start; i < bytes.length; i++) {
-                byte b = bytes[i];
-                if (b == '-') {
-                    negative = true;
-                    continue;
-                }
-                if (b != '.') {
-                    result = result * 10 + (b - '0');
-                }
-            }
-            return negative ? -result : result;
+            return STR."\{min / 10.0}/\{Math.round((double) sum / count) / 10.0}/\{max / 10.0}";
         }
     }
 }

From e1a0b7925912f02aae27a170068ba78bb144cad7 Mon Sep 17 00:00:00 2001
From: Yonatan Graber <yonatan.graber@gmail.com>
Date: Sat, 20 Jan 2024 06:02:55 -0800
Subject: [PATCH 076/268] yonatang solution: a jdk8 friendly, no unsafe code,
 epsilon-gc friendly solution (#499)

* 1bc challenge, but one that will run using jdk 8 without unsafe and still do reasonably well.

* Better hashtable

* the fastest GC is no GC

* cleanups

* increased hash size

* removed Playground.java

* collision-handling allocation free hashmap

* formatting
---
 calculate_average_yonatang.sh                 |  20 ++
 github_users.txt                              |   1 +
 prepare_yonatang.sh                           |  20 ++
 .../onebrc/CalculateAverage_yonatang.java     | 320 ++++++++++++++++++
 4 files changed, 361 insertions(+)
 create mode 100755 calculate_average_yonatang.sh
 create mode 100755 prepare_yonatang.sh
 create mode 100644 src/main/java/dev/morling/onebrc/CalculateAverage_yonatang.java

diff --git a/calculate_average_yonatang.sh b/calculate_average_yonatang.sh
new file mode 100755
index 000000000..6bc44bda3
--- /dev/null
+++ b/calculate_average_yonatang.sh
@@ -0,0 +1,20 @@
+#!/bin/sh
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+# GC is overrated
+JAVA_OPTS="-XX:+UnlockExperimentalVMOptions -XX:+UseEpsilonGC -XX:+AlwaysPreTouch -Xms512m -Xmx512m"
+java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_yonatang
diff --git a/github_users.txt b/github_users.txt
index 54c443995..bd4726705 100644
--- a/github_users.txt
+++ b/github_users.txt
@@ -51,3 +51,4 @@ hundredwatt;Jason Nochlin
 gnmathur;Gaurav Mathur
 vemana;Subrahmanyam
 jincongho;Jin Cong Ho
+yonatang;Yonatan Graber
diff --git a/prepare_yonatang.sh b/prepare_yonatang.sh
new file mode 100755
index 000000000..4cda7b411
--- /dev/null
+++ b/prepare_yonatang.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+# Uncomment below to use sdk
+# source "$HOME/.sdkman/bin/sdkman-init.sh"
+# sdk use java 21.0.1-graal 1>&2
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_yonatang.java b/src/main/java/dev/morling/onebrc/CalculateAverage_yonatang.java
new file mode 100644
index 000000000..be02e5bd3
--- /dev/null
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_yonatang.java
@@ -0,0 +1,320 @@
+/*
+ *  Copyright 2023 The original authors
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package dev.morling.onebrc;
+
+import java.io.File;
+import java.io.RandomAccessFile;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.nio.MappedByteBuffer;
+import java.nio.channels.FileChannel;
+import java.nio.charset.Charset;
+import java.util.Arrays;
+import java.util.Map;
+import java.util.TreeMap;
+
+public class CalculateAverage_yonatang {
+    private static final String FILE = "./measurements.txt";
+
+    private static final int DICT_OFFSET_STATION = 2;
+    private static final int DICT_OFFSET_SUM = 1;
+    private static final int DICT_SIZE = 15000;
+    private static final int DICT_STATION_RECORD_SIZE = 13;
+    private static final int DICT_RECORD_SIZE = DICT_OFFSET_STATION + DICT_STATION_RECORD_SIZE;
+    private static final int DICT_SIZE_BYTES = DICT_SIZE * DICT_RECORD_SIZE;
+    private static final long[] DICT_ZERO_RECORD = new long[DICT_RECORD_SIZE];
+    private static final long DICT_BASELINE_MEASURES = ((long) Short.MAX_VALUE & 0xFFFF) | (((long) Short.MIN_VALUE & 0xFFFF) << 16);
+
+    public static class HashTable {
+
+        // Contiguous array of fixed-size records: [min|max|count packed into one long, sum, 13 longs of length-prefixed station name]; kept flat to be more CPU cache friendly.
+        private final long[] data = new long[DICT_SIZE_BYTES];
+
+        public HashTable() { // marks every record as empty
+            for (int i = 0; i < DICT_SIZE_BYTES; i += DICT_RECORD_SIZE) {
+                data[i] = DICT_BASELINE_MEASURES; // sentinel: min = Short.MAX_VALUE, max = Short.MIN_VALUE, count = 0
+            }
+        }
+
+        private int getIndex(long[] station) { // returns the data[] offset of the record holding this station, or of the empty record where it should go
+            long key = 0;
+            short len = (short) (station[0] & 0xFF); // low byte of station[0] is read as the name length
+            int longs = ((len + 1) / 8) + 1; // number of leading longs that are hashed and compared
+            for (int i = 0; i < longs; i++) {
+                key = key ^ station[i]; // hash is the XOR of those longs
+            }
+            int idx = Math.abs((int) (key % DICT_SIZE)) * DICT_RECORD_SIZE; // |key % DICT_SIZE| < DICT_SIZE, so the int cast and abs cannot overflow
+
+            while (true) { // linear probing; NOTE(review): never terminates if every record is occupied by a different station
+                if (data[idx] == DICT_BASELINE_MEASURES) {
+                    break; // empty record: station not stored yet
+                }
+                if (Arrays.equals(station, 0, longs,
+                        data,
+                        idx + DICT_OFFSET_STATION, idx + DICT_OFFSET_STATION + longs)) {
+                    break; // same key already stored here
+                }
+                idx += DICT_RECORD_SIZE;
+                if (idx >= DICT_SIZE_BYTES) {
+                    idx = 0; // wrap around to the first record
+                }
+            }
+            return idx;
+        }
+
+        private void addRawMeasurementAgg(long[] title, long measurements, long sum) { // folds an already aggregated record (packed min/max/count plus sum) into this table
+            int idx = getIndex(title);
+            short currentMin = (short) (data[idx] & 0xFFFF); // bits 0-15: min
+            short currentMax = (short) ((data[idx] >> 16) & 0xFFFF); // bits 16-31: max
+            int currentCount = (int) (data[idx] >> 32); // bits 32-63: count
+
+            short thisMin = (short) (measurements & 0xFFFF);
+            short thisMax = (short) ((measurements >> 16) & 0xFFFF);
+            int thisCount = (int) (measurements >> 32);
+
+            thisMin = (short) Math.min(thisMin, currentMin);
+            thisMax = (short) Math.max(thisMax, currentMax);
+            thisCount += currentCount;
+
+            data[idx] = ((long) thisMin & 0xFFFF) | (((long) thisMax & 0xFFFF) << 16) | (((long) thisCount) << 32); // repack the merged min/max/count
+
+            data[idx + DICT_OFFSET_SUM] += sum;
+            System.arraycopy(title, 0, data, idx + DICT_OFFSET_STATION, DICT_STATION_RECORD_SIZE); // (re)write the station key into the record
+        }
+
+        public TreeMap<String, ResultRow> toMap() { // converts every occupied record into a name -> ResultRow entry, sorted by station name
+            TreeMap<String, ResultRow> finalMap = new TreeMap<>();
+            byte[] bytes = new byte[128];
+            ByteBuffer bb = ByteBuffer.allocate(136);
+            bb.order(ByteOrder.nativeOrder());
+            for (int i = 0; i < DICT_SIZE_BYTES; i += DICT_RECORD_SIZE) {
+                if (data[i] == DICT_BASELINE_MEASURES)
+                    continue; // record still holds the empty sentinel
+
+                short min = (short) (data[i] & 0xFFFF);
+                short max = (short) ((data[i] >> 16) & 0xFFFF);
+                int count = (int) (data[i] >> 32);
+                long sum = data[i + DICT_OFFSET_SUM];
+                for (int j = 0; j < DICT_STATION_RECORD_SIZE; j++) {
+                    bb.putLong(data[i + DICT_OFFSET_STATION + j]); // serialize the key longs back into bytes
+                }
+                bb.flip();
+                byte len = bb.get(); // first byte is the name length
+                bb.get(1, bytes, 0, len); // name bytes follow the length byte
+                bb.clear();
+                String station = new String(bytes, 0, len, Charset.forName("UTF-8")); // decode explicitly as UTF-8 rather than with the platform default charset
+                finalMap.put(station, new ResultRow(min / 10.0, (sum / 10.0) / count, max / 10.0)); // stored values are tenths of a degree
+
+            }
+            return finalMap;
+        }
+
+        public void addMeasurement(long[] title, short temp) { // folds one reading (in tenths of a degree) into the record for this station
+            int idx = getIndex(title);
+            short min = (short) (data[idx] & 0xFFFF); // bits 0-15: min
+            short max = (short) ((data[idx] >> 16) & 0xFFFF); // bits 16-31: max
+            int count = (int) (data[idx] >> 32); // bits 32-63: count
+            min = (short) Math.min(min, temp);
+            max = (short) Math.max(max, temp);
+            count += 1;
+
+            data[idx] = ((long) min & 0xFFFF) | (((long) max & 0xFFFF) << 16) | (((long) count) << 32); // repack the updated min/max/count
+            data[idx + DICT_OFFSET_SUM] += temp;
+            System.arraycopy(title, 0, data, idx + DICT_OFFSET_STATION, DICT_STATION_RECORD_SIZE); // (re)write the station key into the record
+        }
+
+        public void mergeInto(HashTable other) { // adds every occupied record of this table to other; this table is not modified
+            long[] title = new long[DICT_STATION_RECORD_SIZE]; // scratch copy of the current record's key
+            for (int i = 0; i < DICT_SIZE_BYTES; i += DICT_RECORD_SIZE) {
+                if (data[i] == DICT_BASELINE_MEASURES)
+                    continue; // record still holds the empty sentinel
+                System.arraycopy(data, i + DICT_OFFSET_STATION, title, 0, DICT_STATION_RECORD_SIZE);
+                other.addRawMeasurementAgg(title, data[i], data[i + DICT_OFFSET_SUM]);
+            }
+        }
+
+    }
+
+    private static class ResultRow { // immutable min/mean/max triple for one station
+        final double min;
+        final double mean;
+        final double max;
+
+        ResultRow(double min, double mean, double max) {
+            this.min = min;
+            this.mean = mean;
+            this.max = max;
+        }
+
+        public String toString() { // renders as "min/mean/max", each rounded to one decimal
+            return round(min) + "/" + round(mean) + "/" + round(max);
+        }
+
+        private double round(double value) { // rounds to one decimal place via Math.round
+            return Math.round(value * 10.0) / 10.0;
+        }
+    }
+
+    public static boolean parseStation(MappedByteBuffer byteBuffer, ByteBuffer tempBb, long[] station) { // reads a name up to ';' into station as [length byte, name bytes...]; returns false if the buffer ends first
+        System.arraycopy(DICT_ZERO_RECORD, 0, station, 0, DICT_STATION_RECORD_SIZE); // zero the output key
+        byte len = 1; // counts staged bytes, including the leading length byte
+        boolean valid = false;
+        tempBb.clear();
+        tempBb.put((byte) 0); // placeholder for the length byte, filled in after the loop
+        while (byteBuffer.hasRemaining()) {
+            byte ch = byteBuffer.get();
+            if (ch == '\n') {
+                continue; // skip newlines in front of the name
+            }
+            if (ch == ';') {
+                valid = true;
+                break;
+            }
+            tempBb.put(ch);
+            // The name bytes are only staged in tempBb here; they are converted to
+            // longs in one go after the loop (tempBb.asLongBuffer().get(station)),
+            // so no per-byte shifting into station[] is needed.
+            // NOTE(review): tempBb is not bounds-checked here; an over-long name overflows it.
+            len++;
+        }
+        tempBb.put(0, (byte) (len - 1)); // store the name length in byte 0
+        if (!valid) {
+            return false;
+        }
+        tempBb.position(0);
+        tempBb.asLongBuffer().get(station); // tempBb is not zeroed between calls, so longs past the name may hold stale bytes
+
+        int pivotIdx = (len) / 8; // index of the long containing the first byte after the name
+        long pivotBits = (len % 8) * 8;
+        long pivotMask = (1L << pivotBits) - 1; // keeps the low pivotBits bits; NOTE(review): presumably assumes little-endian native order - confirm
+        station[pivotIdx] = station[pivotIdx] & pivotMask; // clear stale bytes in the pivot long
+        return true;
+    }
+
+    public static short parseShort(MappedByteBuffer byteBuffer) { // parses a decimal reading up to '\n' as an integer with the '.' dropped; Short.MIN_VALUE if the buffer ends first
+        boolean valid = false;
+        boolean negative = false;
+        int num = 0;
+        while (byteBuffer.hasRemaining()) {
+            byte ch = byteBuffer.get();
+            if (ch == '\n') {
+                valid = true; // a complete line was consumed
+                break;
+            }
+            if (ch == '-') {
+                negative = true;
+            }
+            else if (ch == '.') {
+                // noop
+            }
+            else {
+                num = (num * 10 + (ch - '0')); // every other byte is treated as a digit without validation
+            }
+        }
+        if (!valid) {
+            return Short.MIN_VALUE; // sentinel: no terminating '\n' before the end of the buffer
+        }
+
+        return (short) (negative ? -num : num);
+    }
+
+    private static final int MARGIN = 130;
+
+    private static void processChunk(FileChannel fc, int j, long chunkSize, HashTable[] maps, boolean isLast) { // aggregates the lines of chunk j into maps[j]
+        try {
+            HashTable agg = new HashTable();
+            maps[j] = agg; // stored before parsing so the caller can merge it after joining the thread
+            long[] station = new long[DICT_STATION_RECORD_SIZE];
+            ByteBuffer tempBb = ByteBuffer.allocate((DICT_STATION_RECORD_SIZE + 1) * Long.BYTES);
+            tempBb.order(ByteOrder.nativeOrder());
+
+            long startIdx = Math.max(j * chunkSize - MARGIN, 0); // also map MARGIN bytes before the nominal start to locate the line start
+            int padding;
+            if (isLast) {
+                chunkSize = fc.size() - startIdx; // the last chunk runs to the end of the file
+                padding = 0;
+            }
+            else {
+                padding = j == 0 ? 0 : MARGIN; // compensates for the MARGIN bytes mapped in front
+            }
+            if (chunkSize == 0) {
+                return;
+            }
+            MappedByteBuffer byteBuffer = fc.map(FileChannel.MapMode.READ_ONLY, startIdx, chunkSize + padding);
+            // Search back for the last '\n' strictly before the nominal chunk start (buffer index MARGIN).
+            if (startIdx > 0) {
+                int i = MARGIN - 1; // the previous chunk's mapping ends before index MARGIN, so a line terminated exactly there must be parsed by this chunk
+                while (i > 0) {
+                    byte ch = byteBuffer.get(i);
+                    if (ch == '\n') {
+                        break;
+                    }
+                    i--;
+                }
+                byteBuffer.position(i); // parseStation skips the '\n' itself
+            }
+
+            while (byteBuffer.hasRemaining()) {
+                if (!parseStation(byteBuffer, tempBb, station)) {
+                    continue; // buffer ended inside a station name; the next chunk parses that line
+                }
+                short value = parseShort(byteBuffer);
+                if (value == Short.MIN_VALUE) {
+                    continue; // buffer ended before the terminating '\n'; the next chunk parses that line
+                }
+                agg.addMeasurement(station, value);
+            }
+        }
+        catch (Exception e) {
+            e.printStackTrace(); // NOTE(review): failures are only logged; this chunk's partial result is still merged by the caller
+        }
+    }
+
+    public static void main(String[] args) throws Exception { // splits the file into chunks, aggregates each on its own thread, merges and prints the result
+        // long start = System.nanoTime();
+
+        File f = new File(FILE);
+        try (RandomAccessFile raf = new RandomAccessFile(f, "r");
+                FileChannel fc = raf.getChannel()) {
+
+            int chunks = f.length() < 1_048_576 ? 1 : (Runtime.getRuntime().availableProcessors()); // files under 1 MiB are processed as a single chunk
+
+            long chunkSize = f.length() / chunks;
+
+            Thread[] threads = new Thread[chunks];
+            HashTable totalAgg = new HashTable();
+            HashTable[] maps = new HashTable[chunks]; // one result table per chunk, filled by processChunk
+
+            for (int i = 0; i < chunks; i++) {
+                final int j = i; // effectively final copy for the lambda
+                Thread thread = new Thread(() -> processChunk(fc, j, chunkSize, maps, j == chunks - 1));
+                threads[i] = thread;
+                thread.start();
+            }
+            for (int i = 0; i < chunks; i++) {
+                threads[i].join(); // wait for chunk i before reading its table
+                maps[i].mergeInto(totalAgg);
+            }
+
+            Map<String, ResultRow> finalMap = totalAgg.toMap();
+            // long end = System.nanoTime();
+
+            System.out.println(finalMap);
+            // System.err.println("Total time: " + java.time.Duration.ofNanos(end - start).toMillis() + "ms");
+        }
+
+    }
+}

From 114ba76d20f946ac6421aff73cd69387b0cb15b7 Mon Sep 17 00:00:00 2001
From: Jaromir Hamala <jaromir.hamala@gmail.com>
Date: Sat, 20 Jan 2024 20:06:31 +0100
Subject: [PATCH 077/268] jerrinot's improvement (#514)

* refactoring

* segregated heap for names

also a different hashing function. turns out hashing just first word is good enough
---
 .../onebrc/CalculateAverage_jerrinot.java     | 151 ++++++++----------
 1 file changed, 67 insertions(+), 84 deletions(-)

diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_jerrinot.java b/src/main/java/dev/morling/onebrc/CalculateAverage_jerrinot.java
index 5373cb084..13e48ae05 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_jerrinot.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_jerrinot.java
@@ -38,7 +38,7 @@ public class CalculateAverage_jerrinot {
     // todo: with hyper-threading enable we would be better of with availableProcessors / 2;
     // todo: validate the testing env. params.
     private static final int THREAD_COUNT = Runtime.getRuntime().availableProcessors();
-    // private static final int THREAD_COUNT = 4;
+    // private static final int THREAD_COUNT = 1;
 
     private static final long SEPARATOR_PATTERN = 0x3B3B3B3B3B3B3B3BL;
 
@@ -153,8 +153,9 @@ public static int ceilPow2(int i) {
     }
 
     private static class Processor implements Runnable {
-        private static final int MAP_SLOT_COUNT = ceilPow2(10000);
-        private static final int STATION_MAX_NAME_BYTES = 120;
+        private static final int MAX_UNIQUE_KEYS = 10000;
+        private static final int MAP_SLOT_COUNT = ceilPow2(MAX_UNIQUE_KEYS);
+        private static final int STATION_MAX_NAME_BYTES = 104;
 
         private static final long COUNT_OFFSET = 0;
         private static final long MIN_OFFSET = 4;
@@ -163,20 +164,20 @@ private static class Processor implements Runnable {
         private static final long LEN_OFFSET = 20;
         private static final long NAME_OFFSET = 24;
 
-        private static final int MAP_ENTRY_SIZE_BYTES = +Integer.BYTES // count // 0
+        private static final int MAP_ENTRY_SIZE_BYTES = Integer.BYTES // count // 0
                 + Integer.BYTES // min // +4
                 + Integer.BYTES // max // +8
                 + Long.BYTES // sum // +12
                 + Integer.BYTES // station name len // +20
-                + STATION_MAX_NAME_BYTES; // +24
+                + Long.BYTES; // station name ptr // 24
 
         private static final int MAP_SIZE_BYTES = MAP_SLOT_COUNT * MAP_ENTRY_SIZE_BYTES;
+        private static final int MAP_NAMES_BYTES = MAX_UNIQUE_KEYS * STATION_MAX_NAME_BYTES;
         private static final long MAP_MASK = MAP_SLOT_COUNT - 1;
 
-        // todo: some fields could probably be converted to locals
-
         private final long map;
-
+        private long currentNamesPtr;
+        private final long namesHi;
         private long cursorA;
         private long endA;
         private long cursorB;
@@ -240,7 +241,7 @@ void accumulateStatus(TreeMap<String, StationStats> accumulator) {
                     continue;
                 }
                 byte[] nameArr = new byte[(int) len];
-                long baseNameAddr = baseAddress + NAME_OFFSET;
+                long baseNameAddr = UNSAFE.getLong(baseAddress + NAME_OFFSET);
                 for (int i = 0; i < len; i++) {
                     nameArr[i] = UNSAFE.getByte(baseNameAddr + i);
                 }
@@ -270,6 +271,8 @@ void accumulateStatus(TreeMap<String, StationStats> accumulator) {
             this.endC = endC;
             this.endD = endD;
             this.map = UNSAFE.allocateMemory(MAP_SIZE_BYTES);
+            this.currentNamesPtr = UNSAFE.allocateMemory(MAP_NAMES_BYTES);
+            this.namesHi = currentNamesPtr + MAP_NAMES_BYTES;
 
             int i;
             for (i = 0; i < MAP_SIZE_BYTES; i += 8) {
@@ -278,6 +281,7 @@ void accumulateStatus(TreeMap<String, StationStats> accumulator) {
             for (i = i - 8; i < MAP_SIZE_BYTES; i++) {
                 UNSAFE.putByte(map + i, (byte) 0);
             }
+            UNSAFE.setMemory(currentNamesPtr, MAP_NAMES_BYTES, (byte) 0);
         }
 
         private void doTail() {
@@ -293,58 +297,56 @@ private void doTail() {
             // System.out.println("done D");
         }
 
-        private void doOne(long cursorA, long endA) {
-            while (cursorA < endA) {
-                long startA = cursorA;
-                long delimiterWordA = UNSAFE.getLong(cursorA);
-                long hashA = 0;
-                long maskA = getDelimiterMask(delimiterWordA);
-                while (maskA == 0) {
-                    hashA ^= delimiterWordA;
-                    cursorA += 8;
-                    delimiterWordA = UNSAFE.getLong(cursorA);
-                    maskA = getDelimiterMask(delimiterWordA);
+        private void doOne(long cursor, long endA) {
+            while (cursor < endA) {
+                long start = cursor;
+                long currentWord = UNSAFE.getLong(cursor);
+                long mask = getDelimiterMask(currentWord);
+                long maskedFirstWord = currentWord & ((mask - 1) ^ mask) >>> 8;
+                long hash = hash(maskedFirstWord);
+                while (mask == 0) {
+                    cursor += 8;
+                    currentWord = UNSAFE.getLong(cursor);
+                    mask = getDelimiterMask(currentWord);
                 }
-                final int delimiterByteA = Long.numberOfTrailingZeros(maskA);
-                final long semicolonA = cursorA + (delimiterByteA >> 3);
-                final long maskedWordA = delimiterWordA & ((maskA - 1) ^ maskA) >>> 8;
-                hashA ^= maskedWordA;
-                int intHashA = (int) (hashA ^ (hashA >> 32));
-                intHashA = intHashA ^ (intHashA >> 17);
-
-                long baseEntryPtrA = getOrCreateEntryBaseOffset(semicolonA, startA, intHashA, maskedWordA);
-                long temperatureWordA = UNSAFE.getLong(semicolonA + 1);
-                cursorA = parseAndStoreTemperature(semicolonA + 1, baseEntryPtrA, temperatureWordA);
+                final int delimiterByte = Long.numberOfTrailingZeros(mask);
+                final long semicolon = cursor + (delimiterByte >> 3);
+                final long maskedWord = currentWord & ((mask - 1) ^ mask) >>> 8;
+                long baseEntryPtr = getOrCreateEntryBaseOffset(semicolon, start, (int) hash, maskedWord);
+                long temperatureWord = UNSAFE.getLong(semicolon + 1);
+                cursor = parseAndStoreTemperature(semicolon + 1, baseEntryPtr, temperatureWord);
             }
         }
 
+        private static long hash(long word1) { // hashes a single 64-bit word: multiply by a fixed constant, then rotate left
+            // credit: mtopolnik
+            long seed = 0x51_7c_c1_b7_27_22_0a_95L; // multiplier
+            int rotDist = 17; // rotation distance in bits
+
+            long hash = word1;
+            hash *= seed;
+            hash = Long.rotateLeft(hash, rotDist);
+            return hash;
+        }
+
         @Override
         public void run() {
             while (cursorA < endA && cursorB < endB && cursorC < endC && cursorD < endD) {
-                // todo: experiment with different inter-leaving
                 long startA = cursorA;
                 long startB = cursorB;
                 long startC = cursorC;
                 long startD = cursorD;
 
                 long currentWordA = UNSAFE.getLong(startA);
-                // long delimiterWordA2 = UNSAFE.getLong(startA + 8);
                 long currentWordB = UNSAFE.getLong(startB);
-                // long delimiterWordB2 = UNSAFE.getLong(startB + 8);
                 long currentWordC = UNSAFE.getLong(startC);
-                // long delimiterWordCa = UNSAFE.getLong(startC + 8);
                 long currentWordD = UNSAFE.getLong(startD);
-                // long delimiterWordD2 = UNSAFE.getLong(startD + 8);
-
-                long hashA = 0;
-                long hashB = 0;
-                long hashC = 0;
-                long hashD = 0;
 
-                // credits for the hashing idea: royvanrijn
+                // credits for the hashing idea: mtopolnik
                 long maskA = getDelimiterMask(currentWordA);
+                long maskedFirstWordA = currentWordA & ((maskA - 1) ^ maskA) >>> 8;
+                long hashA = hash(maskedFirstWordA);
                 while (maskA == 0) {
-                    hashA ^= currentWordA;
                     cursorA += 8;
                     currentWordA = UNSAFE.getLong(cursorA);
                     maskA = getDelimiterMask(currentWordA);
@@ -353,13 +355,11 @@ public void run() {
                 final long semicolonA = cursorA + (delimiterByteA >> 3);
                 long temperatureWordA = UNSAFE.getLong(semicolonA + 1);
                 final long maskedWordA = currentWordA & ((maskA - 1) ^ maskA) >>> 8;
-                hashA ^= maskedWordA;
-                int intHashA = (int) (hashA ^ (hashA >> 32));
-                intHashA = intHashA ^ (intHashA >> 17);
 
                 long maskB = getDelimiterMask(currentWordB);
+                long maskedFirstWordB = currentWordB & ((maskB - 1) ^ maskB) >>> 8;
+                long hashB = hash(maskedFirstWordB);
                 while (maskB == 0) {
-                    hashB ^= currentWordB;
                     cursorB += 8;
                     currentWordB = UNSAFE.getLong(cursorB);
                     maskB = getDelimiterMask(currentWordB);
@@ -368,13 +368,11 @@ public void run() {
                 final long semicolonB = cursorB + (delimiterByteB >> 3);
                 long temperatureWordB = UNSAFE.getLong(semicolonB + 1);
                 final long maskedWordB = currentWordB & ((maskB - 1) ^ maskB) >>> 8;
-                hashB ^= maskedWordB;
-                int intHashB = (int) (hashB ^ (hashB >> 32));
-                intHashB = intHashB ^ (intHashB >> 17);
 
                 long maskC = getDelimiterMask(currentWordC);
+                long maskedFirstWordC = currentWordC & ((maskC - 1) ^ maskC) >>> 8;
+                long hashC = hash(maskedFirstWordC);
                 while (maskC == 0) {
-                    hashC ^= currentWordC;
                     cursorC += 8;
                     currentWordC = UNSAFE.getLong(cursorC);
                     maskC = getDelimiterMask(currentWordC);
@@ -383,13 +381,11 @@ public void run() {
                 final long semicolonC = cursorC + (delimiterByteC >> 3);
                 long temperatureWordC = UNSAFE.getLong(semicolonC + 1);
                 final long maskedWordC = currentWordC & ((maskC - 1) ^ maskC) >>> 8;
-                hashC ^= maskedWordC;
-                int intHashC = (int) (hashC ^ (hashC >> 32));
-                intHashC = intHashC ^ (intHashC >> 17);
 
                 long maskD = getDelimiterMask(currentWordD);
+                long maskedFirstWordD = currentWordD & ((maskD - 1) ^ maskD) >>> 8;
+                long hashD = hash(maskedFirstWordD);
                 while (maskD == 0) {
-                    hashD ^= currentWordD;
                     cursorD += 8;
                     currentWordD = UNSAFE.getLong(cursorD);
                     maskD = getDelimiterMask(currentWordD);
@@ -398,14 +394,11 @@ public void run() {
                 final long semicolonD = cursorD + (delimiterByteD >> 3);
                 long temperatureWordD = UNSAFE.getLong(semicolonD + 1);
                 final long maskedWordD = currentWordD & ((maskD - 1) ^ maskD) >>> 8;
-                hashD ^= maskedWordD;
-                int intHashD = (int) (hashD ^ (hashD >> 32));
-                intHashD = intHashD ^ (intHashD >> 17);
 
-                long baseEntryPtrA = getOrCreateEntryBaseOffset(semicolonA, startA, intHashA, maskedWordA);
-                long baseEntryPtrB = getOrCreateEntryBaseOffset(semicolonB, startB, intHashB, maskedWordB);
-                long baseEntryPtrC = getOrCreateEntryBaseOffset(semicolonC, startC, intHashC, maskedWordC);
-                long baseEntryPtrD = getOrCreateEntryBaseOffset(semicolonD, startD, intHashD, maskedWordD);
+                long baseEntryPtrA = getOrCreateEntryBaseOffset(semicolonA, startA, (int) hashA, maskedWordA);
+                long baseEntryPtrB = getOrCreateEntryBaseOffset(semicolonB, startB, (int) hashB, maskedWordB);
+                long baseEntryPtrC = getOrCreateEntryBaseOffset(semicolonC, startC, (int) hashC, maskedWordC);
+                long baseEntryPtrD = getOrCreateEntryBaseOffset(semicolonD, startD, (int) hashD, maskedWordD);
 
                 cursorA = parseAndStoreTemperature(semicolonA + 1, baseEntryPtrA, temperatureWordA);
                 cursorB = parseAndStoreTemperature(semicolonB + 1, baseEntryPtrB, temperatureWordB);
@@ -415,52 +408,42 @@ public void run() {
             doTail();
         }
 
-        private long getOrCreateEntryBaseOffset(long semicolonA, long startA, int intHashA, long maskedWordA) {
-            // hashSet.add(intHashA);
-            long lenLong = semicolonA - startA;
+        private long getOrCreateEntryBaseOffset(long semicolonPtr, long startPtr, int hash, long maskedWord) {
+            long lenLong = semicolonPtr - startPtr;
             int lenA = (int) lenLong;
 
-            // assert lenA != 0;
-            // byte[] nameArr = new byte[lenA];
-            // for (int i = 0; i < lenA; i++) {
-            // nameArr[i] = UNSAFE.getByte(startA + i);
-            // }
-            // String nameStr = new String(nameArr);
-            // Integer oldHash = nameToHash.put(nameStr, intHashA);
-            // assert oldHash == null || oldHash == intHashA : "name: " + nameStr + ", old hash = " + oldHash + ", new hash = " + intHashA;
-
-            long mapIndexA = intHashA & MAP_MASK;
-            // long clusterLen = 0;
+            long mapIndexA = hash & MAP_MASK;
             for (;;) {
                 long basePtr = mapIndexA * MAP_ENTRY_SIZE_BYTES + map;
                 long lenPtr = basePtr + LEN_OFFSET;
+                long namePtr = basePtr + NAME_OFFSET;
                 int len = UNSAFE.getInt(lenPtr);
                 if (len == lenA) {
-                    if (nameMatch(startA, maskedWordA, basePtr, lenLong)) {
-                        // if (clusterLen > maxClusterLen) {
-                        // maxClusterLen = clusterLen;
-                        // System.out.println("max cluster len: " + clusterLen);
-                        // }
+                    namePtr = UNSAFE.getLong(basePtr + NAME_OFFSET);
+                    if (nameMatch(startPtr, maskedWord, namePtr, lenLong)) {
                         return basePtr;
                     }
                 }
                 else if (len == 0) {
                     // todo: uncommon branch maybe?
                     // empty slot
-                    UNSAFE.copyMemory(semicolonA - lenA, basePtr + NAME_OFFSET, lenA);
+                    UNSAFE.putLong(namePtr, currentNamesPtr);
                     UNSAFE.putInt(lenPtr, lenA);
                     // todo: this could be a single putLong()
                     UNSAFE.putInt(basePtr + MAX_OFFSET, Integer.MIN_VALUE);
                     UNSAFE.putInt(basePtr + MIN_OFFSET, Integer.MAX_VALUE);
+                    UNSAFE.copyMemory(startPtr, currentNamesPtr, lenA);
+                    long consume = (lenLong & ~7L) + 8;
+                    currentNamesPtr += consume;
+                    assert currentNamesPtr <= namesHi;
                     return basePtr;
                 }
                 mapIndexA = ++mapIndexA & MAP_MASK;
-                // clusterLen++;
             }
         }
 
-        private static boolean nameMatch(long startA, long maskedWordA, long basePtr, long len) {
-            long namePtr = basePtr + NAME_OFFSET;
+        private static boolean nameMatch(long startA, long maskedWordA, long namePtr, long len) {
+            // long namePtr = basePtr + NAME_OFFSET;
             long fullLen = len & ~7L;
             long offset;
 

From 062f2bbecf586d85ff44dec42cc63f94e49bc6b8 Mon Sep 17 00:00:00 2001
From: Dr Ian Preston <157221403+ianopolousfast@users.noreply.github.com>
Date: Sat, 20 Jan 2024 19:09:40 +0000
Subject: [PATCH 078/268] Introducing the vector api. 1s faster on 4 core i7
 (#506)

Co-authored-by: Ian Preston <ianopolous@protonmail.com>
---
 calculate_average_ianopolousfast.sh           |   2 +-
 .../CalculateAverage_ianopolousfast.java      | 104 +++++++++---------
 2 files changed, 51 insertions(+), 55 deletions(-)

diff --git a/calculate_average_ianopolousfast.sh b/calculate_average_ianopolousfast.sh
index e5c0977e0..06c31d9e5 100755
--- a/calculate_average_ianopolousfast.sh
+++ b/calculate_average_ianopolousfast.sh
@@ -15,5 +15,5 @@
 #  limitations under the License.
 #
 
-JAVA_OPTS="--enable-preview"
+JAVA_OPTS="--enable-preview --add-modules=jdk.incubator.vector"
 java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_ianopolousfast
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_ianopolousfast.java b/src/main/java/dev/morling/onebrc/CalculateAverage_ianopolousfast.java
index 4bffe7839..8944a472f 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_ianopolousfast.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_ianopolousfast.java
@@ -15,6 +15,10 @@
  */
 package dev.morling.onebrc;
 
+import jdk.incubator.vector.ByteVector;
+import jdk.incubator.vector.VectorOperators;
+import jdk.incubator.vector.VectorSpecies;
+
 import java.lang.foreign.Arena;
 import java.lang.foreign.MemorySegment;
 import java.nio.ByteOrder;
@@ -30,19 +34,23 @@
 /* A fast implementation with no unsafe.
  * Features:
  * * memory mapped file using preview Arena FFI
+ * * semicolon finding using incubator vector api
  * * read chunks in parallel
  * * minimise allocation
  * * no unsafe
  *
  * Timings on 4 core i7-7500U CPU @ 2.70GHz:
  * average_baseline: 4m48s
- * ianopolous:         16s
+ * ianopolous:         15s
 */
 public class CalculateAverage_ianopolousfast {
 
     public static final int MAX_LINE_LENGTH = 107;
     public static final int MAX_STATIONS = 1 << 14;
     private static final OfLong LONG_LAYOUT = JAVA_LONG_UNALIGNED.withOrder(ByteOrder.BIG_ENDIAN);
+    private static final VectorSpecies<Byte> BYTE_SPECIES = ByteVector.SPECIES_PREFERRED.length() >= 32
+            ? ByteVector.SPECIES_256
+            : ByteVector.SPECIES_128;
 
     public static void main(String[] args) throws Exception {
         Arena arena = Arena.global();
@@ -165,58 +173,40 @@ public static Stat dedupeStation16(long start, long end, long hash, long first8,
         }
     }
 
-    public static long hasSemicolon(long d) {
-        // from Hacker's Delight page 92
-        d = d ^ 0x3b3b3b3b3b3b3b3bL;
-        long y = (d & 0x7f7f7f7f7f7f7f7fL) + 0x7f7f7f7f7f7f7f7fL;
-        return ~(y | d | 0x7f7f7f7f7f7f7f7fL);
-    }
-
-    public static int getSemicolonIndex(long y) {
-        // from Hacker's Delight page 92
-        return Long.numberOfLeadingZeros(y) >> 3;
-    }
-
     static long maskHighBytes(long d, int nbytes) {
         return d & (-1L << ((8 - nbytes) * 8));
     }
 
     public static Stat parseStation(long lineStart, MemorySegment buffer, List<List<Stat>> stations) {
-        // find semicolon and update hash as we go, reading a long at a time
-        long d = buffer.get(LONG_LAYOUT, lineStart);
-        long hasSemi = hasSemicolon(d);
-        if (hasSemi != 0) {
-            int semiIndex = getSemicolonIndex(hasSemi);
-            d = maskHighBytes(d, semiIndex);
-            return dedupeStation8(lineStart, lineStart + semiIndex, d, d, buffer, stations);
-        }
-        long first8 = d;
-        long hash = d;
-
-        d = buffer.get(LONG_LAYOUT, lineStart + 8);
-        hasSemi = hasSemicolon(d);
-        if (hasSemi != 0) {
-            int semiIndex = getSemicolonIndex(hasSemi);
-            if (semiIndex == 0)
-                return dedupeStation8(lineStart, lineStart + 8, first8, first8, buffer, stations);
-            d = maskHighBytes(d, semiIndex);
-            return dedupeStation16(lineStart, lineStart + 8 + semiIndex, first8 ^ d, first8, d, buffer, stations);
+        ByteVector line = ByteVector.fromMemorySegment(BYTE_SPECIES, buffer, lineStart, ByteOrder.nativeOrder());
+        int keySize = line.compare(VectorOperators.EQ, ';').firstTrue();
+
+        if (keySize == BYTE_SPECIES.vectorByteSize()) {
+            while (buffer.get(JAVA_BYTE, lineStart + keySize) != ';') {
+                keySize++;
+            }
+            long first8 = buffer.get(LONG_LAYOUT, lineStart);
+            if (keySize < 8)
+                return dedupeStation8(lineStart, lineStart + keySize, first8, first8, buffer, stations);
+            long second8 = buffer.get(LONG_LAYOUT, lineStart + 8);
+            if (keySize < 16)
+                return dedupeStation16(lineStart, lineStart + keySize, first8 ^ second8, first8, second8, buffer, stations);
+            long hash = first8 ^ second8; // todo include other bytes
+            return dedupeStation(lineStart, lineStart + keySize, hash, first8, second8, buffer, stations);
         }
 
-        int index = 8;
-        long second8 = d;
-        while (hasSemi == 0) {
-            hash = hash ^ d;
-            index += 8;
-            d = buffer.get(LONG_LAYOUT, lineStart + index);
-            hasSemi = hasSemicolon(d);
+        long first8 = buffer.get(LONG_LAYOUT, lineStart);
+        if (keySize <= 8) {
+            first8 = maskHighBytes(first8, keySize & 0x07);
+            return dedupeStation8(lineStart, lineStart + keySize, first8, first8, buffer, stations);
         }
-        int semiIndex = getSemicolonIndex(hasSemi);
-        d = maskHighBytes(d, semiIndex);
-        if (semiIndex > 0) {
-            hash = hash ^ d;
+        long second8 = buffer.get(LONG_LAYOUT, lineStart + 8);
+        if (keySize < 16) {
+            second8 = maskHighBytes(second8, keySize & 0x07);
+            return dedupeStation16(lineStart, lineStart + keySize, first8 ^ second8, first8, second8, buffer, stations);
         }
-        return dedupeStation(lineStart, lineStart + index + semiIndex, hash, first8, second8, buffer, stations);
+        long hash = first8 ^ second8; // todo include later bytes
+        return dedupeStation(lineStart, lineStart + keySize, hash, first8, second8, buffer, stations);
     }
 
     public static int getDot(long d) {
@@ -266,24 +256,30 @@ public static List<List<Stat>> parseStats(long startByte, long endByte, MemorySe
         for (int i = 0; i < MAX_STATIONS; i++)
             stations.add(null);
 
-        // Handle reading the very last line in the file
-        // this allows us to not worry about reading a long beyond the end
+        // Handle reading the very last few lines in the file
+        // this allows us to not worry about reading beyond the end
         // in the inner loop (reducing branches)
-        // We only need to read one because the min record size is 6 bytes
-        // so 2nd last record must be > 8 from end
+        // We need to step back from the end of the file by at least one vector width (in bytes)
         if (endByte == buffer.byteSize()) {
-            endByte -= 2; // skip final new line
-            while (endByte > 0 && buffer.get(JAVA_BYTE, endByte) != '\n')
+            endByte -= 1; // skip final new line
+            // walk backwards one line at a time until we are at least one vector width from the end
+            while (endByte > 0 && buffer.byteSize() - endByte < BYTE_SPECIES.vectorByteSize()) {
                 endByte--;
+                while (endByte > 0 && buffer.get(JAVA_BYTE, endByte) != '\n')
+                    endByte--;
+            }
 
             if (endByte > 0)
                 endByte++;
-            // copy into a 8n sized buffer to avoid reading off end
-            MemorySegment end = Arena.global().allocate(MAX_LINE_LENGTH + 4);
+            // copy into a larger buffer to avoid reading off end
+            MemorySegment end = Arena.global().allocate(MAX_LINE_LENGTH + BYTE_SPECIES.vectorByteSize());
             for (long i = endByte; i < buffer.byteSize(); i++)
                 end.set(JAVA_BYTE, i - endByte, buffer.get(JAVA_BYTE, i));
-            Stat station = parseStation(0, end, stations);
-            processTemperature(station.name.length + 1, end, station);
+            int index = 0;
+            while (endByte + index < buffer.byteSize()) {
+                Stat station = parseStation(index, end, stations);
+                index = (int) processTemperature(index + station.name.length + 1, end, station);
+            }
         }
 
         while (startByte < endByte) {

From 9100ed6316207b6963d31d3ae604a317932d2d13 Mon Sep 17 00:00:00 2001
From: Roman Musin <995612+roman-r-m@users.noreply.github.com>
Date: Sat, 20 Jan 2024 19:30:25 +0000
Subject: [PATCH 079/268] Epsilon GC + a number of other small tweaks (#513)

* Version 3

* Use SWAR algorithm from netty for finding a symbol in a string

* Faster equals - store the remainder in a long field (- 0.5s)

* optimise parsing numbers - prep

* Keep tweaking parsing logic

* Rewrote number parsing

may be a tiny bit faster, if at all

* Epsilon GC
---
 calculate_average_roman-r-m.sh                |   6 +
 .../onebrc/CalculateAverage_roman_r_m.java    | 125 ++++++++----------
 2 files changed, 59 insertions(+), 72 deletions(-)

diff --git a/calculate_average_roman-r-m.sh b/calculate_average_roman-r-m.sh
index 47626a1ac..fe468dcec 100755
--- a/calculate_average_roman-r-m.sh
+++ b/calculate_average_roman-r-m.sh
@@ -16,4 +16,10 @@
 #
 
 JAVA_OPTS="--enable-preview -XX:+UseTransparentHugePages"
+
+# epsilon GC needs enough memory or it makes things worse
+# see https://stackoverflow.com/questions/58087596/why-are-repeated-memory-allocations-observed-to-be-slower-using-epsilon-vs-g1
+# 2GB seems to be the sweet spot
+JAVA_OPTS="$JAVA_OPTS -XX:+UnlockExperimentalVMOptions -XX:-EnableJVMCI -XX:+UseEpsilonGC -Xmx2G -Xms2G -XX:+AlwaysPreTouch"
+
 java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_roman_r_m
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_roman_r_m.java b/src/main/java/dev/morling/onebrc/CalculateAverage_roman_r_m.java
index 5c4382458..a7df56e07 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_roman_r_m.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_roman_r_m.java
@@ -33,37 +33,35 @@ public class CalculateAverage_roman_r_m {
 
     private static Unsafe UNSAFE;
 
-    // based on http://0x80.pl/notesen/2023-03-06-swar-find-any.html
-    static long hasZeroByte(long l) {
-        return ((l - 0x0101010101010101L) & ~(l) & 0x8080808080808080L);
-    }
-
-    static long firstSetByteIndex(long l) {
-        return ((((l - 1) & 0x101010101010101L) * 0x101010101010101L) >> 56) - 1;
-    }
-
-    static long broadcast(byte b) {
+    private static long broadcast(byte b) {
         return 0x101010101010101L * b;
     }
 
-    static long SEMICOLON_MASK = broadcast((byte) ';');
-    static long LINE_END_MASK = broadcast((byte) '\n');
+    private static final long SEMICOLON_MASK = broadcast((byte) ';');
+    private static final long LINE_END_MASK = broadcast((byte) '\n');
+    private static final long DOT_MASK = broadcast((byte) '.');
+
+    // from netty
 
-    static long find(long l, long mask) {
-        long xor = l ^ mask;
-        long match = hasZeroByte(xor);
-        return match != 0 ? firstSetByteIndex(match) : -1;
+    /**
+     * Applies a compiled pattern to given word.
+     * Returns a word where each byte that matches the pattern has the highest bit set.
+     */
+    private static long applyPattern(final long word, final long pattern) {
+        long input = word ^ pattern;
+        long tmp = (input & 0x7F7F7F7F7F7F7F7FL) + 0x7F7F7F7F7F7F7F7FL;
+        return ~(tmp | input | 0x7F7F7F7F7F7F7F7FL);
     }
 
     static long nextNewline(long from, MemorySegment ms) {
         long start = from;
         long i;
         long next = ms.get(ValueLayout.JAVA_LONG_UNALIGNED, start);
-        while ((i = find(next, LINE_END_MASK)) < 0) {
+        while ((i = applyPattern(next, LINE_END_MASK)) == 0) {
             start += 8;
             next = ms.get(ValueLayout.JAVA_LONG_UNALIGNED, start);
         }
-        return start + i;
+        return start + Long.numberOfTrailingZeros(i) / 8;
     }
 
     static class Worker {
@@ -84,55 +82,53 @@ public Worker(FileChannel channel, long start, long end) {
 
         private void parseName(ByteString station) {
             long start = offset;
-            long pos = -1;
-
-            while (end - offset > 8) {
-                long next = UNSAFE.getLong(offset);
-                pos = find(next, SEMICOLON_MASK);
-                if (pos >= 0) {
-                    offset += pos;
-                    break;
-                }
-                else {
-                    offset += 8;
-                }
-            }
-            if (pos < 0) {
-                while (UNSAFE.getByte(offset++) != ';') {
-                }
-                offset--;
+            long pattern;
+            long next = UNSAFE.getLong(offset);
+            while ((pattern = applyPattern(next, SEMICOLON_MASK)) == 0) {
+                offset += 8;
+                next = UNSAFE.getLong(offset);
             }
+            int bytes = Long.numberOfTrailingZeros(pattern) / 8;
+            offset += bytes;
 
             int len = (int) (offset - start);
             station.offset = start;
             station.len = len;
             station.hash = 0;
+            station.tail = next & ((1L << (8 * bytes)) - 1);
 
             offset++;
         }
 
-        long parseNumberFast() {
+        int parseNumberFast() {
             long encodedVal = UNSAFE.getLong(offset);
 
-            var len = find(encodedVal, LINE_END_MASK);
-            offset += len + 1;
+            int neg = 1 - Integer.bitCount((int) (encodedVal & 0x10));
+            encodedVal >>>= 8 * neg;
+
+            var len = applyPattern(encodedVal, DOT_MASK);
+            len = Long.numberOfTrailingZeros(len) / 8;
 
             encodedVal ^= broadcast((byte) 0x30);
 
-            long c0 = len == 4 ? 100 : 10;
-            long c1 = 10 * (len - 3);
-            long c2 = 4 - len;
-            long c3 = len - 3;
-            long a = (encodedVal & 0xFF) * c0;
-            long b = ((encodedVal & 0xFF00) >>> 8) * c1;
-            long c = ((encodedVal & 0xFF0000L) >>> 16) * c2;
-            long d = ((encodedVal & 0xFF000000L) >>> 24) * c3;
+            int intPart = (int) (encodedVal & ((1 << (8 * len)) - 1));
+            intPart <<= 8 * (2 - len);
+            intPart *= (100 * 256 + 10);
+            intPart = (intPart & 0x3FF80) >>> 8;
+
+            int frac = (int) ((encodedVal >>> (8 * (len + 1))) & 0xFF);
 
-            return a + b + c + d;
+            offset += neg + len + 3; // 1 for . + 1 for fractional part + 1 for new line char
+            int sign = 1 - 2 * neg;
+            int val = intPart + frac;
+            return sign * val;
         }
 
-        long parseNumberSlow() {
-            long val = UNSAFE.getByte(offset++) - '0';
+        int parseNumberSlow() {
+            int neg = 1 - Integer.bitCount(UNSAFE.getByte(offset) & 0x10);
+            offset += neg;
+
+            int val = UNSAFE.getByte(offset++) - '0';
             byte b;
             while ((b = UNSAFE.getByte(offset++)) != '.') {
                 val = val * 10 + (b - '0');
@@ -140,22 +136,17 @@ long parseNumberSlow() {
             b = UNSAFE.getByte(offset);
             val = val * 10 + (b - '0');
             offset += 2;
+            val *= 1 - 2 * neg;
             return val;
         }
 
-        long parseNumber() {
-            long val;
-            int neg = 1 - Integer.bitCount(UNSAFE.getByte(offset) & 0x10);
-            offset += neg;
-
-            if (end - offset > 8) {
-                val = parseNumberFast();
+        int parseNumber() {
+            if (end - offset >= 8) {
+                return parseNumberFast();
             }
             else {
-                val = parseNumberSlow();
+                return parseNumberSlow();
             }
-            val *= 1 - 2 * neg;
-            return val;
         }
 
         public TreeMap<String, ResultRow> run() {
@@ -218,6 +209,7 @@ static final class ByteString {
         private long offset;
         private int len = 0;
         private int hash = 0;
+        private long tail = 0L;
 
         ByteString(MemorySegment ms) {
             this.ms = ms;
@@ -235,6 +227,7 @@ public ByteString copy() {
             copy.offset = this.offset;
             copy.len = this.len;
             copy.hash = this.hash;
+            copy.tail = this.tail;
             return copy;
         }
 
@@ -259,19 +252,7 @@ public boolean equals(Object o) {
                     return false;
                 }
             }
-            if (len >= 8) {
-                long l1 = UNSAFE.getLong(offset + len - 8);
-                long l2 = UNSAFE.getLong(that.offset + len - 8);
-                return l1 == l2;
-            }
-            for (; i < len; i++) {
-                byte i1 = UNSAFE.getByte(offset + i);
-                byte i2 = UNSAFE.getByte(that.offset + i);
-                if (i1 != i2) {
-                    return false;
-                }
-            }
-            return true;
+            return this.tail == that.tail;
         }
 
         @Override

From ac26c8b6446d0b1e556e99d68b995075dbb1cd45 Mon Sep 17 00:00:00 2001
From: Yann Moisan <yamo93@gmail.com>
Date: Sat, 20 Jan 2024 20:33:14 +0100
Subject: [PATCH 080/268] Improved version based on rafaelmerino (#511)

* files created by create_fork.sh

* use indexOf

* improved implementation based on rafaelmerino

---------

Co-authored-by: Yann Moisan <yann@zen.ly>
---
 calculate_average_YannMoisan.sh               |  19 ++
 prepare_YannMoisan.sh                         |  20 ++
 .../onebrc/CalculateAverage_YannMoisan.java   | 272 ++++++++++++++++++
 3 files changed, 311 insertions(+)
 create mode 100755 calculate_average_YannMoisan.sh
 create mode 100755 prepare_YannMoisan.sh
 create mode 100644 src/main/java/dev/morling/onebrc/CalculateAverage_YannMoisan.java

diff --git a/calculate_average_YannMoisan.sh b/calculate_average_YannMoisan.sh
new file mode 100755
index 000000000..74552f0c4
--- /dev/null
+++ b/calculate_average_YannMoisan.sh
@@ -0,0 +1,19 @@
+#!/bin/sh
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+JAVA_OPTS=""
+java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_YannMoisan
diff --git a/prepare_YannMoisan.sh b/prepare_YannMoisan.sh
new file mode 100755
index 000000000..4cda7b411
--- /dev/null
+++ b/prepare_YannMoisan.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+# Uncomment below to use sdk
+# source "$HOME/.sdkman/bin/sdkman-init.sh"
+# sdk use java 21.0.1-graal 1>&2
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_YannMoisan.java b/src/main/java/dev/morling/onebrc/CalculateAverage_YannMoisan.java
new file mode 100644
index 000000000..0e9b5cfc4
--- /dev/null
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_YannMoisan.java
@@ -0,0 +1,297 @@
+/*
+ *  Copyright 2023 The original authors
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package dev.morling.onebrc;
+
+import java.io.IOException;
+import java.io.UncheckedIOException;
+import java.nio.ByteBuffer;
+import java.nio.channels.FileChannel;
+import java.nio.channels.FileChannel.MapMode;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Paths;
+import java.nio.file.StandardOpenOption;
+import java.util.*;
+import java.util.concurrent.ForkJoinPool;
+import java.util.function.Supplier;
+import java.util.stream.Stream;
+import java.util.stream.StreamSupport;
+
+/**
+ * Computes min/mean/max per station from {@code ./measurements.txt}: the file is
+ * memory-mapped in line-aligned chunks, parsed in a parallel stream and merged.
+ *
+ * based on imrafaelmerino
+ * ./calculate_average_imrafaelmerino.sh  129.10s user 4.73s system 1395% cpu 9.591 total
+ *
+ * ./calculate_average_baseline.sh  193.27s user 5.81s system 100% cpu 3:17.85 total
+ *
+ * addition to copied implementation
+ * - use a Location object as a key in the Map to avoid String instantiations.
+ * ./calculate_average_YannMoisan.sh  118.36s user 5.72s system 1425% cpu 8.705 total
+ *
+ *  Model Name: MacBook Pro
+ *  Chip: Intel Core i9
+ *  Total Number of Cores: 8
+ *  Memory: 64 GB
+ *  */
+public class CalculateAverage_YannMoisan {
+
+    private static final String FILE = "./measurements.txt";
+    /** Size of the scratch buffer that holds one station name while it is read. */
+    private static final int FIELD_SIZE = 128;
+
+    public static void main(String[] args) throws IOException {
+        var chunkSize = 1024 * 1024 * 50L; // Long.parseLong(args[0].trim());
+        var result = calculateStats(FILE, chunkSize);
+        System.out.println(result);
+    }
+
+    /**
+     * Parses {@code file} chunk by chunk and aggregates the per-chunk results.
+     *
+     * @param file      path of the measurements file
+     * @param chunkSize maximum number of bytes mapped per chunk
+     * @return statistics keyed by station name, sorted by name
+     * @throws IOException if the file cannot be opened or its size cannot be read
+     */
+    private static Map<String, Stat> calculateStats(String file,
+                                                    long chunkSize)
+            throws IOException {
+
+        try (var fileChannel = FileChannel.open(Paths.get(file),
+                StandardOpenOption.READ)) {
+            var stats = fileMemoryStream(fileChannel, chunkSize)
+                    .parallel()
+                    .map(p -> ManagedComputation.compute(() -> parse(p)))
+                    .reduce(Collections.emptyMap(),
+                            (stat1, stat2) -> combine(stat1, stat2));
+
+            var tm = new TreeMap<String, Stat>();
+            // Decode station names as UTF-8 explicitly rather than with the platform default charset.
+            stats.forEach((k, v) -> tm.put(new String(k.value, StandardCharsets.UTF_8), v));
+            return tm;
+        }
+
+    }
+
+    /** Merges two per-chunk results into a new map; neither argument is modified. */
+    private static Map<Location, Stat> combine(Map<Location, Stat> xs,
+                                               Map<Location, Stat> ys) {
+
+        Map<Location, Stat> result = new HashMap<>(xs);
+        ys.forEach((key, stat) -> result.merge(key, stat, Stat::combine));
+        return result;
+
+    }
+
+    /**
+     * Parses every {@code name;value} record between the buffer's position and limit.
+     * Digits are accumulated into an int with any '.' skipped, so a value with one
+     * fractional digit is stored in tenths.
+     */
+    private static Map<Location, Stat> parse(ByteBuffer bb) {
+        Map<Location, Stat> stats = new HashMap<>();
+        var limit = bb.limit();
+        var field = new byte[FIELD_SIZE];
+        while (bb.position() < limit) {
+            // Station name: every byte up to, but excluding, the ';' separator.
+            var fieldCurrentIndex = 0;
+            field[fieldCurrentIndex++] = bb.get();
+            while (bb.position() < limit) {
+                var fieldByte = bb.get();
+                if (fieldByte == ';')
+                    break;
+                field[fieldCurrentIndex++] = fieldByte;
+            }
+            var location = new Location(Arrays.copyOf(field, fieldCurrentIndex));
+            // Measurement: runs to '\n', or to the limit for an unterminated last line.
+            var number = 0;
+            var sign = 1;
+            while (bb.position() < limit) {
+                var numberByte = bb.get();
+                if (numberByte == '-')
+                    sign = -1;
+                else if (numberByte == '\n')
+                    break;
+                else if (numberByte != '.')
+                    number = number * 10 + (numberByte - '0');
+            }
+            stats.computeIfAbsent(location,
+                    k -> new Stat())
+                    .update(sign * number);
+        }
+
+        return stats;
+    }
+
+    /** Wraps the chunk iterator in a sequential stream; the caller decides on parallelism. */
+    private static Stream<ByteBuffer> fileMemoryStream(FileChannel fileChannel,
+                                                       long chunkSize)
+            throws IOException {
+
+        var spliterator = Spliterators.spliteratorUnknownSize(fileMemoryIterator(fileChannel,
+                chunkSize),
+                Spliterator.IMMUTABLE);
+        return StreamSupport.stream(spliterator,
+                false);
+    }
+
+    /**
+     * Returns an iterator over read-only mappings of at most {@code chunkSize} bytes.
+     * Every chunk but the last is cut after its last '\n' so that no record is split.
+     */
+    private static Iterator<ByteBuffer> fileMemoryIterator(FileChannel fileChannel, long chunkSize) throws IOException {
+        return new Iterator<>() {
+
+            private final long size = fileChannel.size();
+            private long start = 0;
+
+            @Override
+            public boolean hasNext() {
+                return start < size;
+            }
+
+            @Override
+            public ByteBuffer next() {
+                if (!hasNext())
+                    throw new NoSuchElementException();
+                try {
+                    var length = Math.min(chunkSize, size - start);
+                    var buffer = fileChannel.map(MapMode.READ_ONLY, start, length);
+                    var limit = buffer.limit();
+                    // The last chunk is taken whole: scanning back for '\n' there would drop a
+                    // final line without a trailing line break and then run off the buffer start.
+                    if (start + length < size) {
+                        while (limit > 0 && buffer.get(limit - 1) != '\n')
+                            limit--;
+                        if (limit == 0)
+                            throw new IllegalStateException("No line break in the " + length + " bytes at offset " + start);
+                        buffer.limit(limit);
+                    }
+                    start += limit;
+                    return buffer;
+                }
+                catch (IOException ex) {
+                    throw new UncheckedIOException(ex);
+                }
+            }
+        };
+    }
+
+    /** Map key wrapping the raw bytes of a station name; equality and hash are content-based. */
+    private static final class Location {
+        public final byte[] value;
+
+        public Location(byte[] value) {
+            this.value = value;
+        }
+
+        @Override
+        public boolean equals(Object o) {
+            if (this == o)
+                return true;
+            if (o == null || getClass() != o.getClass())
+                return false;
+            Location location = (Location) o;
+            return Arrays.equals(value, location.value);
+        }
+
+        @Override
+        public int hashCode() {
+            return Arrays.hashCode(value);
+        }
+    }
+
+    /** Running min, max, sum and count of the integer values of one station. */
+    private static final class Stat {
+
+        private int min = Integer.MAX_VALUE;
+        private int max = Integer.MIN_VALUE;
+        private long sum = 0L;
+        private long count = 0L;
+
+        /** Returns a new Stat covering the values of both arguments. */
+        public static Stat combine(Stat m1,
+                                   Stat m2) {
+            var stat = new Stat();
+            stat.min = Math.min(m1.min, m2.min);
+            stat.max = Math.max(m1.max, m2.max);
+            stat.sum = m1.sum + m2.sum;
+            stat.count = m1.count + m2.count;
+            return stat;
+        }
+
+        private void update(int value) {
+            this.min = Math.min(this.min, value);
+            this.max = Math.max(this.max, value);
+            this.sum += value;
+            this.count++;
+        }
+
+        /** Formats as {@code min/mean/max}, scaling by 1/10 and rounding to one decimal. */
+        @Override
+        public String toString() {
+            return round(min / 10.0) + "/" + round((sum / 10.0) / count) + "/" + round(max / 10.0);
+        }
+
+        private double round(double value) {
+            return Math.round(value * 10.0) / 10.0;
+        }
+    }
+
+    /** Runs a supplier through {@link ForkJoinPool#managedBlock} and returns its result. */
+    private static final class ManagedComputation {
+        static <T> T compute(final Supplier<T> supplier) {
+            var managedBlocker = new ManagedSupplier<>(supplier);
+            try {
+                ForkJoinPool.managedBlock(managedBlocker);
+                return managedBlocker.getResult();
+            }
+            catch (InterruptedException e) {
+                Thread.currentThread().interrupt();
+                throw new RuntimeException(e);
+            }
+
+        }
+
+        private static class ManagedSupplier<T> implements ForkJoinPool.ManagedBlocker {
+            private final Supplier<T> task;
+            private T result;
+            private boolean isDone = false;
+
+            private ManagedSupplier(final Supplier<T> supplier) {
+                task = supplier;
+            }
+
+            @Override
+            public boolean block() {
+                result = task.get();
+                isDone = true;
+                return true;
+            }
+
+            @Override
+            public boolean isReleasable() {
+                return isDone;
+            }
+
+            T getResult() {
+                return result;
+            }
+        }
+
+    }
+}

From f49a92019e817a25cbae7f8bb5f3f1d48693b841 Mon Sep 17 00:00:00 2001
From: karthikeyan97 <skarthikeyan046@gmail.com>
Date: Sun, 21 Jan 2024 01:19:54 +0530
Subject: [PATCH 081/268] using unsafe alone (#512)

* final commit

changing using mappedbytebuffer

changes before using unsafe address

using unsafe

* using graalvm,correct unsafe mem implementation

---------

Co-authored-by: Karthikeyans <karthikeyan.sn@zohocorp.com>
---
 calculate_average_karthikeyan97.sh            |  12 +-
 .../CalculateAverage_karthikeyan97.java       | 209 +++++++++---------
 2 files changed, 113 insertions(+), 108 deletions(-)

diff --git a/calculate_average_karthikeyan97.sh b/calculate_average_karthikeyan97.sh
index a6bd728d1..bbad1c4d0 100755
--- a/calculate_average_karthikeyan97.sh
+++ b/calculate_average_karthikeyan97.sh
@@ -16,4 +16,14 @@
 #
 
 JAVA_OPTS="-Xms20480m -Xmx40960m "
-java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_karthikeyan97
+
+if [ -f target/CalculateAverage_karthikeyan97_image ]; then
+    #echo "Picking up existing native image 'target/CalculateAverage_karthikeyan97_image', delete the file to select JVM mode." 1>&2
+    target/CalculateAverage_karthikeyan97_image -Xms20480m -Xmx32768m
+else
+    JAVA_OPTS="--enable-preview"
+    #echo "Choosing to run the app in JVM mode as no native image was found, use prepare_karthikeyan97.sh to generate." 1>&2
+    java -Xms20480m -Xmx32768m --enable-preview --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_karthikeyan97
+fi
+
+
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_karthikeyan97.java b/src/main/java/dev/morling/onebrc/CalculateAverage_karthikeyan97.java
index c17e92797..7014b120b 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_karthikeyan97.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_karthikeyan97.java
@@ -22,9 +22,12 @@
 import java.io.FileInputStream;
 
 import java.io.RandomAccessFile;
+import java.lang.foreign.Arena;
 import java.lang.reflect.Field;
 import java.nio.ByteBuffer;
+import java.nio.MappedByteBuffer;
 import java.nio.channels.FileChannel;
+import java.nio.charset.Charset;
 import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
 import java.util.Arrays;
@@ -33,6 +36,7 @@
 import java.util.LinkedList;
 import java.util.List;
 import java.util.Map;
+import java.util.Scanner;
 import java.util.Set;
 import java.util.TreeMap;
 import java.util.function.BiConsumer;
@@ -44,8 +48,21 @@
 
 public class CalculateAverage_karthikeyan97 {
 
+    private static final Unsafe UNSAFE = initUnsafe();
+
     private static final String FILE = "./measurements.txt";
 
+    private static Unsafe initUnsafe() {
+        try {
+            Field theUnsafe = Unsafe.class.getDeclaredField("theUnsafe");
+            theUnsafe.setAccessible(true);
+            return (Unsafe) theUnsafe.get(Unsafe.class);
+        }
+        catch (NoSuchFieldException | IllegalAccessException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
     private record Measurement(modifiedbytearray station, double value) {
     }
 
@@ -53,18 +70,18 @@ private record customPair(String stationName, MeasurementAggregator agg) {
     }
 
     private static class MeasurementAggregator {
-        private double min = Double.POSITIVE_INFINITY;
-        private double max = Double.NEGATIVE_INFINITY;
+        private long min = Long.MAX_VALUE;
+        private long max = Long.MIN_VALUE;
         private long sum;
         private long count;
 
         public String toString() {
             return new StringBuffer(14)
-                    .append(round(min))
+                    .append(round((1.0 * min)))
                     .append("/")
                     .append(round((1.0 * sum) / count))
                     .append("/")
-                    .append(round(max)).toString();
+                    .append(round((1.0 * max))).toString();
         }
 
         private double round(double value) {
@@ -74,7 +91,7 @@ private double round(double value) {
 
     public static void main(String[] args) throws Exception {
         // long start = System.nanoTime();
-        System.setSecurityManager(null);
+        // System.setSecurityManager(null);
         Collector<Map.Entry<modifiedbytearray, MeasurementAggregator>, MeasurementAggregator, MeasurementAggregator> collector = Collector.of(
                 MeasurementAggregator::new,
                 (a, m) -> {
@@ -103,15 +120,17 @@ public static void main(String[] args) throws Exception {
                 },
                 agg -> agg);
 
-        RandomAccessFile raf = new RandomAccessFile(FILE, "rw");
+        RandomAccessFile raf = new RandomAccessFile(FILE, "r");
+        FileChannel fileChannel = raf.getChannel();
+        final long mappedAddress = fileChannel.map(FileChannel.MapMode.READ_ONLY, 0, raf.length(), Arena.global()).address();
         long length = raf.length();
-        int cores = length > 1000 ? Runtime.getRuntime().availableProcessors() : 1;
+        final long endAddress = mappedAddress + length - 1;
+        int cores = length > 1000 ? Runtime.getRuntime().availableProcessors() * 2 : 1;
         long boundary[][] = new long[cores][2];
         long segments = length / (cores);
         long before = -1;
         for (int i = 0; i < cores - 1; i++) {
             boundary[i][0] = before + 1;
-            byte[] b = new byte[107];
             if (before + segments - 107 > 0) {
                 raf.seek(before + segments - 107);
             }
@@ -130,120 +149,92 @@ public static void main(String[] args) throws Exception {
         f.setAccessible(true);
         Unsafe unsafe = (Unsafe) f.get(null);
 
-        int pageSize = unsafe.pageSize() * 10;
+        int l3Size = (13 * 1024 * 1024);// unsafe.l3Size();
 
         System.out.println(new TreeMap((Arrays.stream(boundary).parallel().map(i -> {
             FileInputStream fileInputStream = null;
             try {
-                fileInputStream = new FileInputStream(FILE);
-                FileChannel fileChannel = fileInputStream.getChannel();
-                HashMap<modifiedbytearray, MeasurementAggregator> resultmap = new HashMap<>(12000, 100);
-
-                ByteBuffer buffer = ByteBuffer.allocateDirect(pageSize);
-
-                fileChannel.position(i[0]);
-                int bytesReading = 0;
-                double num = 0;
+                int seglen = (int) (i[1] - i[0] + 1);
+                HashMap<modifiedbytearray, MeasurementAggregator> resultmap = new HashMap<>(1000);
+                long segstart = mappedAddress + i[0];
+                int bytesRemaining = seglen;
+                long num = 0;
                 int sign = 1;
                 boolean isNumber = false;
                 byte bi;
                 modifiedbytearray stationName = null;
-                int hascode = 1;
-                int ctr = 0;
-                byte[] arr = new byte[100];
-                int arrptr = 0;
-                int seglen = (int) (i[1] - i[0] + 1);
-                while (bytesReading < seglen) {
-                    buffer.clear();
-                    int bytesRead = fileChannel.read(buffer);
-                    if ((bytesReading + bytesRead) <= seglen) {
-                        if (bytesRead < 0) {
-                            bytesRead = 0;
-                        }
-                    }
-                    else {
-                        bytesRead = (seglen - bytesReading);
-                    }
-                    buffer.flip();
+                int hascode = 5381;
+                while (bytesRemaining > 0) {
                     int bytesptr = 0;
-                    byte[] bufferArr = new byte[bytesRead];
-                    buffer.get(bufferArr);
-                    while (bytesptr < bytesRead) {
-                        bytesReading += 1;
-                        bi = bufferArr[bytesptr++];
-                        if (ctr > 0) {
-                            arr[arrptr++] = bi;
-                            hascode = 31 * hascode + bi;
-                            ctr--;
-                        }
-                        else {
-                            if (bi >= 240) {
-                                arr[arrptr++] = bi;
-                                hascode = 31 * hascode + bi;
-                                ctr = 3;
-                            }
-                            else if (bi >= 224) {
-                                arr[arrptr++] = bi;
-                                hascode = 31 * hascode + bi;
-                                ctr = 2;
-                            }
-                            else if (bi >= 192) {
-                                arr[arrptr++] = bi;
-                                hascode = 31 * hascode + bi;
-                                ctr = 1;
+                    // int bytesread = buffer.remaining() > l3Size ? l3Size : buffer.remaining();
+                    // byte[] bufferArr = new byte[bytesread];
+                    // buffer.get(bufferArr);
+                    int bbstart = 0;
+                    int readSize = bytesRemaining > l3Size ? l3Size : bytesRemaining;
+                    int actualReadSize = (segstart + readSize + 110 > endAddress || readSize + 110 > i[1]) ? readSize : readSize + 110;
+                    byte[] readArr = new byte[actualReadSize];
+
+                    UNSAFE.copyMemory(null, segstart, readArr, UNSAFE.ARRAY_BYTE_BASE_OFFSET, actualReadSize);
+                    while (bytesptr < actualReadSize) {
+                        bi = readArr[bytesptr++];// UNSAFE.getByte(segstart + bytesReading++);
+                        if (!isNumber) {
+                            if (bi >= 192) {
+                                hascode = (hascode << 5) + hascode ^ bi;
                             }
                             else if (bi == 59) {
                                 isNumber = true;
-                                stationName = new modifiedbytearray(arr, arrptr, hascode);
-                                arr = new byte[100];
-                                arrptr = 0;
-                                hascode = 1;
-                            }
-                            else if (bi == 10) {
-                                hascode = 1;
-                                isNumber = false;
-                                MeasurementAggregator agg = resultmap.get(stationName);
-                                num *= sign;
-                                if (agg == null) {
-                                    agg = new MeasurementAggregator();
-                                    agg.min = num;
-                                    agg.max = num;
-                                    agg.sum = (long) (num);
-                                    agg.count = 1;
-                                    resultmap.put(stationName, agg);
+                                stationName = new modifiedbytearray(readArr, bbstart, bytesptr - 2, hascode & 0xFFFFFFFF);
+                                bbstart = 0;
+                                hascode = 5381;
+                                if (bytesptr >= readSize) {
+                                    break;
                                 }
-                                else {
-                                    if (agg.min >= num) {
+                            }
+                            else {
+                                hascode = (hascode << 5) + hascode ^ bi;
+                            }
+                        }
+                        else {
+                            switch (bi) {
+                                case 0x2E:
+                                    break;
+                                case 0x2D:
+                                    sign = -1;
+                                    break;
+                                case 10:
+                                    hascode = 5381;
+                                    isNumber = false;
+                                    bbstart = bytesptr;
+                                    MeasurementAggregator agg = resultmap.get(stationName);
+                                    num *= sign;
+                                    if (agg == null) {
+                                        agg = new MeasurementAggregator();
                                         agg.min = num;
-                                    }
-                                    if (agg.max <= num) {
                                         agg.max = num;
+                                        agg.sum = (long) (num);
+                                        agg.count = 1;
+                                        resultmap.put(stationName, agg);
                                     }
-                                    agg.sum += (long) (num);
-                                    agg.count++;
-                                }
-                                num = 0;
-                                sign = 1;
-                            }
-                            else {
-                                hascode = 31 * hascode + bi;
-                                if (isNumber) {
-                                    switch (bi) {
-                                        case 0x2E:
-                                            break;
-                                        case 0x2D:
-                                            sign = -1;
-                                            break;
-                                        default:
-                                            num = num * 10 + (bi - 0x30);
+                                    else {
+                                        if (agg.min >= num) {
+                                            agg.min = num;
+                                        }
+                                        if (agg.max <= num) {
+                                            agg.max = num;
+                                        }
+                                        agg.sum += (long) (num);
+                                        agg.count++;
                                     }
-                                }
-                                else {
-                                    arr[arrptr++] = bi;
-                                }
+                                    num = 0;
+                                    sign = 1;
+                                    break;
+                                default:
+                                    num = num * 10 + (bi - 0x30);
                             }
                         }
                     }
+                    bytesRemaining -= bytesptr;
+                    segstart += bytesptr;
                 }
                 /*
                  * while (bytesReading < (i[1] - i[0] + 1) && buffer.position() < buffer.limit()) {
@@ -335,7 +326,7 @@ public Object put(Object key, Object value) {
          */
         // Get the FileChannel from the FileInputStream
 
-        // System.out.println("time taken:" + (System.nanoTime() - start) / 1000000);
+        // System.out.println("time taken1:" + (System.nanoTime() - start) / 1000000);
         // System.out.println(measurements);
     }
 
@@ -343,17 +334,21 @@ public Object put(Object key, Object value) {
 
 class modifiedbytearray {
     private int length;
+    private int start;
+    private int end;
     private byte[] arr;
     public int hashcode;
 
-    modifiedbytearray(byte[] arr, int length, int hashcode) {
+    modifiedbytearray(byte[] arr, int start, int end, int hashcode) {
         this.arr = arr;
-        this.length = length;
+        this.length = end - start + 1;
+        this.end = end;
+        this.start = start;
         this.hashcode = hashcode;
     }
 
     public String getStationName() {
-        return new String(this.getArr(), 0, length, StandardCharsets.UTF_8);
+        return new String(this.getArr(), start, length, StandardCharsets.UTF_8);
     }
 
     public byte[] getArr() {
@@ -368,7 +363,7 @@ public String toString() {
     @Override
     public boolean equals(Object obj) {
         modifiedbytearray b = (modifiedbytearray) obj;
-        return Arrays.equals(this.getArr(), 0, length, b.arr, 0, b.length);
+        return Arrays.equals(this.getArr(), start, end, b.arr, b.start, b.end);
     }
 
     public int getHashcode() {

From f06de5faaba171e2c65ce503c45828c0d6aed32c Mon Sep 17 00:00:00 2001
From: Shivam Agarwal <24753115+0xshivamagarwal@users.noreply.github.com>
Date: Sun, 21 Jan 2024 01:24:04 +0530
Subject: [PATCH 082/268] Add 0xshivamagarwal Implementation (#508)

* 0xshivamagarwal implementation

* .

---------

Co-authored-by: Shivam Agarwal <>
---
 calculate_average_0xshivamagarwal.sh          |  22 +++
 .../CalculateAverage_0xshivamagarwal.java     | 137 ++++++++++++++++++
 2 files changed, 159 insertions(+)
 create mode 100755 calculate_average_0xshivamagarwal.sh
 create mode 100644 src/main/java/dev/morling/onebrc/CalculateAverage_0xshivamagarwal.java

diff --git a/calculate_average_0xshivamagarwal.sh b/calculate_average_0xshivamagarwal.sh
new file mode 100755
index 000000000..32298fb31
--- /dev/null
+++ b/calculate_average_0xshivamagarwal.sh
@@ -0,0 +1,23 @@
+#!/bin/sh
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+JAVA_OPTS="--enable-preview -XX:+UnlockExperimentalVMOptions -XX:+TrustFinalNonStaticFields -dsa -XX:+UseNUMA"
+# POSIX test instead of the bash-only [[ ]], so the check also works when /bin/sh is not bash.
+if [ "$(uname -s)" != "Darwin" ]; then
+    JAVA_OPTS="$JAVA_OPTS -XX:+UseTransparentHugePages"
+fi
+java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_0xshivamagarwal
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_0xshivamagarwal.java b/src/main/java/dev/morling/onebrc/CalculateAverage_0xshivamagarwal.java
new file mode 100644
index 000000000..77a04bf4d
--- /dev/null
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_0xshivamagarwal.java
@@ -0,0 +1,172 @@
+/*
+ *  Copyright 2023 The original authors
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package dev.morling.onebrc;
+
+import static java.lang.foreign.ValueLayout.JAVA_BYTE;
+import static java.nio.file.StandardOpenOption.READ;
+
+import java.io.IOException;
+import java.lang.foreign.Arena;
+import java.lang.foreign.MemorySegment;
+import java.nio.channels.FileChannel;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Path;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.stream.Collectors;
+import java.util.stream.IntStream;
+
+/**
+ * Computes min/mean/max per station from {@code ./measurements.txt}. The file is mapped as
+ * one memory segment, split into one line-aligned chunk per available processor, and the
+ * per-chunk maps are merged and printed sorted by station name.
+ */
+public class CalculateAverage_0xshivamagarwal {
+    private static final Path FILE = Path.of("./measurements.txt");
+    private static final byte SEMICOLON = ';';
+    private static final byte NEW_LINE = '\n';
+    private static final byte HYPHEN = '-';
+    private static final byte DOT = '.';
+    private static final int NO_OF_THREADS = Runtime.getRuntime().availableProcessors();
+
+    /**
+     * Folds {@code v2} into {@code v1} and returns {@code v1}.
+     * Both arrays hold {@code {min, max, sum, count}}.
+     */
+    private static long[] mergeFn(final long[] v1, final long[] v2) {
+        v1[0] = Math.min(v1[0], v2[0]);
+        v1[1] = Math.max(v1[1], v2[1]);
+        v1[2] += v2[2];
+        v1[3] += v2[3];
+        return v1;
+    }
+
+    /** Formats one entry as {@code name=min/mean/max}; stored values are scaled back by 1/10. */
+    private static String toString(final Map.Entry<String, long[]> entry) {
+        var m = entry.getValue();
+
+        return entry.getKey()
+                + '='
+                + m[0] / 10.0
+                + '/'
+                + Math.round(1.0 * m[2] / m[3]) / 10.0
+                + '/'
+                + m[1] / 10.0;
+    }
+
+    /** Merges one finished measurement into {@code map}; the name is decoded as UTF-8. */
+    private static void addMeasurement(final Map<String, long[]> map, final byte[] key, final int len, final int val) {
+        map.merge(
+                new String(key, 0, len, StandardCharsets.UTF_8),
+                new long[]{ val, val, val, 1 },
+                CalculateAverage_0xshivamagarwal::mergeFn);
+    }
+
+    /**
+     * Parses the {@code name;value} records in the byte range {@code [offset, limit)}.
+     * Digits are accumulated into an int with any '.' skipped.
+     *
+     * @param data   mapped file contents
+     * @param offset first byte to read; main passes the start of a record
+     * @param limit  end of the range, exclusive
+     * @return map from station name to {@code {min, max, sum, count}}
+     */
+    private static Map<String, long[]> parseData(
+                                                 final MemorySegment data, long offset, final long limit) {
+        var map = new HashMap<String, long[]>(10000, 1);
+        var sep = false;
+        var neg = false;
+        var key = new byte[100];
+        var len = 0;
+        var val = 0;
+
+        while (offset < limit) {
+            var b = data.get(JAVA_BYTE, offset++);
+            if (sep) {
+                if (b == NEW_LINE) {
+                    addMeasurement(map, key, len, neg ? -val : val);
+                    sep = false;
+                    neg = false;
+                    len = 0;
+                    val = 0;
+                }
+                else if (b == HYPHEN) {
+                    neg = true;
+                }
+                else if (b != DOT) {
+                    val = val * 10 + (b - 48);
+                }
+            }
+            else if (b == SEMICOLON) {
+                sep = true;
+            }
+            else {
+                key[len++] = b;
+            }
+        }
+
+        // A range that ends inside the value part holds a final record without a
+        // trailing line break; keep it instead of silently dropping it.
+        if (sep) {
+            addMeasurement(map, key, len, neg ? -val : val);
+        }
+
+        return map;
+    }
+
+    /** Maps the file, parses the chunks in a parallel stream and prints the merged, sorted result. */
+    public static void main(String[] args) throws IOException {
+        final String result;
+
+        try (var channel = FileChannel.open(FILE, READ);
+                var arena = Arena.ofShared()) {
+            var data = channel.map(FileChannel.MapMode.READ_ONLY, 0, channel.size(), arena);
+            var size = data.byteSize();
+            var chunkSize = size / NO_OF_THREADS;
+            // chunks[i] is the start of chunk i; chunks[0] stays 0 and the last entry is the end.
+            var chunks = new long[NO_OF_THREADS + 1];
+            chunks[NO_OF_THREADS] = size;
+
+            for (int i = 1; i < NO_OF_THREADS; ++i) {
+                var chunkPos = i * chunkSize;
+
+                // Advance to just past the next line break, but never read beyond the end of
+                // the mapping (e.g. when the last line has no trailing line break).
+                while (chunkPos < size && data.get(JAVA_BYTE, chunkPos++) != NEW_LINE) {
+                }
+
+                chunks[i] = chunkPos;
+            }
+
+            result = IntStream.range(0, NO_OF_THREADS)
+                    .mapToObj(i -> parseData(data, chunks[i], chunks[i + 1]))
+                    .parallel()
+                    .reduce(
+                            (m1, m2) -> {
+                                m2.forEach((k, v) -> m1.merge(k, v, CalculateAverage_0xshivamagarwal::mergeFn));
+                                return m1;
+                            })
+                    .map(
+                            map -> map.entrySet().parallelStream()
+                                    .sorted(Map.Entry.comparingByKey())
+                                    .map(CalculateAverage_0xshivamagarwal::toString)
+                                    .collect(Collectors.joining(", ", "{", "}")))
+                    .orElse(null);
+        }
+
+        System.out.println(result);
+    }
+}

From 2c1264def909a5256713c21d808cde2124327e23 Mon Sep 17 00:00:00 2001
From: giovannicuccu <giovanni.cuccu@gmail.com>
Date: Sat, 20 Jan 2024 21:01:43 +0100
Subject: [PATCH 083/268] Solution without unsafe (#507)

Co-authored-by: Giovanni Cuccu <gcuccu@imolainformatica.it>
---
 calculate_average_giovannicuccu.sh            |  19 +
 prepare_giovannicuccu.sh                      |  20 +
 .../CalculateAverage_giovannicuccu.java       | 421 ++++++++++++++++++
 3 files changed, 460 insertions(+)
 create mode 100644 calculate_average_giovannicuccu.sh
 create mode 100644 prepare_giovannicuccu.sh
 create mode 100644 src/main/java/dev/morling/onebrc/CalculateAverage_giovannicuccu.java

diff --git a/calculate_average_giovannicuccu.sh b/calculate_average_giovannicuccu.sh
new file mode 100644
index 000000000..314b5d8a7
--- /dev/null
+++ b/calculate_average_giovannicuccu.sh
@@ -0,0 +1,19 @@
+#!/bin/sh
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+JAVA_OPTS=""
+java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_giovannicuccu
diff --git a/prepare_giovannicuccu.sh b/prepare_giovannicuccu.sh
new file mode 100644
index 000000000..4cda7b411
--- /dev/null
+++ b/prepare_giovannicuccu.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+# Uncomment below to use sdk
+# source "$HOME/.sdkman/bin/sdkman-init.sh"
+# sdk use java 21.0.1-graal 1>&2
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_giovannicuccu.java b/src/main/java/dev/morling/onebrc/CalculateAverage_giovannicuccu.java
new file mode 100644
index 000000000..7b549dc06
--- /dev/null
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_giovannicuccu.java
@@ -0,0 +1,421 @@
+/*
+ *  Copyright 2023 The original authors
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package dev.morling.onebrc;
+
+import static java.util.stream.Collectors.*;
+
+import java.io.IOException;
+import java.io.RandomAccessFile;
+import java.nio.ByteOrder;
+import java.nio.MappedByteBuffer;
+import java.nio.channels.FileChannel;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.nio.file.StandardOpenOption;
+import java.util.*;
+import java.util.concurrent.*;
+
+/*
+ Solution without Unsafe that borrows ideas from spullara, thomaswue and royvanrijn
+ */
+
+public class CalculateAverage_giovannicuccu {
+
+    private static final String FILE = "./measurements.txt";
+
+    public static record PartitionBoundary(long start, long end) {
+    }
+
+    public static interface PartitionCalculator {
+        PartitionBoundary[] computePartitionsBoundaries(Path path);
+    }
+
+    public static class ProcessorPartitionCalculator implements PartitionCalculator {
+
+        public PartitionBoundary[] computePartitionsBoundaries(Path path) {
+            long fileSize = path.toFile().length();
+            // Each partition is later mapped as one MappedByteBuffer (at most Integer.MAX_VALUE bytes), so use more partitions
+            // than cores when the file demands it; 256 bytes of slack absorb the extension to the next line end (lines assumed shorter).
+            int numberOfSegments = (int) Math.max(Runtime.getRuntime().availableProcessors(), fileSize / (Integer.MAX_VALUE - 256) + 1);
+            long segmentSize = fileSize / numberOfSegments;
+            PartitionBoundary[] segmentBoundaries = new PartitionBoundary[numberOfSegments];
+            try (RandomAccessFile randomAccessFile = new RandomAccessFile(path.toFile(), "r")) {
+                long segStart = 0;
+                long segEnd = segmentSize;
+                for (int i = 0; i < numberOfSegments; i++) {
+                    segEnd = findEndSegment(randomAccessFile, segEnd, fileSize);
+                    segmentBoundaries[i] = new PartitionBoundary(segStart, segEnd);
+                    segStart = segEnd;
+                    segEnd = Math.min(segEnd + segmentSize, fileSize);
+                }
+                return segmentBoundaries;
+            }
+            catch (IOException e) {
+                throw new RuntimeException(e);
+            }
+        }
+
+        private long findEndSegment(RandomAccessFile raf, long location, long fileSize) throws IOException {
+            // Offset just past the first line feed (byte 10) at or after location; without one, max(location, fileSize).
+            raf.seek(location);
+            long cursor = location;
+            while (cursor < fileSize && raf.read() != 10) {
+                cursor++;
+            }
+            return cursor < fileSize ? cursor + 1 : cursor;
+        }
+    }
+
+    public static class MeasurementAggregator {
+        private final int hash;
+        private int min;
+        private int max;
+        private double sum;
+        private long count;
+        private final byte[] station;
+        private final int offset;
+        private final String name;
+
+        private final long[] data;
+        private final int dataOffset;
+
+        public MeasurementAggregator(byte[] station, int offset, int hash, int initialValue, long[] data, int dataOffset) {
+            min = initialValue;
+            max = initialValue;
+            sum = initialValue;
+            count = 1;
+            this.station = station;
+            this.offset = offset;
+            this.hash = hash;
+            this.data = data;
+            this.dataOffset = dataOffset;
+            this.name = new String(station, 0, offset, StandardCharsets.UTF_8);
+        }
+
+        public MeasurementAggregator(byte[] station, int offset, int hash, int initialValue) {
+            min = initialValue;
+            max = initialValue;
+            sum = initialValue;
+            count = 1;
+            this.station = station;
+            this.offset = offset;
+            this.hash = hash;
+            this.data = new long[0];
+            this.dataOffset = 0;
+            this.name = new String(station, 0, offset, StandardCharsets.UTF_8);
+        }
+
+        public boolean hasSameStation(byte[] stationIn, int offsetIn) {
+            return Arrays.equals(stationIn, 0, offsetIn, station, 0, offset);
+        }
+
+        public boolean hasSameStation(long[] dataIn, int offsetIn) {
+            return Arrays.equals(dataIn, 0, offsetIn, data, 0, dataOffset);
+        }
+
+        public void add(int value) {
+            if (value < min) {
+                min = value;
+            }
+            if (value > max) {
+                max = value;
+            }
+            sum += value;
+            count++;
+        }
+
+        public void merge(MeasurementAggregator other) {
+            // System.out.println("min=" +min + " other min=" +other.min);
+            min = Math.min(min, other.min);
+            max = Math.max(max, other.max);
+            sum += other.sum;
+            count += other.count;
+        }
+
+        @Override
+        public String toString() {
+            return round((double) min / 10) + "/" + round((sum / (double) count) / 10) + "/" + round((double) max / 10);
+        }
+
+        private double round(double value) {
+            return Math.round(value * 10.0) / 10.0;
+        }
+
+        public int getMin() {
+            return min;
+        }
+
+        public int getHash() {
+            return hash;
+        }
+
+        public String getName() {
+            return name;
+        }
+
+        public byte[] getStation() {
+            return station;
+        }
+
+        public int getOffset() {
+            return offset;
+        }
+
+        public long[] getData() {
+            return data;
+        }
+
+    }
+
+    public static class MeasurementList {
+
+        private static final int SIZE = 1024 * 64;
+        private final MeasurementAggregator[] measurements = new MeasurementAggregator[SIZE];
+
+        public void add(byte[] station, int offset, int hash, int value) {
+            int index = hash & (SIZE - 1);
+            if (measurements[index] == null) {
+                measurements[index] = new MeasurementAggregator(station.clone(), offset, hash, value);
+            }
+            else {
+                if (measurements[index].hasSameStation(station, offset)) {
+                    measurements[index].add(value);
+                }
+                else {
+                    while (measurements[index] != null && !measurements[index].hasSameStation(station, offset)) {
+                        index = (index + 1) & (SIZE - 1);
+                    }
+                    if (measurements[index] == null) {
+                        measurements[index] = new MeasurementAggregator(station.clone(), offset, hash, value);
+                    }
+                    else {
+                        measurements[index].add(value);
+                    }
+                }
+            }
+        }
+
+        public void merge(MeasurementAggregator measurementAggregator) {
+            int index = (measurementAggregator.getHash() & (SIZE - 1));
+            if (measurements[index] == null) {
+                measurements[index] = measurementAggregator;
+            }
+            else {
+                while (measurements[index] != null && !measurements[index].hasSameStation(measurementAggregator.getStation(), measurementAggregator.getOffset())) {
+                    index = (index + 1) & (SIZE - 1);
+                }
+                if (measurements[index] == null) {
+                    measurements[index] = measurementAggregator;
+                }
+                else {
+                    measurements[index].merge(measurementAggregator);
+                }
+            }
+        }
+
+        public MeasurementAggregator[] getMeasurements() {
+            return measurements;
+        }
+    }
+
+    public static class MMapReader {
+        private final Path path;
+        private final PartitionBoundary[] boundaries;
+
+        private final boolean serial;
+
+        public MMapReader(Path path, PartitionCalculator partitionCalculator, boolean serial) {
+            this.path = path;
+            this.serial = serial;
+            boundaries = partitionCalculator.computePartitionsBoundaries(path);
+        }
+
+        public TreeMap<String, MeasurementAggregator> elaborate() {
+            // Builds one task per partition. In serial mode each task runs to completion on the calling
+            // thread before the next one is created; otherwise every task is handed to the pool.
+            try (ExecutorService executor = Executors.newFixedThreadPool(boundaries.length)) {
+                List<Future<MeasurementList>> pending = new ArrayList<>();
+                for (PartitionBoundary boundary : boundaries) {
+                    Callable<MeasurementList> task = () -> computeListForPartition(boundary.start(), boundary.end());
+                    if (!serial) {
+                        pending.add(executor.submit(task));
+                    }
+                    else {
+                        FutureTask<MeasurementList> inline = new FutureTask<>(task);
+                        inline.run();
+                        pending.add(inline);
+                    }
+                }
+                return reduce(pending);
+            }
+        }
+
+        private TreeMap<String, MeasurementAggregator> reduce(List<Future<MeasurementList>> futures) {
+            // Waits for every partition in list order, merges the partial tables into one and
+            // returns its entries sorted by station name.
+            try {
+                MeasurementList ris = new MeasurementList();
+                for (Future<MeasurementList> future : futures) {
+                    merge(ris, future.get());
+                }
+                return Arrays.stream(ris.getMeasurements())
+                        .filter(Objects::nonNull)
+                        .collect(toMap(MeasurementAggregator::getName, m -> m, (a, b) -> b, TreeMap::new));
+            }
+            catch (InterruptedException | ExecutionException ie) {
+                if (ie instanceof InterruptedException) {
+                    Thread.currentThread().interrupt(); // restore the flag so callers can still observe it
+                }
+                System.err.println(ie);
+                throw new RuntimeException(ie);
+            }
+        }
+
+        private void merge(MeasurementList result, MeasurementList partial) {
+            // Folds each occupied slot of the partial table into result; empty (null) slots are skipped.
+            // Slots are visited in table order, exactly as a plain loop over the array would.
+            Arrays.stream(partial.getMeasurements())
+                    .filter(Objects::nonNull)
+                    .forEachOrdered(result::merge);
+        }
+
+        /**
+         * Maps the byte range [start, end) of the file and aggregates every "station;value" line found in it.
+         * An I/O failure is rethrown rather than reported on the console and answered with an incomplete table.
+         */
+        private MeasurementList computeListForPartition(long start, long end) {
+            MeasurementList list = new MeasurementList();
+            try {
+                try (FileChannel fileChannel = (FileChannel) Files.newByteChannel((path), StandardOpenOption.READ)) {
+                    MappedByteBuffer mappedByteBuffer = fileChannel.map(FileChannel.MapMode.READ_ONLY, start, end - start);
+                    mappedByteBuffer.order(ByteOrder.LITTLE_ENDIAN);
+                    int limit = mappedByteBuffer.limit();
+                    int startLine;
+                    byte[] stationb = new byte[100];
+                    while ((startLine = mappedByteBuffer.position()) < limit - 110) {
+                        int currentPosition = startLine;
+                        byte b = 0;
+                        int i = 0;
+                        int hash = 0;
+                        // Copy the station name up to ';' while folding its bytes into a 31-based hash.
+                        while ((b = mappedByteBuffer.get(currentPosition++)) != ';') {
+                            stationb[i++] = b;
+                            hash = 31 * hash + b;
+                        }
+                        if (hash < 0) {
+                            hash = -hash;
+                        }
+                        // Read the 8 bytes after ';' as one little-endian word and decode the number from it.
+                        long numberWord = mappedByteBuffer.getLong(currentPosition);
+                        int decimalSepPos = Long.numberOfTrailingZeros(~numberWord & 0x10101000);
+                        int value = convertIntoNumber(decimalSepPos, numberWord);
+                        mappedByteBuffer.position(currentPosition + (decimalSepPos >>> 3) + 3);
+                        list.add(stationb, i, hash, value);
+                    }
+                    while ((startLine = mappedByteBuffer.position()) < limit) {
+                        int currentPosition = startLine;
+                        byte b = 0;
+                        int i = 0;
+                        int hash = 0;
+                        while ((b = mappedByteBuffer.get(currentPosition++)) != ';') {
+                            stationb[i++] = b;
+                            hash = 31 * hash + b;
+                        }
+                        if (hash < 0) {
+                            hash = -hash;
+                        }
+                        // Close to the end of the buffer an 8-byte read would overrun it, so parse byte by byte there.
+                        int value = 0;
+                        if (currentPosition <= limit - 8) {
+                            long numberWord = mappedByteBuffer.getLong(currentPosition);
+                            int decimalSepPos = Long.numberOfTrailingZeros(~numberWord & 0x10101000);
+                            value = convertIntoNumber(decimalSepPos, numberWord);
+                            mappedByteBuffer.position(currentPosition + (decimalSepPos >>> 3) + 3);
+                        }
+                        else {
+                            int sign = 1;
+                            b = mappedByteBuffer.get(currentPosition++);
+                            if (b == '-') {
+                                sign = -1;
+                            }
+                            else {
+                                value = b - '0';
+                            }
+                            while ((b = mappedByteBuffer.get(currentPosition++)) != '.') {
+                                value = value * 10 + (b - '0');
+                            }
+                            b = mappedByteBuffer.get(currentPosition);
+                            value = value * 10 + (b - '0');
+                            if (sign == -1) {
+                                value = -value;
+                            }
+                            mappedByteBuffer.position(currentPosition + 2);
+                        }
+                        list.add(stationb, i, hash, value);
+                    }
+                }
+            }
+            catch (IOException e) {
+                throw new RuntimeException("Failed to read partition [" + start + ", " + end + ") of " + path, e);
+            }
+            return list;
+        }
+
+        private static final ByteOrder BYTE_ORDER = ByteOrder.nativeOrder();
+
+        private static long getLongLittleEndian(long value) {
+            // Unconditionally swaps the byte order of value, whatever the platform's native order is.
+            return Long.reverseBytes(value);
+        }
+
+        private static int convertIntoNumber(int decimalSepPos, long numberWord) {
+            // Decodes a number such as "-12.3", held in the little-endian word numberWord, into tenths (-123).
+            // decimalSepPos is the position of bit 4 of the '.' byte (12, 20 or 28), as computed by the callers.
+            int shift = 28 - decimalSepPos;
+            // signed is -1 if the first byte is '-' (bit 4 clear), 0 if it is a digit (bit 4 set)
+            long signed = (~numberWord << 59) >> 63;
+            long designMask = ~(signed & 0xFF);
+            // Blank out the sign byte, align the digits and turn each ascii code into its digit value
+            long digits = ((numberWord & designMask) << shift) & 0x0F000F0F00L;
+            // Now digits is in the form 0xUU00TTHH00 (UU: units digit, TT: tens digit, HH: hundreds digit)
+            // 0xUU00TTHH00 * (100 * 0x1000000 + 10 * 0x10000 + 1) =
+            // 0x000000UU00TTHH00 +
+            // 0x00UU00TTHH000000 * 10 +
+            // 0xUU00TTHH00000000 * 100
+            // Now TT * 100 has 2 trailing zeroes and HH * 100 + TT * 10 + UU < 0x400
+            // This results in our value lies in the bit 32 to 41 of this product
+            long absValue = ((digits * 0x640a0001) >>> 32) & 0x3FF;
+            // (x ^ signed) - signed negates x when signed is -1 and leaves it unchanged when signed is 0
+            long value = (absValue ^ signed) - signed;
+            return (int) value;
+        }
+
+        private static long[] masks = new long[]{ 0x0000000000000000, 0xFF00000000000000L, 0xFFFF000000000000L,
+                0xFFFFFF0000000000L, 0xFFFFFFFF00000000L, 0xFFFFFFFFFF000000L, 0xFFFFFFFFFF0000L, 0xFFFFFFFFFFFF00L };
+
+    }
+
+    /**
+     * Aggregates ./measurements.txt using one partition per worker thread and prints the per-station
+     * statistics, sorted by station name, as {name=min/mean/max, ...}.
+     */
+    public static void main(String[] args) throws IOException {
+        MMapReader reader = new MMapReader(Paths.get(FILE), new ProcessorPartitionCalculator(), false);
+        System.out.println(reader.elaborate());
+    }
+}

From 36ffed1315192130478b9291fe1dead03f5c0c58 Mon Sep 17 00:00:00 2001
From: Gunnar Morling <gunnar.morling@googlemail.com>
Date: Sat, 20 Jan 2024 21:02:06 +0100
Subject: [PATCH 084/268] Leaderboard update

---
 README.md | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 7589efbad..87b7ee065 100644
--- a/README.md
+++ b/README.md
@@ -46,7 +46,7 @@ These are the results from running all entries into the challenge on eight cores
 | 3* | 00:02.552 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_thomaswue.java)| 21.0.1-graal | [Thomas Wuerthinger](https://github.com/thomaswue), [Quan Anh Mai](https://github.com/merykitty), [Alfonso² Peterssen](https://github.com/mukel) | GraalVM native binary |
 | 3*  | 00:02.571 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_royvanrijn.java)| 21.0.1-graal | [Roy van Rijn](https://github.com/royvanrijn) | GraalVM native binary |
 | 3* | 00:02.575 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_merykittyunsafe.java)| 21.0.1-open | [Quan Anh Mai](https://github.com/merykitty) | Quan Anh Mai's implementation, using `Unsafe` |
-|   | 00:02.971 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jerrinot.java)| 21.0.1-graal | [Jaromir Hamala](https://github.com/jerrinot) |  |
+|   | 00:02.909 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jerrinot.java)| 21.0.1-graal | [Jaromir Hamala](https://github.com/jerrinot) |  |
 |   | 00:03.258 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_merykitty.java)| 21.0.1-open | [Quan Anh Mai](https://github.com/merykitty) |  |
 |   | 00:03.376 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_mtopolnik.java)| 21.0.1-graal | [Marko Topolnik](https://github.com/mtopolnik) |  |
 |   | 00:03.714 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_hundredwatt.java)| 21.0.1-graal | [Jason Nochlin](https://github.com/hundredwatt) |  |
@@ -57,13 +57,13 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:04.066 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_JesseVanRooy.java)| 21.0.1-open | [JesseVanRooy](https://github.com/JesseVanRooy) |  |
 |   | 00:04.154 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_iziamos.java)| 21.0.1-open | [John Ziamos](https://github.com/iziamos) |  |
 |   | 00:04.365 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ebarlas.java)| 21.0.1-graal | [Elliot Barlas](https://github.com/ebarlas) |  |
+|   | 00:04.714 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_roman-r-m.java)| 21.0.1-graal | [Roman Musin](https://github.com/roman-r-m) |  |
 |   | 00:04.741 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_cliffclick.java)| 21.0.1-open | [Cliff Click](https://github.com/cliffclick) |  |
 |   | 00:04.823 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_JamalMulla.java)| 21.0.1-graal | [Jamal Mulla](https://github.com/JamalMulla) |  |
 |   | 00:04.920 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_vemana.java)| 21.0.1-graal | [Subrahmanyam](https://github.com/vemana) |  |
 |   | 00:04.959 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_yavuztas.java)| 21.0.1-graal | [Yavuz Tas](https://github.com/yavuztas) |  |
 |   | 00:05.142 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_arjenw.java)| 21.0.1-open | [Arjen Wisse](https://github.com/arjenw) |  |
 |   | 00:05.235 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_unbounded.java)| 21.0.1-open | [unbounded](https://github.com/unbounded) |  |
-|   | 00:05.283 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_roman-r-m.java)| 21.0.1-graal | [Roman Musin](https://github.com/roman-r-m) |  |
 |   | 00:05.336 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_plevart.java)| 21.0.1-tem | [Peter Levart](https://github.com/plevart) |  |
 |   | 00:05.478 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_obourgain.java)| 21.0.1-open | [Olivier Bourgain](https://github.com/obourgain) |  |
 |   | 00:05.887 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_charlibot.java)| 21.0.1-graal | [Charlie Evans](https://github.com/charlibot) |  |
@@ -75,7 +75,9 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:06.654 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jbachorik.java)| 21.0.1-graal | [Jaroslav Bachorik](https://github.com/jbachorik) |  |
 |   | 00:06.576 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_as-com.java)| 21.0.1-open | [Andrew Sun](https://github.com/as-com) |  |
 |   | 00:06.715 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_algirdasrascius.java)| 21.0.1-open | [Algirdas Raščius](https://github.com/algirdasrascius) |  |
-|   | 00:07.202 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ianopolousfast.java)| 21.0.1-open | [Dr Ian Preston](https://github.com/ianopolous) |  |
+|   | 00:06.872 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ianopolousfast.java)| 21.0.1-open | [Dr Ian Preston](https://github.com/ianopolous) |  |
+|   | 00:07.240 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_giovannicuccu.java)| java | [giovannicuccu](https://github.com/giovannicuccu) |  |
+|   | 00:07.680 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_C5H12O5.java)| 21.0.1-graal | [Xylitol](https://github.com/C5H12O5) |  |
 |   | 00:07.730 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jotschi.java)| 21.0.1-open | [Johannes Schüth](https://github.com/jotschi) |  |
 |   | 00:07.925 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ricardopieper.java)| 21.0.1-graal | [Ricardo Pieper](https://github.com/ricardopieper) |  |
 |   | 00:07.913 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_parkertimmins.java)| 21.0.1-open | [parkertimmins](https://github.com/parkertimmins) |  |
@@ -95,7 +97,6 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:09.945 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_japplis.java)| 21.0.1-open | [Anthony Goubard](https://github.com/japplis) |  |
 |   | 00:10.092 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_phd3.java)| 21.0.1-graal | [Pratham](https://github.com/phd3) |  |
 |   | 00:10.127 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_artpar.java)| 21.0.1-open | [Parth Mudgal](https://github.com/artpar) |  |
-|   | 00:10.553 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_C5H12O5.java)| 21.0.1-graal | [Xylitol](https://github.com/C5H12O5) |  |
 |   | 00:11.577 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_netrunnereve.java)| 21.0.1-open | [Eve](https://github.com/netrunnereve) |  |
 |   | 00:10.473 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_raipc.java)| 21.0.1-open | [Anton Rybochkin](https://github.com/raipc) |  |
 |   | 00:11.119 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_lawrey.java)| 21.0.1-open | [lawrey](https://github.com/lawrey) |  |
@@ -104,12 +105,14 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:11.405 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_imrafaelmerino.java)| 21.0.1-graal | [Rafael Merino García](https://github.com/imrafaelmerino) |  |
 |   | 00:11.433 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jatingala.java)| 21.0.1-graal | [Jatin Gala](https://github.com/jatingala) |  |
 |   | 00:11.805 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_coolmineman.java)| 21.0.1-graal | [Cool_Mineman](https://github.com/coolmineman) |  |
+|   | 00:11.878 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_karthikeyan97.java)| 21.0.1-open | [karthikeyan97](https://github.com/karthikeyan97) |  |
 |   | 00:11.934 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_arjenvaneerde.java)| 21.0.1-open | [arjenvaneerde](https://github.com/arjenvaneerde) |  |
 |   | 00:12.051 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_dmitry-midokura.java)| 21.0.1-open | [Dmitry Bufistov](https://github.com/dmitry-midokura) |  |
+|   | 00:12.102 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_YannMoisan.java)| java | [Yann Moisan](https://github.com/YannMoisan) |  |
 |   | 00:12.220 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_richardstartin.java)| 21.0.1-open | [Richard Startin](https://github.com/richardstartin) |  |
 |   | 00:12.495 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_SamuelYvon.java)| 21.0.1-graal | [Samuel Yvon](https://github.com/SamuelYvon) | GraalVM native binary |
 |   | 00:12.568 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_MeanderingProgrammer.java)| 21.0.1-graal | [Vlad](https://github.com/MeanderingProgrammer) |  |
-|   | 00:12.582 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_karthikeyan97.java)| 21.0.1-open | [karthikeyan97](https://github.com/karthikeyan97) |  |
+|   | 00:12.800 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_yonatang.java)| java | [Yonatan Graber](https://github.com/yonatang) |  |
 |   | 00:13.013 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_thanhtrinity.java)| 21.0.1-graal | [Thanh Duong](https://github.com/thanhtrinity) |  |
 |   | 00:13.071 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ianopolous.java)| 21.0.1-open | [Dr Ian Preston](https://github.com/ianopolous) |  |
 |   | 00:13.817 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_entangled90.java)| 21.0.1-open | [Carlo](https://github.com/entangled90) |  |
@@ -132,6 +135,7 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:20.691 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_Kidlike.java)| 21.0.1-graal | [Kidlike](https://github.com/Kidlike) | GraalVM native binary |
 |   | 00:21.989 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_couragelee.java)| 21.0.1-open | [couragelee](https://github.com/couragelee) |  |
 |   | 00:22.457 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_rby.java)| 21.0.1-open | [Ramzi Ben Yahya](https://github.com/rby) |  |
+|   | 00:22.471 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_0xshivamagarwal.java)| 21.0.1-open | [Shivam Agarwal](https://github.com/0xshivamagarwal) |  |
 |   | 00:26.500 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_felix19350.java)| 21.0.1-open | [Bruno Félix](https://github.com/felix19350) |  |
 |   | 00:28.381 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_bjhara.java)| 21.0.1-open | [Hampus](https://github.com/bjhara) |  |
 |   | 00:29.741 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_xpmatteo.java)| 21.0.1-open | [Matteo Vaccari](https://github.com/xpmatteo) |  |

From 0a7726cc643dc4068884af37a1280b5094454e25 Mon Sep 17 00:00:00 2001
From: adri <adria.cabezasantanna@datadoghq.com>
Date: Sat, 20 Jan 2024 21:27:34 +0100
Subject: [PATCH 085/268] Improving first iteration by avoiding string creation
 as much as possible (#516)

- It avoids creating unnecessary String objects and identifies the station names by their djb2 hashes instead
- Initializes hashmaps with capacity and load factor
- Adds -XX:+AlwaysPreTouch
---
 calculate_average_adriacabeza.sh              |  2 +-
 github_users.txt                              |  1 +
 .../onebrc/CalculateAverage_adriacabeza.java  | 85 ++++++++++++-------
 3 files changed, 55 insertions(+), 33 deletions(-)

diff --git a/calculate_average_adriacabeza.sh b/calculate_average_adriacabeza.sh
index e2c655701..7d4be43d5 100755
--- a/calculate_average_adriacabeza.sh
+++ b/calculate_average_adriacabeza.sh
@@ -16,6 +16,6 @@
 #
 
 
-JAVA_OPTS="-XX:+UseStringDeduplication -XX:+UnlockExperimentalVMOptions -XX:+UseEpsilonGC"
+JAVA_OPTS="-XX:+UseStringDeduplication -XX:+UnlockExperimentalVMOptions -XX:+UseEpsilonGC -XX:+AlwaysPreTouch"
 java --enable-preview  -classpath target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_adriacabeza
 
diff --git a/github_users.txt b/github_users.txt
index bd4726705..1e640d8e0 100644
--- a/github_users.txt
+++ b/github_users.txt
@@ -52,3 +52,4 @@ gnmathur;Gaurav Mathur
 vemana;Subrahmanyam
 jincongho;Jin Cong Ho
 yonatang;Yonatan Graber
+adriacabeza;Adrià Cabeza
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_adriacabeza.java b/src/main/java/dev/morling/onebrc/CalculateAverage_adriacabeza.java
index a1a6953dc..99936b235 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_adriacabeza.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_adriacabeza.java
@@ -23,9 +23,13 @@
 import java.nio.file.Paths;
 import java.nio.file.StandardOpenOption;
 import java.util.ArrayList;
+import java.util.Comparator;
 import java.util.HashMap;
+import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
+import java.util.Set;
+import java.util.concurrent.ConcurrentHashMap;
 import java.util.stream.Collectors;
 
 /**
@@ -35,11 +39,22 @@ public class CalculateAverage_adriacabeza {
 
     private static final Path FILE_PATH = Paths.get("./measurements.txt");
     public static final int CITY_NAME_MAX_CHARACTERS = 128;
+    private static final int N_PROCESSORS = Runtime.getRuntime().availableProcessors();
+    private static final int DJB2_INIT = 5381;
+    private static final Map<Integer, String> cityMap = new ConcurrentHashMap<>(10_000, 1, N_PROCESSORS);
 
     /**
      * Represents result containing a HashMap with city as key and ResultRow as value.
      */
     private static class Result {
+        public void addStation(int hash, int value) {
+            resultMap.put(hash, new StationData(value));
+        }
+
+        public StationData getData(int hash) {
+            return resultMap.get(hash);
+        }
+
         private static class StationData {
             private int min, sum, count, max;
 
@@ -63,28 +78,16 @@ public String toString() {
 
         }
 
-        private final Map<String, StationData> resultMap;
+        private final Map<Integer, StationData> resultMap;
 
         public Result() {
-            this.resultMap = new HashMap<>();
+            this.resultMap = new HashMap<>(10_000, 1);
         }
 
-        public Map<String, StationData> getResultMap() {
+        public Map<Integer, StationData> getResultMap() {
             return resultMap;
         }
 
-        public void addMeasurement(String city, int value) {
-            resultMap.compute(city, (_, resultRow) -> {
-                if (resultRow == null) {
-                    return new StationData(value);
-                }
-                else {
-                    resultRow.update(value);
-                    return resultRow;
-                }
-            });
-        }
-
         public void merge(Result other) {
             other.getResultMap().forEach((city, resultRow) -> resultMap.merge(city, resultRow, (existing, incoming) -> {
                 existing.min = Math.min(existing.min, incoming.min);
@@ -96,9 +99,9 @@ public void merge(Result other) {
         }
 
         public String toString() {
-            return this.resultMap.entrySet().stream()
-                    .sorted(Map.Entry.comparingByKey())
-                    .map(entry -> "%s=%s".formatted(entry.getKey(), entry.getValue()))
+            return this.resultMap.entrySet().parallelStream()
+                    .map(entry -> "%s=%s".formatted(cityMap.get(entry.getKey()), entry.getValue()))
+                    .sorted(Comparator.comparing(s -> s.split("=")[0]))
                     .collect(Collectors.joining(", ", "{", "}"));
         }
     }
@@ -155,6 +158,21 @@ private static List<MappedByteBuffer> getMappedByteBuffers(int nProcessors) thro
         }
     }
 
+    private static int readNumberFromBuffer(ByteBuffer buffer, int limit) {
+        var number = 0;
+        var sign = 1;
+        while (buffer.position() < limit) {
+            var numberByte = buffer.get();
+            if (numberByte == '-')
+                sign = -1;
+            else if (numberByte == '\n')
+                break;
+            else if (numberByte != '.')
+                number = number * 10 + (numberByte - '0');
+        }
+        return sign * number;
+    }
+
     /**
      * Calculates average measurements from the file.
      *
@@ -167,28 +185,31 @@ private static Result calculateAverageMeasurements(List<MappedByteBuffer> chunks
                     Result partialResult = new Result();
                     var limit = buffer.limit();
                     var field = new byte[CITY_NAME_MAX_CHARACTERS];
+                    Set<Integer> seenHashes = new HashSet<>(10_000, 1);
                     while (buffer.position() < limit) {
                         var fieldCurrentIndex = 0;
-                        field[fieldCurrentIndex++] = buffer.get();
+                        var fieldByte = buffer.get();
+                        field[fieldCurrentIndex++] = fieldByte;
+                        // implement djb2 hash: https://theartincode.stanis.me/008-djb2/
+                        int hash = DJB2_INIT;
                         while (buffer.position() < limit) {
-                            var fieldByte = buffer.get();
+                            // hash = hash * 33 + fieldByte
+                            hash = (((hash << 5) + hash) + fieldByte);
+                            fieldByte = buffer.get();
                             if (fieldByte == ';')
                                 break;
                             field[fieldCurrentIndex++] = fieldByte;
                         }
-                        var fieldStr = new String(field, 0, fieldCurrentIndex);
-                        var number = 0;
-                        var sign = 1;
-                        while (buffer.position() < limit) {
-                            var numberByte = buffer.get();
-                            if (numberByte == '-')
-                                sign = -1;
-                            else if (numberByte == '\n')
-                                break;
-                            else if (numberByte != '.')
-                                number = number * 10 + (numberByte - '0');
+
+                        var number = readNumberFromBuffer(buffer, limit);
+                        if (!seenHashes.contains(hash)) {
+                            seenHashes.add(hash);
+                            cityMap.put(hash, new String(field, 0, fieldCurrentIndex));
+                            partialResult.addStation(hash, number);
+                        }
+                        else {
+                            partialResult.getData(hash).update(number);
                         }
-                        partialResult.addMeasurement(fieldStr, sign * number);
                     }
                     return partialResult;
                 }).reduce(new Result(), (partialResult1, partialResult2) -> {

From 8ba67cbc6d4d83432bac28e453efc7bf3a963c10 Mon Sep 17 00:00:00 2001
From: kumarsaurav123 <kumar.saurav@eko.co.in>
Date: Sun, 21 Jan 2024 17:20:36 +0530
Subject: [PATCH 086/268] Use arrays to store results instead of groupingBy and
 a custom class (#522)

---
 .../CalculateAverage_kumarsaurav123.java      | 271 +++++++++---------
 1 file changed, 141 insertions(+), 130 deletions(-)

diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_kumarsaurav123.java b/src/main/java/dev/morling/onebrc/CalculateAverage_kumarsaurav123.java
index f991f9f8b..87458d1d3 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_kumarsaurav123.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_kumarsaurav123.java
@@ -23,132 +23,108 @@
 import java.nio.channels.FileChannel;
 import java.nio.charset.StandardCharsets;
 import java.util.*;
-import java.util.concurrent.ConcurrentSkipListMap;
-import java.util.concurrent.ExecutorService;
-import java.util.concurrent.Executors;
-import java.util.concurrent.TimeUnit;
+import java.util.concurrent.*;
 import java.util.concurrent.atomic.AtomicInteger;
+import java.util.concurrent.locks.ReentrantLock;
 import java.util.stream.Collector;
+import java.util.stream.Collectors;
 
 import static java.util.stream.Collectors.groupingBy;
 
 public class CalculateAverage_kumarsaurav123 {
 
     private static final String FILE = "./measurements.txt";
+    private static AtomicInteger indexCount = new AtomicInteger(0);
+    private static final ReentrantLock lock = new ReentrantLock();
+    private static final int MAX_UNIQUE_KEYS = 11000;
+    private static Map<StringHolder, Integer> indexMap;
 
-    private static record Measurement(String station, double value) {
-        private Measurement(String[] parts) {
-            this(parts[0], Double.parseDouble(parts[1]));
-        }
-    }
-
-    private static record Pair(long start, int size) {
-    }
+    private static record Store(double[] min, double[] max, double[] sum,
+                                int[] count) {
 
-    private static record ResultRow(String station, double min, double mean, double max, double sum, double count) {
-        public String toString() {
-            return round(min) + "/" + round(mean) + "/" + round(max);
-        }
 
         private double round(double value) {
             return Math.round(value * 10.0) / 10.0;
         }
-    }
-
-    ;
 
-    private static class MeasurementAggregator {
-        private double min = Double.POSITIVE_INFINITY;
-        private double max = Double.NEGATIVE_INFINITY;
-        private double sum;
-        private long count;
+        @Override
+        public String toString() {
+            return new TreeMap<>(indexMap.entrySet()
+                    .stream()
+                    .map(e -> Map.entry(e.getKey().toString(),
+                            round(min[e.getValue()]) + "/" + round((Math.round(sum[e.getValue()] * 10.0) / 10.0) / count[e.getValue()]) + "/" + round(max[e.getValue()])
+                    ))
+                    .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue))).toString();
+        }
+    }
 
-        private String station;
+    private static record Pair(long start, int size) {
     }
 
-    public static void main(String[] args) throws IOException {
+    public static void main(String[] args) throws IOException, ExecutionException, InterruptedException {
         long start = System.currentTimeMillis();
         System.out.println(run(FILE));
-        // System.out.println(System.currentTimeMillis() - start);
     }
 
-    public static String run(String filePath) throws IOException {
-        Collector<ResultRow, MeasurementAggregator, ResultRow> collector2 = Collector.of(
-                MeasurementAggregator::new,
-                (a, m) -> {
-                    a.min = Math.min(a.min, m.min);
-                    a.max = Math.max(a.max, m.max);
-                    a.sum += m.sum;
-                    a.count += m.count;
-                },
-                (agg1, agg2) -> {
-                    var res = new MeasurementAggregator();
-                    res.min = Math.min(agg1.min, agg2.min);
-                    res.max = Math.max(agg1.max, agg2.max);
-                    res.sum = agg1.sum + agg2.sum;
-                    res.count = agg1.count + agg2.count;
-
-                    return res;
-                },
-                agg -> {
-                    return new ResultRow(agg.station, agg.min, (Math.round(agg.sum * 10.0) / 10.0) / agg.count, agg.max, agg.sum, agg.count);
-                });
-        Collector<Measurement, MeasurementAggregator, ResultRow> collector = Collector.of(
-                MeasurementAggregator::new,
-                (a, m) -> {
-                    a.min = Math.min(a.min, m.value);
-                    a.max = Math.max(a.max, m.value);
-                    a.sum += m.value;
-                    a.station = m.station;
-                    a.count++;
-                },
-                (agg1, agg2) -> {
-                    var res = new MeasurementAggregator();
-                    res.min = Math.min(agg1.min, agg2.min);
-                    res.max = Math.max(agg1.max, agg2.max);
-                    res.sum = agg1.sum + agg2.sum;
-                    res.count = agg1.count + agg2.count;
-
-                    return res;
-                },
-                agg -> {
-                    return new ResultRow(agg.station, agg.min, agg.sum / agg.count, agg.max, agg.sum, agg.count);
-                });
+    public static String run(String filePath) throws IOException, InterruptedException, ExecutionException {
+        indexCount = new AtomicInteger(0);
+        indexMap = new HashMap<>(MAX_UNIQUE_KEYS);
         ExecutorService executorService = Executors.newFixedThreadPool(Runtime.getRuntime().availableProcessors() * 2);
-        List<ResultRow> measurements = Collections.synchronizedList(new ArrayList<ResultRow>());
-        int chunkSize = 1_0000_00;
+        CompletionService<Store> completionService = new ExecutorCompletionService<>(executorService);
         Map<Integer, List<byte[]>> leftOutsMap = new ConcurrentSkipListMap<>();
         RandomAccessFile file = new RandomAccessFile(filePath, "r");
         long filelength = file.length();
         AtomicInteger kk = new AtomicInteger();
-        MemorySegment memorySegment = file.getChannel().map(FileChannel.MapMode.READ_ONLY, 0, filelength, Arena.global());
+        MemorySegment memorySegment = file.getChannel().map(FileChannel.MapMode.READ_ONLY, 0, filelength, Arena.ofShared());
         int nChunks = 1000;
 
-        int pChunkSize = Math.min(Integer.MAX_VALUE, (int) (memorySegment.byteSize() / (1000 * 20)));
+        int pChunkSize = Math.min(Integer.MAX_VALUE, (int) (memorySegment.byteSize() / (1000)));
         if (pChunkSize < 100) {
             pChunkSize = (int) memorySegment.byteSize();
             nChunks = 1;
         }
         ArrayList<Pair> chunks = createStartAndEnd(pChunkSize, nChunks, memorySegment);
         chunks.stream()
+                .parallel()
                 .map(p -> {
 
-                    return createRunnable(memorySegment, p, collector, measurements, kk.getAndIncrement());
+                    return createRunnable(memorySegment, p);
                 })
-                .forEach(executorService::submit);
+                .forEach(completionService::submit);
         executorService.shutdown();
-        try {
-            executorService.awaitTermination(10, TimeUnit.MINUTES);
-        }
-        catch (InterruptedException e) {
-            throw new RuntimeException(e);
+        int i = 0;
+        double[] min = new double[MAX_UNIQUE_KEYS];
+        double[] max = new double[MAX_UNIQUE_KEYS];
+        double[] sum = new double[MAX_UNIQUE_KEYS];
+        int[] count = new int[MAX_UNIQUE_KEYS];
+        initArray(i, count, min, max, sum);
+        i = 0;
+        final Store cureentStore = new Store(min, max, sum, count);
+        while (i < chunks.size()) {
+            Store newStore = completionService.take().get();
+            Map<Integer, StringHolder> reverseMap = indexMap.entrySet()
+                    .stream().collect(Collectors.toMap(Map.Entry::getValue, Map.Entry::getKey));
+            reverseMap.forEach((key, value) -> {
+                cureentStore.sum[key] += newStore.sum[key];
+                cureentStore.count[key] += newStore.count[key];
+                cureentStore.min[key] = Math.min(cureentStore.min[key],
+                        newStore.min[key]);
+                cureentStore.max[key] = Math.max(cureentStore.max[key],
+                        newStore.max[key]);
+            });
+            i++;
         }
 
-        Map<String, ResultRow> measurements2 = new TreeMap<>(measurements
-                .stream()
-                .parallel()
-                .collect(groupingBy(ResultRow::station, collector2)));
-        return measurements2.toString();
+        return cureentStore.toString();
+    }
+
+    private static void initArray(int i, int[] count, double[] min, double[] max, double[] sum) {
+        for (; i < count.length; i++) {
+            min[i] = Double.POSITIVE_INFINITY;
+            max[i] = Double.NEGATIVE_INFINITY;
+            sum[i] = 0.0d;
+            count[i] = 0;
+        }
     }
 
     private static ArrayList<Pair> createStartAndEnd(int chunksize, int nChunks, MemorySegment memorySegment) {
@@ -174,41 +150,30 @@ private static ArrayList<Pair> createStartAndEnd(int chunksize, int nChunks, Mem
         return startSizePairs;
     }
 
-    public static Runnable createRunnable(MemorySegment memorySegment, Pair p, Collector<Measurement, MeasurementAggregator, ResultRow> collector,
-                                          List<ResultRow> measurements, int kk) {
-        return new Runnable() {
+    public static Callable<Store> createRunnable(MemorySegment memorySegment, Pair p) {
+        return new Callable<Store>() {
             @Override
-            public void run() {
+            public Store call() {
                 try {
-                    long start = System.currentTimeMillis();
-
-                    byte[] allBytes2 = new byte[p.size];
-                    MemorySegment lMemory = memorySegment.asSlice(p.start, p.size);
-                    lMemory.asByteBuffer().get(allBytes2);
-                    HashMap<Byte, Integer> map = new HashMap<>();
-                    // Runtime runtime = Runtime.getRuntime();
-                    // long memoryMax = runtime.maxMemory();
-                    // long memoryUsed = runtime.totalMemory() - runtime.freeMemory();
-                    // double memoryUsedPercent = (memoryUsed * 100.0) / memoryMax;
-                    // System.out.println("memoryUsedPercent: " + memoryUsedPercent);
-                    map.put((byte) 48, 0);
-                    map.put((byte) 49, 1);
-                    map.put((byte) 50, 2);
-                    map.put((byte) 51, 3);
-                    map.put((byte) 52, 4);
-                    map.put((byte) 53, 5);
-                    map.put((byte) 54, 6);
-                    map.put((byte) 55, 7);
-                    map.put((byte) 56, 8);
-                    map.put((byte) 57, 9);
+                    double[] min = new double[MAX_UNIQUE_KEYS];
+                    double[] max = new double[MAX_UNIQUE_KEYS];
+                    double[] sum = new double[MAX_UNIQUE_KEYS];
+                    int[] count = new int[MAX_UNIQUE_KEYS];
+                    for (int i = 0; i < count.length; i++) {
+                        min[i] = Double.POSITIVE_INFINITY;
+                        max[i] = Double.NEGATIVE_INFINITY;
+                        sum[i] = 0.0d;
+                        count[i] = 0;
+                    }
+
+                    byte[] allBytes2 = memorySegment.asSlice(p.start, p.size).toArray(ValueLayout.JAVA_BYTE);
                     byte[] eol = "\n".getBytes(StandardCharsets.UTF_8);
                     byte[] sep = ";".getBytes(StandardCharsets.UTF_8);
 
-                    List<Measurement> mst = new ArrayList<>();
                     int st = 0;
-
                     for (int i = 0; i < allBytes2.length; i++) {
                         if (allBytes2[i] == eol[0]) {
+                            ;
                             byte[] s2 = new byte[i - st];
                             System.arraycopy(allBytes2, st, s2, 0, s2.length);
                             for (int j = 0; j < s2.length; j++) {
@@ -217,37 +182,83 @@ public void run() {
                                     byte[] value = new byte[s2.length - j - 1];
                                     System.arraycopy(s2, 0, city, 0, city.length);
                                     System.arraycopy(s2, city.length + 1, value, 0, value.length);
-                                    double d = 0.0;
-                                    int s = -1;
-                                    for (int k = value.length - 1; k >= 0; k--) {
-                                        if (value[k] == 45) {
-                                            d = d * -1;
-                                        }
-                                        else if (value[k] == 46) {
-                                        }
-                                        else {
-                                            d = d + map.get(value[k]).intValue() * Math.pow(10, s);
-                                            s++;
+                                    double d = getaDouble(value);
+                                    StringHolder citys = new StringHolder(city);
+                                    Integer index = indexMap.get(citys);
+                                    if (Objects.isNull(index)) {
+                                        lock.lock();
+                                        if (Objects.isNull(indexMap.get(citys))) {
+                                            index = indexCount.getAndIncrement();
+                                            indexMap.putIfAbsent(citys, index);
+
                                         }
+                                        index = indexMap.get(citys);
+                                        lock.unlock();
                                     }
-                                    mst.add(new Measurement(new String(city), d));
 
+                                    count[index] = count[index] + 1;
+                                    max[index] = Math.max(max[index], d);
+                                    min[index] = Math.min(min[index], d);
+                                    sum[index] = Double.sum(sum[index], d);
+                                    break;
                                 }
                             }
                             st = i + 1;
                         }
                     }
-                    // System.out.println("Task " + kk + "Completed in " + (System.currentTimeMillis() - start));
-                    measurements.addAll(mst.stream()
-                            .collect(groupingBy(Measurement::station, collector))
-                            .values());
-
+                    // System.out.println("Task " + kk + "Completed in " + (System.nanoTime() - start));
+                    return new Store(min, max, sum, count);
                 }
                 catch (Exception e) {
                     // throw new RuntimeException(e);
-                    System.out.println("");
+                    throw e;
                 }
             }
         };
     }
+
+    private static double getaDouble(byte[] value) {
+        double d = 0.0;
+        int s = -1;
+        for (int k = value.length - 1; k >= 0; k--) {
+            if (value[k] == 45) {
+                d = d * -1;
+            }
+            else if (value[k] == 46) {
+            }
+            else {
+                d = d + (((int) value[k]) - 48) * Math.pow(10, s);
+                s++;
+            }
+        }
+        return d;
+    }
+
+    static class StringHolder implements Comparable<StringHolder> {
+        byte[] bytes;
+
+        public StringHolder(byte[] bytes) {
+            this.bytes = bytes;
+        }
+
+        @Override
+        public String toString() {
+            return new String(this.bytes);
+        }
+
+        @Override
+        public int hashCode() {
+            return Arrays.hashCode(this.bytes);
+        }
+
+        @Override
+        public boolean equals(Object obj) {
+            return Arrays.equals(this.bytes, ((StringHolder) obj).bytes);
+        }
+
+        @Override
+        public int compareTo(StringHolder o) {
+            return new String(this.bytes).compareTo(new String(o.bytes));
+        }
+    }
 }

From 410425c833f0652b174af2741ab00963974c95fe Mon Sep 17 00:00:00 2001
From: Van Phu DO <abeobk@gmail.com>
Date: Sun, 21 Jan 2024 21:25:18 +0900
Subject: [PATCH 087/268] Reorganize code, a little bit faster (#509)

---
 prepare_abeobk.sh                             |  2 +-
 .../onebrc/CalculateAverage_abeobk.java       | 51 ++++++++++---------
 2 files changed, 28 insertions(+), 25 deletions(-)

diff --git a/prepare_abeobk.sh b/prepare_abeobk.sh
index bf2b7b51e..fac7b87b0 100755
--- a/prepare_abeobk.sh
+++ b/prepare_abeobk.sh
@@ -20,6 +20,6 @@ sdk use java 21.0.1-graal 1>&2
 
 # ./mvnw clean verify removes target/ and will re-trigger native image creation.
 if [ ! -f target/CalculateAverage_abeobk_image ]; then
-    NATIVE_IMAGE_OPTS="--gc=epsilon -O3 -march=native --enable-preview"
+    NATIVE_IMAGE_OPTS="--gc=epsilon -O3 -march=native -R:MaxHeapSize=128m --enable-preview --initialize-at-build-time=dev.morling.onebrc.CalculateAverage_abeobk"
     native-image $NATIVE_IMAGE_OPTS -cp target/average-1.0.0-SNAPSHOT.jar -o target/CalculateAverage_abeobk_image dev.morling.onebrc.CalculateAverage_abeobk
 fi
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java b/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java
index cdc2c1e38..48d9da687 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java
@@ -47,6 +47,10 @@ public class CalculateAverage_abeobk {
             0xffffffffffffffL,
             0xffffffffffffffffL, };
 
+    private static final void debug(String s, Object... args) {
+        System.out.println(String.format(s, args));
+    }
+
     private static Unsafe initUnsafe() {
         try {
             Field theUnsafe = Unsafe.class.getDeclaredField("theUnsafe");
@@ -110,7 +114,7 @@ void merge(Node other) {
         }
 
         boolean contentEquals(long other_addr, long other_tail) {
-            if (tail != other_tail) // compare tail & length at the same time
+            if (tail != other_tail)
                 return false;
             // this is faster than comparision if key is short
             long xsum = 0;
@@ -182,13 +186,15 @@ static final Node[] parse(int thread_id, long start, long end, int[] cls) {
             // about 50% chance key < 8 chars
             if (semipos_code != 0) {
                 int semi_pos = Long.numberOfTrailingZeros(semipos_code) >>> 3;
-                addr += semi_pos;
+                addr += semi_pos + 1;
+                long num_word = UNSAFE.getLong(addr);
+                int dot_pos = Long.numberOfTrailingZeros(~num_word & 0x10101000);
+                addr += (dot_pos >>> 3) + 3;
+
                 tail = (word0 & HASH_MASKS[semi_pos]);
                 bucket = xxh32(tail) & BUCKET_MASK;
-                long num_word = UNSAFE.getLong(++addr);
-                int dot_pos = Long.numberOfTrailingZeros(~num_word & 0x10101000);
                 val = parseNum(num_word, dot_pos);
-                addr += (dot_pos >>> 3) + 3;
+
                 while (true) {
                     var node = map[bucket];
                     if (node == null) {
@@ -214,14 +220,15 @@ static final Node[] parse(int thread_id, long start, long end, int[] cls) {
             if (semipos_code != 0) {
                 int semi_pos = Long.numberOfTrailingZeros(semipos_code) >>> 3;
                 addr += semi_pos;
+                int keylen = (int) (addr - row_addr);
+                long num_word = UNSAFE.getLong(addr + 1);
+                int dot_pos = Long.numberOfTrailingZeros(~num_word & 0x10101000);
+                addr += (dot_pos >>> 3) + 4;
+
                 tail = (word & HASH_MASKS[semi_pos]);
                 hash ^= tail;
                 bucket = xxh32(hash) & BUCKET_MASK;
-                int keylen = (int) (addr - row_addr);
-                long num_word = UNSAFE.getLong(++addr);
-                int dot_pos = Long.numberOfTrailingZeros(~num_word & 0x10101000);
                 val = parseNum(num_word, dot_pos);
-                addr += (dot_pos >>> 3) + 3;
 
                 while (true) {
                     var node = map[bucket];
@@ -249,16 +256,15 @@ static final Node[] parse(int thread_id, long start, long end, int[] cls) {
 
             int semi_pos = Long.numberOfTrailingZeros(semipos_code) >>> 3;
             addr += semi_pos;
+            int keylen = (int) (addr - row_addr);
+            long num_word = UNSAFE.getLong(addr + 1);
+            int dot_pos = Long.numberOfTrailingZeros(~num_word & 0x10101000);
+            addr += (dot_pos >>> 3) + 4;
+
             tail = (word & HASH_MASKS[semi_pos]);
             hash ^= tail;
             bucket = xxh32(hash) & BUCKET_MASK;
-            int keylen = (int) (addr - row_addr);
-
-            long num_word = UNSAFE.getLong(++addr);
-
-            int dot_pos = Long.numberOfTrailingZeros(~num_word & 0x10101000);
             val = parseNum(num_word, dot_pos);
-            addr += (dot_pos >>> 3) + 3;
 
             while (true) {
                 var node = map[bucket];
@@ -307,12 +313,6 @@ public static void main(String[] args) throws InterruptedException, IOException
             for (var thread : threads)
                 thread.join();
 
-            if (SHOW_ANALYSIS) {
-                for (int i = 0; i < cpu_cnt; i++) {
-                    System.out.println("thread-" + i + " collision = " + cls[i]);
-                }
-            }
-
             // collect results
             TreeMap<String, Node> ms = new TreeMap<>();
             for (var map : maps) {
@@ -330,13 +330,16 @@ public static void main(String[] args) throws InterruptedException, IOException
             }
 
             if (SHOW_ANALYSIS) {
-                System.out.println("total=" + Arrays.stream(lenhist).sum());
-                System.out.println("length_histogram = "
+                debug("Collision stat: ");
+                for (int i = 0; i < cpu_cnt; i++) {
+                    debug("thread-" + i + " collision = " + cls[i]);
+                }
+                debug("Total = " + Arrays.stream(lenhist).sum());
+                debug("Length_histogram = "
                         + Arrays.toString(Arrays.stream(lenhist).map(x -> (int) (x * 1.0e-7)).toArray()));
             }
             else
                 System.out.println(ms);
         }
     }
-
 }
\ No newline at end of file

From 38c3e0f1eec57dbf7c49560a735b30de351dca6d Mon Sep 17 00:00:00 2001
From: Gunnar Morling <gunnar.morling@googlemail.com>
Date: Sun, 21 Jan 2024 13:43:40 +0100
Subject: [PATCH 088/268] Leaderboard update

---
 README.md                          | 12 ++++++------
 calculate_average_giovannicuccu.sh |  0
 prepare_giovannicuccu.sh           |  0
 3 files changed, 6 insertions(+), 6 deletions(-)
 mode change 100644 => 100755 calculate_average_giovannicuccu.sh
 mode change 100644 => 100755 prepare_giovannicuccu.sh

diff --git a/README.md b/README.md
index 87b7ee065..0101ac2f1 100644
--- a/README.md
+++ b/README.md
@@ -41,10 +41,10 @@ These are the results from running all entries into the challenge on eight cores
 
 | # | Result (m:s.ms) | Implementation     | JDK | Submitter     | Notes     |
 |---|-----------------|--------------------|-----|---------------|-----------|
-| 1* | 00:02.461 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java)| 21.0.1-graal | [Artsiom Korzun](https://github.com/artsiomkorzun) | GraalVM native binary |
-| 1* | 00:02.477 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java)| 21.0.1-graal | [Van Phu DO](https://github.com/abeobk) | GraalVM native binary |
+| 1 | 00:02.336 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java)| 21.0.1-graal | [Van Phu DO](https://github.com/abeobk) | GraalVM native binary |
+| 2 | 00:02.461 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java)| 21.0.1-graal | [Artsiom Korzun](https://github.com/artsiomkorzun) | GraalVM native binary |
 | 3* | 00:02.552 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_thomaswue.java)| 21.0.1-graal | [Thomas Wuerthinger](https://github.com/thomaswue), [Quan Anh Mai](https://github.com/merykitty), [Alfonso² Peterssen](https://github.com/mukel) | GraalVM native binary |
-| 3*  | 00:02.571 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_royvanrijn.java)| 21.0.1-graal | [Roy van Rijn](https://github.com/royvanrijn) | GraalVM native binary |
+| 3* | 00:02.571 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_royvanrijn.java)| 21.0.1-graal | [Roy van Rijn](https://github.com/royvanrijn) | GraalVM native binary |
 | 3* | 00:02.575 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_merykittyunsafe.java)| 21.0.1-open | [Quan Anh Mai](https://github.com/merykitty) | Quan Anh Mai's implementation, using `Unsafe` |
 |   | 00:02.909 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jerrinot.java)| 21.0.1-graal | [Jaromir Hamala](https://github.com/jerrinot) |  |
 |   | 00:03.258 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_merykitty.java)| 21.0.1-open | [Quan Anh Mai](https://github.com/merykitty) |  |
@@ -57,7 +57,7 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:04.066 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_JesseVanRooy.java)| 21.0.1-open | [JesseVanRooy](https://github.com/JesseVanRooy) |  |
 |   | 00:04.154 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_iziamos.java)| 21.0.1-open | [John Ziamos](https://github.com/iziamos) |  |
 |   | 00:04.365 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ebarlas.java)| 21.0.1-graal | [Elliot Barlas](https://github.com/ebarlas) |  |
-|   | 00:04.714 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_roman-r-m.java)| 21.0.1-graal | [Roman Musin](https://github.com/roman-r-m) |  |
+|   | 00:04.627 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_roman-r-m.java)| 21.0.1-graal | [Roman Musin](https://github.com/roman-r-m) |  |
 |   | 00:04.741 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_cliffclick.java)| 21.0.1-open | [Cliff Click](https://github.com/cliffclick) |  |
 |   | 00:04.823 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_JamalMulla.java)| 21.0.1-graal | [Jamal Mulla](https://github.com/JamalMulla) |  |
 |   | 00:04.920 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_vemana.java)| 21.0.1-graal | [Subrahmanyam](https://github.com/vemana) |  |
@@ -86,6 +86,7 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:08.398 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_artpar.java)| 21.0.1-open | [Parth Mudgal](https://github.com/artpar) |  |
 |   | 00:08.489 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_gnabyl.java)| 21.0.1-graal | [Bang NGUYEN](https://github.com/gnabyl) |  |
 |   | 00:08.517 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ags313.java)| 21.0.1-graal | [ags](https://github.com/ags313) |  |
+|   | 00:08.557 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_adriacabeza.java)| 21.0.1-graal | [Adrià Cabeza](https://github.com/adriacabeza) |  |
 |   | 00:08.622 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_kuduwa-keshavram.java)| 21.0.1-graal | [Keshavram Kuduwa](https://github.com/kuduwa-keshavram) |  |
 |   | 00:08.689 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_gamlerhart.java)| 21.0.1-open | [Roman Stoffel](https://github.com/gamlerhart) |  |
 |   | 00:08.752 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_anitasv.java)| 21.0.1-graal | [Anita SV](https://github.com/anitasv) |  |
@@ -101,7 +102,6 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:10.473 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_raipc.java)| 21.0.1-open | [Anton Rybochkin](https://github.com/raipc) |  |
 |   | 00:11.119 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_lawrey.java)| 21.0.1-open | [lawrey](https://github.com/lawrey) |  |
 |   | 00:11.167 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_palmr.java)| 21.0.1-open | [Nick Palmer](https://github.com/palmr) |  |
-|   | 00:11.230 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_adriacabeza.java)| 21.0.1-graal | [adri](https://github.com/adriacabeza) |  |
 |   | 00:11.405 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_imrafaelmerino.java)| 21.0.1-graal | [Rafael Merino García](https://github.com/imrafaelmerino) |  |
 |   | 00:11.433 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jatingala.java)| 21.0.1-graal | [Jatin Gala](https://github.com/jatingala) |  |
 |   | 00:11.805 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_coolmineman.java)| 21.0.1-graal | [Cool_Mineman](https://github.com/coolmineman) |  |
@@ -136,6 +136,7 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:21.989 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_couragelee.java)| 21.0.1-open | [couragelee](https://github.com/couragelee) |  |
 |   | 00:22.457 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_rby.java)| 21.0.1-open | [Ramzi Ben Yahya](https://github.com/rby) |  |
 |   | 00:22.471 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_0xshivamagarwal.java)| 21.0.1-open | [Shivam Agarwal](https://github.com/0xshivamagarwal) |  |
+|   | 00:24.986 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_kumarsaurav123.java)| 21.0.1-open | [kumarsaurav123](https://github.com/kumarsaurav123) |  |
 |   | 00:26.500 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_felix19350.java)| 21.0.1-open | [Bruno Félix](https://github.com/felix19350) |  |
 |   | 00:28.381 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_bjhara.java)| 21.0.1-open | [Hampus](https://github.com/bjhara) |  |
 |   | 00:29.741 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_xpmatteo.java)| 21.0.1-open | [Matteo Vaccari](https://github.com/xpmatteo) |  |
@@ -143,7 +144,6 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:34.388 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_twobiers.java)| 21.0.1-tem | [Tobi](https://github.com/twobiers) |  |
 |   | 00:35.875 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_MahmoudFawzyKhalil.java)| 21.0.1-open | [MahmoudFawzyKhalil](https://github.com/MahmoudFawzyKhalil) |  |
 |   | 00:36.180 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_hchiorean.java)| 21.0.1-open | [Horia Chiorean](https://github.com/hchiorean) |  |
-|   | 00:36.991 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_kumarsaurav123.java)| 21.0.1-open | [kumarsaurav123](https://github.com/kumarsaurav123) |  |
 |   | 00:38.340 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_AbstractKamen.java)| 21.0.1-open | [AbstractKamen](https://github.com/AbstractKamen) |  |
 |   | 00:41.982 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_criccomini.java)| 21.0.1-open | [Chris Riccomini](https://github.com/criccomini) |  |
 |   | 00:42.893 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_javamak.java)| 21.0.1-open | [javamak](https://github.com/javamak) |  |
diff --git a/calculate_average_giovannicuccu.sh b/calculate_average_giovannicuccu.sh
old mode 100644
new mode 100755
diff --git a/prepare_giovannicuccu.sh b/prepare_giovannicuccu.sh
old mode 100644
new mode 100755

From 3e1951379a6014ae8b711657786d0bf3eb725b0e Mon Sep 17 00:00:00 2001
From: Gunnar Morling <gunnar.morling@googlemail.com>
Date: Sun, 21 Jan 2024 17:07:43 +0100
Subject: [PATCH 089/268] #104 Running tests for PRs

---
 .github/workflows/maven.yml |  16 +++++
 .sdkmanrc                   |   3 +
 pom.xml                     |   1 +
 test_ci.sh                  | 124 ++++++++++++++++++++++++++++++++++++
 4 files changed, 144 insertions(+)
 create mode 100644 .sdkmanrc
 create mode 100755 test_ci.sh

diff --git a/.github/workflows/maven.yml b/.github/workflows/maven.yml
index b5f09651c..9c0bc3b85 100644
--- a/.github/workflows/maven.yml
+++ b/.github/workflows/maven.yml
@@ -45,5 +45,21 @@ jobs:
           key: ${{ runner.os }}-m2-${{ hashFiles('**/pom.xml') }}
           restore-keys: ${{ runner.os }}-m2
 
+      #- name: Cache SDKMan
+      #  id: cache-sdkman
+      #  uses: actions/cache@v4
+      #  with:
+      #    path: ~/.sdkman
+      #    key: ${{ runner.os }}-sdkman
+
       - name: 'Build project'
         run: mvn -B clean verify -Pci
+
+      - name: 'Setup SDKMAN'
+        uses: sdkman/sdkman-action@b1f9b696c79148b66d3d3a06f7ea801820318d0f
+        id: sdkman
+        if: github.event_name == 'pull_request'
+
+      - name: 'Test submission'
+        run: ./test_ci.sh ${{ github.event.pull_request.user.login }}
+        if: github.event_name == 'pull_request'
diff --git a/.sdkmanrc b/.sdkmanrc
new file mode 100644
index 000000000..dfb233bfa
--- /dev/null
+++ b/.sdkmanrc
@@ -0,0 +1,3 @@
+# Enable auto-env through the sdkman_auto_env config
+# Add key=value pairs of SDKs to use below
+java=21.0.1-open
diff --git a/pom.xml b/pom.xml
index 51226a3d6..a2c91e3bd 100644
--- a/pom.xml
+++ b/pom.xml
@@ -211,6 +211,7 @@
                 <exclude>github_users.txt</exclude>
                 <!-- Cliff asked to be named as the copyright holder for his entry; -->
                 <exclude>src/main/java/dev/morling/onebrc/CalculateAverage_cliffclick.java</exclude>
+                <exclude>.sdkmanrc</exclude>
               </excludes>
             </configuration>
             <executions>
diff --git a/test_ci.sh b/test_ci.sh
new file mode 100755
index 000000000..1f399e66e
--- /dev/null
+++ b/test_ci.sh
@@ -0,0 +1,124 @@
+#!/bin/bash
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+set -eo pipefail
+
+if [ -z "$1" ]
+  then
+    echo "Usage: test_ci.sh <fork name> (<fork name 2> ...)"
+    echo " for each fork, there must be a 'calculate_average_<fork name>.sh' script and an optional 'prepare_<fork name>.sh'."
+    exit 1
+fi
+
+BOLD_WHITE='\033[1;37m'
+CYAN='\033[0;36m'
+GREEN='\033[0;32m'
+PURPLE='\033[0;35m'
+BOLD_RED='\033[1;31m'
+RED='\033[0;31m'
+BOLD_YELLOW='\033[1;33m'
+RESET='\033[0m' # No Color
+
+MEASUREMENTS_FILE="measurements_10M.txt"
+RUNS=5
+DEFAULT_JAVA_VERSION="21.0.1-open"
+RUN_TIME_LIMIT=300 # seconds
+
+TIMEOUT=""
+if [ "$(uname -s)" == "Linux" ]; then
+  TIMEOUT="timeout -v $RUN_TIME_LIMIT"
+else # MacOs
+  if [ -x "$(command -v gtimeout)" ]; then
+    TIMEOUT="gtimeout -v $RUN_TIME_LIMIT" # from `brew install coreutils`
+  else
+    echo -e "${BOLD_YELLOW}WARNING${RESET} gtimeout not available, benchmark runs may take indefinitely long."
+  fi
+fi
+
+function check_command_installed {
+  if ! [ -x "$(command -v $1)" ]; then
+    echo "Error: $1 is not installed." >&2
+    exit 1
+  fi
+}
+
+function print_and_execute() {
+  echo "+ $@" >&2
+  "$@"
+}
+
+check_command_installed java
+
+# Validate that ./calculate_average_<fork>.sh exists for each fork
+for fork in "$@"; do
+  if [ ! -f "./calculate_average_$fork.sh" ]; then
+    echo -e "${BOLD_RED}ERROR${RESET}: ./calculate_average_$fork.sh does not exist." >&2
+    exit 1
+  fi
+done
+
+## SDKMAN Setup
+# 1. Custom check for sdkman: `sdk` is a shell function (defined by sdkman-init.sh), not an executable on PATH, so check_command_installed cannot detect it
+if [ ! -f "$HOME/.sdkman/bin/sdkman-init.sh" ]; then
+     echo -e "${BOLD_RED}ERROR${RESET}: sdkman is not installed." >&2
+    exit 1
+fi
+
+# 2. Init sdkman in this script
+source "$HOME/.sdkman/bin/sdkman-init.sh"
+
+# 3. make sure the default java version is installed
+if [ ! -d "$HOME/.sdkman/candidates/java/$DEFAULT_JAVA_VERSION" ]; then
+  print_and_execute sdk install java $DEFAULT_JAVA_VERSION
+fi
+
+# 4. Install missing SDK java versions in any of the prepare_*.sh scripts for the provided forks
+for fork in "$@"; do
+  if [ -f "./prepare_$fork.sh" ]; then
+    grep -h "^sdk use" "./prepare_$fork.sh" | cut -d' ' -f4 | while read -r version; do
+      if [ ! -d "$HOME/.sdkman/candidates/java/$version" ]; then
+        print_and_execute sdk install java $version
+      fi
+    done || true # grep returns exit code 1 when no match, `|| true` prevents the script from exiting early
+  fi
+done
+## END - SDKMAN Setup
+
+# Run tests and benchmark for each fork
+filetimestamp=$(date  +"%Y%m%d%H%M%S") # same for all fork.out files from this run
+failed=()
+for fork in "$@"; do
+  set +e # prepare.sh or test.sh failing must not abort via `set -e`; the test exit code is checked explicitly below
+
+  # Run prepare script
+  if [ -f "./prepare_$fork.sh" ]; then
+    print_and_execute source "./prepare_$fork.sh"
+  else
+    print_and_execute sdk use java $DEFAULT_JAVA_VERSION
+  fi
+
+  # Run the test suite
+  print_and_execute $TIMEOUT ./test.sh $fork
+  if [ $? -ne 0 ]; then
+    failed+=("$fork")
+    echo ""
+    echo -e "${BOLD_RED}FAILURE${RESET}: ./test.sh $fork failed"
+
+    exit 1
+  fi
+  echo ""
+done

From 7bfc7eaec67d35ae8f13f4fe858889420a2d972e Mon Sep 17 00:00:00 2001
From: Roman Musin <995612+roman-r-m@users.noreply.github.com>
Date: Sun, 21 Jan 2024 17:01:23 +0000
Subject: [PATCH 090/268] Reduce allocations and heap size (#525)

* Reduce allocations

* Shrink the heap size

* Calculate hash when reading name  (50-100ms difference)

* no need to reverse bytes

* bump heap size
---
 calculate_average_roman-r-m.sh                |  3 +-
 .../onebrc/CalculateAverage_roman_r_m.java    | 51 +++++++++++--------
 2 files changed, 31 insertions(+), 23 deletions(-)

diff --git a/calculate_average_roman-r-m.sh b/calculate_average_roman-r-m.sh
index fe468dcec..b5d0b3d7a 100755
--- a/calculate_average_roman-r-m.sh
+++ b/calculate_average_roman-r-m.sh
@@ -19,7 +19,6 @@ JAVA_OPTS="--enable-preview -XX:+UseTransparentHugePages"
 
 # epsilon GC needs enough memory or it makes things worse
 # see https://stackoverflow.com/questions/58087596/why-are-repeated-memory-allocations-observed-to-be-slower-using-epsilon-vs-g1
-# 2GB seems to be the sweet spot
-JAVA_OPTS="$JAVA_OPTS -XX:+UnlockExperimentalVMOptions -XX:-EnableJVMCI -XX:+UseEpsilonGC -Xmx2G -Xms2G -XX:+AlwaysPreTouch"
+JAVA_OPTS="$JAVA_OPTS -XX:+UnlockExperimentalVMOptions -XX:-EnableJVMCI -XX:+UseEpsilonGC -Xmx1G -Xms1G -XX:+AlwaysPreTouch"
 
 java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_roman_r_m
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_roman_r_m.java b/src/main/java/dev/morling/onebrc/CalculateAverage_roman_r_m.java
index a7df56e07..1a43ae5ef 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_roman_r_m.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_roman_r_m.java
@@ -82,19 +82,30 @@ public Worker(FileChannel channel, long start, long end) {
 
         private void parseName(ByteString station) {
             long start = offset;
-            long pattern;
             long next = UNSAFE.getLong(offset);
-            while ((pattern = applyPattern(next, SEMICOLON_MASK)) == 0) {
-                offset += 8;
-                next = UNSAFE.getLong(offset);
+            long pattern = applyPattern(next, SEMICOLON_MASK);
+            int bytes;
+            if (pattern != 0) {
+                bytes = Long.numberOfTrailingZeros(pattern) / 8;
+                offset += bytes;
+                long h = Long.reverseBytes(next) >>> (8 * (8 - bytes));
+                station.hash = (int) (h ^ (h >>> 32));
+            }
+            else {
+                long h = next;
+                station.hash = (int) (h ^ (h >>> 32));
+                while (pattern == 0) {
+                    offset += 8;
+                    next = UNSAFE.getLong(offset);
+                    pattern = applyPattern(next, SEMICOLON_MASK);
+                }
+                bytes = Long.numberOfTrailingZeros(pattern) / 8;
+                offset += bytes;
             }
-            int bytes = Long.numberOfTrailingZeros(pattern) / 8;
-            offset += bytes;
 
             int len = (int) (offset - start);
             station.offset = start;
             station.len = len;
-            station.hash = 0;
             station.tail = next & ((1L << (8 * bytes)) - 1);
 
             offset++;
@@ -215,11 +226,9 @@ static final class ByteString {
             this.ms = ms;
         }
 
-        @Override
-        public String toString() {
-            var bytes = new byte[len];
-            UNSAFE.copyMemory(null, offset, bytes, Unsafe.ARRAY_BYTE_BASE_OFFSET, len);
-            return new String(bytes, 0, len);
+        public String asString(byte[] reusable) {
+            UNSAFE.copyMemory(null, offset, reusable, Unsafe.ARRAY_BYTE_BASE_OFFSET, len);
+            return new String(reusable, 0, len);
         }
 
         public ByteString copy() {
@@ -243,9 +252,7 @@ public boolean equals(Object o) {
             if (len != that.len)
                 return false;
 
-            int i = 0;
-
-            for (; i + 7 < len; i += 8) {
+            for (int i = 0; i + 7 < len; i += 8) {
                 long l1 = UNSAFE.getLong(offset + i);
                 long l2 = UNSAFE.getLong(that.offset + i);
                 if (l1 != l2) {
@@ -257,13 +264,14 @@ public boolean equals(Object o) {
 
         @Override
         public int hashCode() {
-            if (hash == 0) {
-                long h = UNSAFE.getLong(offset);
-                h = Long.reverseBytes(h) >>> (8 * Math.max(0, 8 - len));
-                hash = (int) (h ^ (h >>> 32));
-            }
             return hash;
         }
+
+        @Override
+        public String toString() {
+            byte[] buf = new byte[100];
+            return asString(buf);
+        }
     }
 
     private static final class ResultRow {
@@ -318,10 +326,11 @@ ResultRow get(ByteString s) {
         }
 
         TreeMap<String, ResultRow> toMap() {
+            byte[] buf = new byte[100];
             var result = new TreeMap<String, ResultRow>();
             for (int i = 0; i < SIZE; i++) {
                 if (keys[i] != null) {
-                    result.put(keys[i].toString(), values[i]);
+                    result.put(keys[i].asString(buf), values[i]);
                 }
             }
             return result;

From d135bd95214d3c4a2cfa3f0bf70661cee2ebb17f Mon Sep 17 00:00:00 2001
From: Gunnar Morling <gunnar.morling@googlemail.com>
Date: Sun, 21 Jan 2024 18:01:52 +0100
Subject: [PATCH 091/268] Leaderboard update

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 0101ac2f1..ee357f2d1 100644
--- a/README.md
+++ b/README.md
@@ -57,7 +57,7 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:04.066 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_JesseVanRooy.java)| 21.0.1-open | [JesseVanRooy](https://github.com/JesseVanRooy) |  |
 |   | 00:04.154 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_iziamos.java)| 21.0.1-open | [John Ziamos](https://github.com/iziamos) |  |
 |   | 00:04.365 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ebarlas.java)| 21.0.1-graal | [Elliot Barlas](https://github.com/ebarlas) |  |
-|   | 00:04.627 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_roman-r-m.java)| 21.0.1-graal | [Roman Musin](https://github.com/roman-r-m) |  |
+|   | 00:04.551 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_roman_r_m.java)| 21.0.1-graal | [Roman Musin](https://github.com/roman-r-m) |  |
 |   | 00:04.741 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_cliffclick.java)| 21.0.1-open | [Cliff Click](https://github.com/cliffclick) |  |
 |   | 00:04.823 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_JamalMulla.java)| 21.0.1-graal | [Jamal Mulla](https://github.com/JamalMulla) |  |
 |   | 00:04.920 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_vemana.java)| 21.0.1-graal | [Subrahmanyam](https://github.com/vemana) |  |

From 47269cf30badaa035d3fac4ca5dce94cbb3056b1 Mon Sep 17 00:00:00 2001
From: Elliot Barlas <elliotbarlas@gmail.com>
Date: Sun, 21 Jan 2024 10:38:32 -0800
Subject: [PATCH 092/268] Adjust rolling hash function to operate at int-scale
 rather than byte-scale. Ensure 8-byte alignment in key buffer for faster
 comparisons. (#523)

---
 .../onebrc/CalculateAverage_ebarlas.java      | 39 ++++++++-----------
 1 file changed, 17 insertions(+), 22 deletions(-)

diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_ebarlas.java b/src/main/java/dev/morling/onebrc/CalculateAverage_ebarlas.java
index c1ca6faac..87bba124b 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_ebarlas.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_ebarlas.java
@@ -32,11 +32,10 @@ public class CalculateAverage_ebarlas {
 
     private static final Arena ARENA = Arena.global();
 
-    private static final int MAX_KEY_SIZE = 100;
+    private static final int MAX_KEY_SIZE = 104; // 4 additional bytes to allow for single-int overflow due to padding
     private static final int MAX_VAL_SIZE = 5; // -dd.d
     private static final int MAX_LINE_SIZE = MAX_KEY_SIZE + MAX_VAL_SIZE + 2; // key, semicolon, val, newline
-    private static final int HASH_FACTOR = 433;
-    private static final int HASH_TBL_SIZE = 32_767; // range of allowed hash values, inclusive
+    private static final int HASH_TBL_SIZE = 131_071; // range of allowed hash values, inclusive
 
     private static final Unsafe UNSAFE = makeUnsafe();
 
@@ -182,7 +181,6 @@ private static long doProcessSegment(MemorySegment ms, long offset, Stats[] stat
         long lineStart = cursor; // start of key in segment used for footer calc
         long limit = ms.address() + (complete ? ms.byteSize() : ms.byteSize() - MAX_LINE_SIZE); // stop short of longest line, sweep up at the end
         while (cursor < limit) { // one line per iteration
-            lineStart = cursor; // preserve line start
             int keyHash = 0; // key hash code
             long keyAddr = keyBaseAddr; // address for next int
             int keyArrLen = 0; // number of key 4-byte ints
@@ -196,6 +194,7 @@ private static long doProcessSegment(MemorySegment ms, long offset, Stats[] stat
                 b2 = (byte) ((n >> 16) & 0xFF);
                 b3 = (byte) ((n >> 24) & 0xFF);
                 if (b0 == ';') { // ...;1.1
+                    UNSAFE.putInt(keyAddr, 0); // always pad with extra int to facilitate 8-byte aligned comparisons
                     keyLastBytes = 4;
                     b0 = b1;
                     b1 = b2;
@@ -205,10 +204,10 @@ private static long doProcessSegment(MemorySegment ms, long offset, Stats[] stat
                 }
                 else if (b1 == ';') { // ...a;1.1
                     int k = n & 0xFF;
-                    UNSAFE.putInt(keyAddr, k);
+                    UNSAFE.putLong(keyAddr, k); // pad with extra int for comparison alignment
                     keyLastBytes = 1;
                     keyArrLen++;
-                    keyHash = HASH_FACTOR * keyHash + b0;
+                    keyHash += k;
                     b0 = b2;
                     b1 = b3;
                     b2 = (byte) (UNSAFE.getByte(cursor++) & 0xFF);
@@ -217,10 +216,10 @@ private static long doProcessSegment(MemorySegment ms, long offset, Stats[] stat
                 }
                 else if (b2 == ';') { // ...ab;1.1
                     int k = n & 0xFFFF;
-                    UNSAFE.putInt(keyAddr, k);
+                    UNSAFE.putLong(keyAddr, k); // pad with extra int for comparison alignment
                     keyLastBytes = 2;
                     keyArrLen++;
-                    keyHash = HASH_FACTOR * (HASH_FACTOR * keyHash + b0) + b1;
+                    keyHash += k;
                     b0 = b3;
                     b1 = (byte) (UNSAFE.getByte(cursor++) & 0xFF);
                     b2 = (byte) (UNSAFE.getByte(cursor++) & 0xFF);
@@ -229,10 +228,10 @@ private static long doProcessSegment(MemorySegment ms, long offset, Stats[] stat
                 }
                 else if (b3 == ';') { // ...abc;1.1
                     int k = n & 0xFFFFFF;
-                    UNSAFE.putInt(keyAddr, k);
+                    UNSAFE.putLong(keyAddr, k); // pad with extra int for comparison alignment
                     keyLastBytes = 3;
                     keyArrLen++;
-                    keyHash = HASH_FACTOR * (HASH_FACTOR * (HASH_FACTOR * keyHash + b0) + b1) + b2;
+                    keyHash += k;
                     n = UNSAFE.getInt(cursor);
                     cursor += 4;
                     b0 = (byte) (n & 0xFF);
@@ -245,9 +244,10 @@ private static long doProcessSegment(MemorySegment ms, long offset, Stats[] stat
                     UNSAFE.putInt(keyAddr, n);
                     keyArrLen++;
                     keyAddr += 4;
-                    keyHash = HASH_FACTOR * (HASH_FACTOR * (HASH_FACTOR * (HASH_FACTOR * keyHash + b0) + b1) + b2) + b3;
+                    keyHash += n;
                 }
             }
+            keyHash ^= keyHash >>> 13;
             var idx = keyHash & HASH_TBL_SIZE;
             var st = stats[idx];
             if (st == null) { // nothing in table, eagerly claim spot
@@ -281,6 +281,7 @@ else if (!equals(st.keyAddr, st.keyLen, keyBaseAddr, keyArrLen)) {
             st.max = Math.max(st.max, val);
             st.sum += val;
             st.count++;
+            lineStart = cursor; // preserve line start
         }
         return lineStart - ms.address();
     }
@@ -289,21 +290,15 @@ private static boolean equals(long key1, int len1, long key2, int len2) {
         if (len1 != len2) {
             return false;
         }
-        if (len1 == 2) {
+        if (len1 <= 2) {
             return UNSAFE.getLong(key1) == UNSAFE.getLong(key2);
         }
-        if (len1 == 3) {
-            return UNSAFE.getLong(key1) == UNSAFE.getLong(key2) && UNSAFE.getInt(key1 + 8) == UNSAFE.getInt(key2 + 8);
-        }
-        if (len1 == 1) {
-            return UNSAFE.getInt(key1) == UNSAFE.getInt(key2);
-        }
-        if (len1 == 4) {
+        if (len1 <= 4) {
             return UNSAFE.getLong(key1) == UNSAFE.getLong(key2) && UNSAFE.getLong(key1 + 8) == UNSAFE.getLong(key2 + 8);
         }
-        for (int i = 0; i < len1; i++) {
+        for (int i = 0; i < len1; i += 2) {
             var offset = i << 2;
-            if (UNSAFE.getInt(key1 + offset) != UNSAFE.getInt(key2 + offset)) {
+            if (UNSAFE.getLong(key1 + offset) != UNSAFE.getLong(key2 + offset)) {
                 return false;
             }
         }
@@ -324,7 +319,7 @@ private static Stats findInTable(Stats[] stats, int hash, long keyAddr, int keyL
     }
 
     private static Stats newStats(long keyAddr, int keyLen, int keyLastBytes, int hash) {
-        var bytes = keyLen << 2;
+        var bytes = (keyLen + 1) << 2; // include overflow chunk
         long k = UNSAFE.allocateMemory(bytes);
         UNSAFE.copyMemory(keyAddr, k, bytes);
         return new Stats(k, keyLen, keyLastBytes, hash);

From 6b95ac6113ccbe17c951980affbba801ac8efe50 Mon Sep 17 00:00:00 2001
From: Artsiom Korzun <72259616+artsiomkorzun@users.noreply.github.com>
Date: Sun, 21 Jan 2024 19:54:43 +0100
Subject: [PATCH 093/268] optimize branches (#534)

---
 .../CalculateAverage_artsiomkorzun.java       | 23 ++++++++++++-------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java b/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java
index 13731546f..ca76d10ea 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java
@@ -140,6 +140,7 @@ private static class Aggregates {
 
         private static final int ENTRIES = 64 * 1024;
         private static final int SIZE = 128 * ENTRIES;
+        private static final int MASK = (ENTRIES - 1) << 7;
 
         private final ByteBuffer buffer = allocate(SIZE);
         private final long pointer = address(buffer);
@@ -261,7 +262,7 @@ private static void alloc(long reference, int length, int hash, long address) {
         }
 
         private static int offset(int hash) {
-            return ((hash) & (ENTRIES - 1)) << 7;
+            return hash & MASK;
         }
 
         private static int next(int prev) {
@@ -361,7 +362,6 @@ private static void aggregate(Aggregates aggregates, long position, long limit)
                 int length;
                 int hash;
 
-                long ptr = 0;
                 long word = word(position);
                 long separator = separator(word);
 
@@ -369,7 +369,12 @@ private static void aggregate(Aggregates aggregates, long position, long limit)
                     length = length(separator);
                     word = mask(word, separator);
                     hash = mix(word);
-                    ptr = aggregates.find(word, hash);
+                    long ptr = aggregates.find(word, hash);
+
+                    if (ptr != 0) {
+                        position = update(ptr, position + length);
+                        continue;
+                    }
                 }
                 else {
                     long word0 = word;
@@ -380,7 +385,12 @@ private static void aggregate(Aggregates aggregates, long position, long limit)
                         length = length(separator) + 8;
                         word = mask(word, separator);
                         hash = mix(word ^ word0);
-                        ptr = aggregates.find(word0, word, hash);
+                        long ptr = aggregates.find(word0, word, hash);
+
+                        if (ptr != 0) {
+                            position = update(ptr, position + length);
+                            continue;
+                        }
                     }
                     else {
                         length = 16;
@@ -404,10 +414,7 @@ private static void aggregate(Aggregates aggregates, long position, long limit)
                     }
                 }
 
-                if (ptr == 0) {
-                    ptr = aggregates.put(position, word, length, hash);
-                }
-
+                long ptr = aggregates.put(position, word, length, hash);
                 position = update(ptr, position + length);
             }
         }

From d0a28599c293d3afe3291fc3cf169a7b25ae9ae6 Mon Sep 17 00:00:00 2001
From: Thomas Wuerthinger <thomas.wuerthinger@oracle.com>
Date: Sun, 21 Jan 2024 20:13:48 +0100
Subject: [PATCH 094/268] Tuning and subprocess spawn for thomaswue (#533)

* Some cleanup, small-scale tuning, and reduced complexity when handling longer names.

* Do the actual work in a worker subprocess. The main process returns immediately
and OS cleanup of the mmap continues in the subprocess.

* Update minor Graal version after CPU release.

* Turn GC back to epsilon GC (although it does not seem to make a
difference).

* Minor tuning for another +1%.
---
 prepare_thomaswue.sh                          |   4 +-
 .../onebrc/CalculateAverage_thomaswue.java    | 167 ++++++++++--------
 2 files changed, 99 insertions(+), 72 deletions(-)

diff --git a/prepare_thomaswue.sh b/prepare_thomaswue.sh
index 1c6be6494..32616a958 100755
--- a/prepare_thomaswue.sh
+++ b/prepare_thomaswue.sh
@@ -16,11 +16,11 @@
 #
 
 source "$HOME/.sdkman/bin/sdkman-init.sh"
-sdk use java 21.0.1-graal 1>&2
+sdk use java 21.0.2-graal 1>&2
 
 # ./mvnw clean verify removes target/ and will re-trigger native image creation.
 if [ ! -f target/CalculateAverage_thomaswue_image ]; then
-    NATIVE_IMAGE_OPTS="--gc=epsilon -O3 -march=native --enable-preview"
+    NATIVE_IMAGE_OPTS="--gc=epsilon -O3 -march=native --enable-preview -H:InlineAllBonus=10 -H:-ParseRuntimeOptions --initialize-at-build-time=dev.morling.onebrc.CalculateAverage_thomaswue\$Scanner"
     # Use -H:MethodFilter=CalculateAverage_thomaswue.* -H:Dump=:2 -H:PrintGraph=Network for IdealGraphVisualizer graph dumping.
     native-image $NATIVE_IMAGE_OPTS -cp target/average-1.0.0-SNAPSHOT.jar -o target/CalculateAverage_thomaswue_image dev.morling.onebrc.CalculateAverage_thomaswue
 fi
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_thomaswue.java b/src/main/java/dev/morling/onebrc/CalculateAverage_thomaswue.java
index 041c17ca9..406c85d38 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_thomaswue.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_thomaswue.java
@@ -15,13 +15,10 @@
  */
 package dev.morling.onebrc;
 
-import sun.misc.Unsafe;
-
 import java.io.IOException;
-import java.lang.foreign.Arena;
-import java.lang.reflect.Field;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
 import java.nio.channels.FileChannel;
-import java.nio.channels.FileChannel.MapMode;
 import java.nio.charset.StandardCharsets;
 import java.nio.file.Path;
 import java.nio.file.StandardOpenOption;
@@ -39,18 +36,20 @@
  */
 public class CalculateAverage_thomaswue {
     private static final String FILE = "./measurements.txt";
+    private static final int MIN_TEMP = -999;
+    private static final int MAX_TEMP = 999;
 
     // Holding the current result for a single city.
     private static class Result {
-        long lastNameLong, secondLastNameLong, nameAddress;
-        int nameLength, remainingShift;
-        int min, max, count;
+        long lastNameLong, secondLastNameLong;
+        long[] name;
+        int count;
+        short min, max;
         long sum;
 
-        private Result(long nameAddress) {
-            this.nameAddress = nameAddress;
-            this.min = Integer.MAX_VALUE;
-            this.max = Integer.MIN_VALUE;
+        private Result() {
+            this.min = MAX_TEMP;
+            this.max = MIN_TEMP;
         }
 
         public String toString() {
@@ -63,18 +62,32 @@ private static double round(double value) {
 
         // Accumulate another result into this one.
         private void add(Result other) {
-            min = Math.min(min, other.min);
-            max = Math.max(max, other.max);
+            if (other.min < min) {
+                min = other.min;
+            }
+            if (other.max > max) {
+                max = other.max;
+            }
             sum += other.sum;
             count += other.count;
         }
 
         public String calcName() {
-            return new Scanner(nameAddress, nameAddress + nameLength).getString(nameLength);
+            ByteBuffer bb = ByteBuffer.allocate(name.length * Long.BYTES).order(ByteOrder.nativeOrder());
+            bb.asLongBuffer().put(name);
+            byte[] array = bb.array();
+            int i = 0;
+            while (array[i++] != ';')
+                ;
+            return new String(array, 0, i - 1, StandardCharsets.UTF_8);
         }
     }
 
     public static void main(String[] args) throws IOException {
+        if (args.length == 0 || !("--worker".equals(args[0]))) {
+            spawnWorker();
+            return;
+        }
         // Calculate input segments.
         int numberOfChunks = Runtime.getRuntime().availableProcessors();
         long[] chunks = getSegments(numberOfChunks);
@@ -93,6 +106,22 @@ public static void main(String[] args) throws IOException {
 
         // Final output.
         System.out.println(accumulateResults(allResults));
+        System.out.close();
+    }
+
+    private static void spawnWorker() throws IOException {
+        ProcessHandle.Info info = ProcessHandle.current().info();
+        ArrayList<String> workerCommand = new ArrayList<>();
+        info.command().ifPresent(workerCommand::add);
+        info.arguments().ifPresent(args -> workerCommand.addAll(Arrays.asList(args)));
+        workerCommand.add("--worker");
+        new ProcessBuilder()
+                .command(workerCommand)
+                .inheritIO()
+                .redirectOutput(ProcessBuilder.Redirect.PIPE)
+                .start()
+                .getInputStream()
+                .transferTo(System.out);
     }
 
     // Accumulate results sequentially for simplicity.
@@ -115,20 +144,21 @@ private static Result[] parseLoop(long chunkStart, long chunkEnd) {
         Result[] results = new Result[1 << 17];
         Scanner scanner = new Scanner(chunkStart, chunkEnd);
         long word = scanner.getLong();
-        int pos = findDelimiter(word);
+        long pos = findDelimiter(word);
         while (scanner.hasNext()) {
             long nameAddress = scanner.pos();
             long hash = 0;
 
             // Search for ';', one long at a time.
-            if (pos != 8) {
+            if (pos != 0) {
+                pos = Long.numberOfTrailingZeros(pos) >>> 3;
                 scanner.add(pos);
                 word = mask(word, pos);
                 hash = word;
 
                 int number = scanNumber(scanner);
                 long nextWord = scanner.getLong();
-                int nextPos = findDelimiter(nextWord);
+                long nextPos = findDelimiter(nextWord);
 
                 Result existingResult = results[hashToIndex(hash, results)];
                 if (existingResult != null && existingResult.lastNameLong == word) {
@@ -142,11 +172,12 @@ private static Result[] parseLoop(long chunkStart, long chunkEnd) {
             }
             else {
                 scanner.add(8);
-                hash ^= word;
+                hash = word;
                 long prevWord = word;
                 word = scanner.getLong();
                 pos = findDelimiter(word);
-                if (pos != 8) {
+                if (pos != 0) {
+                    pos = Long.numberOfTrailingZeros(pos) >>> 3;
                     scanner.add(pos);
                     word = mask(word, pos);
                     hash ^= word;
@@ -166,7 +197,8 @@ private static Result[] parseLoop(long chunkStart, long chunkEnd) {
                     while (true) {
                         word = scanner.getLong();
                         pos = findDelimiter(word);
-                        if (pos != 8) {
+                        if (pos != 0) {
+                            pos = Long.numberOfTrailingZeros(pos) >>> 3;
                             scanner.add(pos);
                             word = mask(word, pos);
                             hash ^= word;
@@ -182,12 +214,7 @@ private static Result[] parseLoop(long chunkStart, long chunkEnd) {
 
             // Save length of name for later.
             int nameLength = (int) (scanner.pos() - nameAddress);
-            scanner.add(1);
-
-            long numberWord = scanner.getLong();
-            int decimalSepPos = Long.numberOfTrailingZeros(~numberWord & 0x10101000);
-            int number = convertIntoNumber(decimalSepPos, numberWord);
-            scanner.add((decimalSepPos >>> 3) + 3);
+            int number = scanNumber(scanner);
 
             // Final calculation for index into hash table.
             int tableIndex = hashToIndex(hash, results);
@@ -198,13 +225,16 @@ private static Result[] parseLoop(long chunkStart, long chunkEnd) {
                 }
                 // Check for collision.
                 int i = 0;
+                int namePos = 0;
                 for (; i < nameLength + 1 - 8; i += 8) {
-                    if (scanner.getLongAt(existingResult.nameAddress + i) != scanner.getLongAt(nameAddress + i)) {
+                    if (namePos >= existingResult.name.length || existingResult.name[namePos++] != scanner.getLongAt(nameAddress + i)) {
                         tableIndex = (tableIndex + 31) & (results.length - 1);
                         continue outer;
                     }
                 }
-                if (((existingResult.lastNameLong ^ scanner.getLongAt(nameAddress + i)) << existingResult.remainingShift) == 0) {
+
+                int remainingShift = (64 - (nameLength + 1 - i) << 3);
+                if (((existingResult.lastNameLong ^ (scanner.getLongAt(nameAddress + i) << remainingShift)) == 0)) {
                     record(existingResult, number);
                     break;
                 }
@@ -230,63 +260,67 @@ private static int scanNumber(Scanner scanPtr) {
     }
 
     private static void record(Result existingResult, int number) {
-        existingResult.min = Math.min(existingResult.min, number);
-        existingResult.max = Math.max(existingResult.max, number);
+        if (number < existingResult.min) {
+            existingResult.min = (short) number;
+        }
+        if (number > existingResult.max) {
+            existingResult.max = (short) number;
+        }
         existingResult.sum += number;
         existingResult.count++;
     }
 
     private static int hashToIndex(long hash, Result[] results) {
         int hashAsInt = (int) (hash ^ (hash >>> 28));
-        int finalHash = (hashAsInt ^ (hashAsInt >>> 15));
+        int finalHash = (hashAsInt ^ (hashAsInt >>> 17));
         return (finalHash & (results.length - 1));
     }
 
-    private static long mask(long word, int pos) {
-        return word & (-1L >>> ((8 - pos - 1) << 3));
+    private static long mask(long word, long pos) {
+        return (word << ((7 - pos) << 3));
     }
 
-    // Special method to convert a number in the specific format into an int value without branches created by
-    // Quan Anh Mai.
+    // Special method to convert a number in the ascii number into an int without branches created by Quan Anh Mai.
     private static int convertIntoNumber(int decimalSepPos, long numberWord) {
         int shift = 28 - decimalSepPos;
         // signed is -1 if negative, 0 otherwise
         long signed = (~numberWord << 59) >> 63;
         long designMask = ~(signed & 0xFF);
-        // Align the number to a specific position and transform the ascii code
-        // to actual digit value in each byte
+        // Align the number to a specific position and transform the ascii to digit value
         long digits = ((numberWord & designMask) << shift) & 0x0F000F0F00L;
-
         // Now digits is in the form 0xUU00TTHH00 (UU: units digit, TT: tens digit, HH: hundreds digit)
         // 0xUU00TTHH00 * (100 * 0x1000000 + 10 * 0x10000 + 1) =
-        // 0x000000UU00TTHH00 +
-        // 0x00UU00TTHH000000 * 10 +
-        // 0xUU00TTHH00000000 * 100
-        // Now TT * 100 has 2 trailing zeroes and HH * 100 + TT * 10 + UU < 0x400
-        // This results in our value lies in the bit 32 to 41 of this product
-        // That was close :)
+        // 0x000000UU00TTHH00 + 0x00UU00TTHH000000 * 10 + 0xUU00TTHH00000000 * 100
         long absValue = ((digits * 0x640a0001) >>> 32) & 0x3FF;
         long value = (absValue ^ signed) - signed;
         return (int) value;
     }
 
-    private static int findDelimiter(long word) {
+    private static long findDelimiter(long word) {
         long input = word ^ 0x3B3B3B3B3B3B3B3BL;
         long tmp = (input - 0x0101010101010101L) & ~input & 0x8080808080808080L;
-        return Long.numberOfTrailingZeros(tmp) >>> 3;
+        return tmp;
     }
 
     private static Result newEntry(Result[] results, long nameAddress, int hash, int nameLength, Scanner scanner) {
-        Result r = new Result(nameAddress);
+        Result r = new Result();
         results[hash] = r;
-
+        long[] name = new long[(nameLength / Long.BYTES) + 1];
+        int pos = 0;
         int i = 0;
-        for (; i < nameLength + 1 - 8; i += 8) {
-            r.secondLastNameLong = (scanner.getLongAt(nameAddress + i));
+        for (; i < nameLength + 1 - Long.BYTES; i += Long.BYTES) {
+            name[pos++] = scanner.getLongAt(nameAddress + i);
+        }
+
+        if (pos > 0) {
+            r.secondLastNameLong = name[pos - 1];
         }
-        r.remainingShift = (64 - (nameLength + 1 - i) << 3);
-        r.lastNameLong = (scanner.getLongAt(nameAddress + i) & (-1L >>> r.remainingShift));
-        r.nameLength = nameLength;
+
+        int remainingShift = (64 - (nameLength + 1 - i) << 3);
+        long lastWord = (scanner.getLongAt(nameAddress + i) << remainingShift);
+        r.lastNameLong = lastWord;
+        name[pos] = lastWord >> remainingShift;
+        r.name = name;
         return r;
     }
 
@@ -295,16 +329,15 @@ private static long[] getSegments(int numberOfChunks) throws IOException {
             long fileSize = fileChannel.size();
             long segmentSize = (fileSize + numberOfChunks - 1) / numberOfChunks;
             long[] chunks = new long[numberOfChunks + 1];
-            long mappedAddress = fileChannel.map(MapMode.READ_ONLY, 0, fileSize, Arena.global()).address();
+            long mappedAddress = fileChannel.map(FileChannel.MapMode.READ_ONLY, 0, fileSize, java.lang.foreign.Arena.global()).address();
             chunks[0] = mappedAddress;
             long endAddress = mappedAddress + fileSize;
             Scanner s = new Scanner(mappedAddress, mappedAddress + fileSize);
             for (int i = 1; i < numberOfChunks; ++i) {
                 long chunkAddress = mappedAddress + i * segmentSize;
                 // Align to first row start.
-                while (chunkAddress < endAddress && (s.getLongAt(chunkAddress++) & 0xFF) != '\n') {
-                    // nop
-                }
+                while (chunkAddress < endAddress && (s.getLongAt(chunkAddress++) & 0xFF) != '\n')
+                    ;
                 chunks[i] = Math.min(chunkAddress, endAddress);
             }
             chunks[numberOfChunks] = endAddress;
@@ -314,13 +347,13 @@ private static long[] getSegments(int numberOfChunks) throws IOException {
 
     private static class Scanner {
 
-        private static final Unsafe UNSAFE = initUnsafe();
+        private static final sun.misc.Unsafe UNSAFE = initUnsafe();
 
-        private static Unsafe initUnsafe() {
+        private static sun.misc.Unsafe initUnsafe() {
             try {
-                Field theUnsafe = Unsafe.class.getDeclaredField("theUnsafe");
+                java.lang.reflect.Field theUnsafe = sun.misc.Unsafe.class.getDeclaredField("theUnsafe");
                 theUnsafe.setAccessible(true);
-                return (Unsafe) theUnsafe.get(Unsafe.class);
+                return (sun.misc.Unsafe) theUnsafe.get(sun.misc.Unsafe.class);
             }
             catch (NoSuchFieldException | IllegalAccessException e) {
                 throw new RuntimeException(e);
@@ -342,7 +375,7 @@ long pos() {
             return pos;
         }
 
-        void add(int delta) {
+        void add(long delta) {
             pos += delta;
         }
 
@@ -354,13 +387,7 @@ long getLongAt(long pos) {
             return UNSAFE.getLong(pos);
         }
 
-        public String getString(int nameLength) {
-            byte[] bytes = new byte[nameLength];
-            UNSAFE.copyMemory(null, pos, bytes, Unsafe.ARRAY_BYTE_BASE_OFFSET, nameLength);
-            return new String(bytes, StandardCharsets.UTF_8);
-        }
-
-        public void setPos(long l) {
+        void setPos(long l) {
             this.pos = l;
         }
     }

From d8b071c878b8a3a7cfd9451fe8c6bd9deb4d3ebf Mon Sep 17 00:00:00 2001
From: Roy van Rijn <roy.van.rijn@gmail.com>
Date: Sun, 21 Jan 2024 11:15:07 -0800
Subject: [PATCH 095/268] Reverting ByteBuffer idea, using Thomas's trick
 instead. (#538)

---
 prepare_royvanrijn.sh                         |  2 +-
 .../onebrc/CalculateAverage_royvanrijn.java   | 44 +++++++++++++++----
 2 files changed, 37 insertions(+), 9 deletions(-)

diff --git a/prepare_royvanrijn.sh b/prepare_royvanrijn.sh
index ba89535d7..a9789d6c0 100755
--- a/prepare_royvanrijn.sh
+++ b/prepare_royvanrijn.sh
@@ -16,7 +16,7 @@
 #
 
 source "$HOME/.sdkman/bin/sdkman-init.sh"
-sdk use java 21.0.1-graal 1>&2
+sdk use java 21.0.2-graal 1>&2
 
 # ./mvnw clean verify removes target/ and will re-trigger native image creation.
 if [ ! -f target/CalculateAverage_royvanrijn_image ]; then
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_royvanrijn.java b/src/main/java/dev/morling/onebrc/CalculateAverage_royvanrijn.java
index b392e5801..68004565c 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_royvanrijn.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_royvanrijn.java
@@ -15,12 +15,15 @@
  */
 package dev.morling.onebrc;
 
+import java.io.IOException;
 import java.lang.foreign.Arena;
 import java.lang.reflect.Field;
 import java.nio.channels.FileChannel;
 import java.nio.charset.StandardCharsets;
 import java.nio.file.Path;
 import java.nio.file.StandardOpenOption;
+import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.Map;
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.stream.Collectors;
@@ -102,8 +105,27 @@ public class CalculateAverage_royvanrijn {
     private static final int TABLE_SIZE = 1 << 19; // large enough for the contest.
     private static final int TABLE_MASK = (TABLE_SIZE - 1);
 
-    public static void main(String[] args) throws Exception {
+    // Idea of thomaswue, don't wait for slow unmap:
+    private static void spawnWorker() throws IOException {
+        ProcessHandle.Info info = ProcessHandle.current().info();
+        ArrayList<String> workerCommand = new ArrayList<>();
+        info.command().ifPresent(workerCommand::add);
+        info.arguments().ifPresent(args -> workerCommand.addAll(Arrays.asList(args)));
+        workerCommand.add("--worker");
+        new ProcessBuilder()
+                .command(workerCommand)
+                .inheritIO()
+                .redirectOutput(ProcessBuilder.Redirect.PIPE)
+                .start()
+                .getInputStream()
+                .transferTo(System.out);
+    }
 
+    public static void main(String[] args) throws Exception {
+        if (args.length == 0 || !("--worker".equals(args[0]))) {
+            spawnWorker();
+            return;
+        }
         // Calculate input segments.
         final FileChannel fileChannel = FileChannel.open(Path.of(FILE), StandardOpenOption.READ);
         final long fileSize = fileChannel.size();
@@ -159,7 +181,7 @@ public static void main(String[] args) throws Exception {
                         .collect(Collectors.joining(", ")));
         System.out.println("}");
 
-        // System.out.println(measurements.entrySet().stream().mapToLong(e -> UNSAFE.getInt(e.getValue(), ENTRY_COUNT + Unsafe.ARRAY_BYTE_BASE_OFFSET)).sum());
+        System.out.close(); // close the stream to stop
     }
 
     private static byte[] fillEntry(final byte[] entry, final long fromAddress, final int length, final int temp) {
@@ -176,15 +198,15 @@ public static void updateEntry(final byte[] entry, final int temp) {
 
         int entryMin = UNSAFE.getInt(entry, ENTRY_MIN);
         int entryMax = UNSAFE.getInt(entry, ENTRY_MAX);
-
-        entryMin = Math.min(temp, entryMin);
-        entryMax = Math.max(temp, entryMax);
-
         long entrySum = UNSAFE.getLong(entry, ENTRY_SUM) + temp;
         int entryCount = UNSAFE.getInt(entry, ENTRY_COUNT) + 1;
 
-        UNSAFE.putInt(entry, ENTRY_MIN, entryMin);
-        UNSAFE.putInt(entry, ENTRY_MAX, entryMax);
+        if (temp < entryMin) {
+            UNSAFE.putInt(entry, ENTRY_MIN, temp);
+        }
+        else if (temp > entryMax) {
+            UNSAFE.putInt(entry, ENTRY_MAX, temp);
+        }
         UNSAFE.putInt(entry, ENTRY_COUNT, entryCount);
         UNSAFE.putLong(entry, ENTRY_SUM, entrySum);
     }
@@ -435,6 +457,8 @@ private static byte[][] processMemoryArea(final long startAddress, final long en
             reader.processStart();
 
             if (!reader.readFirst()) {
+                // Found delimiter in first 8 bytes:
+
                 int temperature = reader.processEndAndGetTemperature();
 
                 // Find or insert the entry:
@@ -462,6 +486,7 @@ else if (reader.matchesEnding(entry)) {
                 reader.processName();
 
                 if (!reader.readNext()) {
+                    // Found delimiter in 8-16 bytes:
 
                     int temperature = reader.processEndAndGetTemperature();
 
@@ -490,6 +515,8 @@ else if (reader.matchesEntryShort(entry)) {
                     reader.processName();
 
                     if (!reader.readNext()) {
+                        // Found delimiter in 16-24 bytes:
+
                         int temperature = reader.processEndAndGetTemperature();
 
                         // Find or insert the entry:
@@ -515,6 +542,7 @@ else if (reader.matchesEntryMedium(entry)) {
 
                     }
                     else {
+                        // Need more than 24 bytes:
 
                         reader.processName();
                         while (reader.readNext()) {

From ac4805ee45d9d626d0ef93d3cbe6191b81d9e393 Mon Sep 17 00:00:00 2001
From: Artsiom Korzun <72259616+artsiomkorzun@users.noreply.github.com>
Date: Sun, 21 Jan 2024 20:23:48 +0100
Subject: [PATCH 096/268] subprocess spawner (#542)

---
 calculate_average_artsiomkorzun.sh            |   4 +-
 prepare_artsiomkorzun.sh                      |   2 +-
 .../CalculateAverage_artsiomkorzun.java       | 115 +++++++++++-------
 3 files changed, 76 insertions(+), 45 deletions(-)

diff --git a/calculate_average_artsiomkorzun.sh b/calculate_average_artsiomkorzun.sh
index 977b6e320..d9c18284e 100755
--- a/calculate_average_artsiomkorzun.sh
+++ b/calculate_average_artsiomkorzun.sh
@@ -17,9 +17,9 @@
 
 if [ -f target/CalculateAverage_artsiomkorzun_image ]; then
     echo "Picking up existing native image 'target/CalculateAverage_artsiomkorzun_image', delete the file to select JVM mode." 1>&2
-    target/CalculateAverage_artsiomkorzun_image -XX:MaxDirectMemorySize=4294967296
+    target/CalculateAverage_artsiomkorzun_image
 else
-    JAVA_OPTS="--enable-preview -Xmx128m -XX:+UseSerialGC -XX:-TieredCompilation -XX:MaxDirectMemorySize=4294967296"
+    JAVA_OPTS="--enable-preview -Xmx128m -XX:+UseSerialGC -XX:-TieredCompilation"
     echo "Chosing to run the app in JVM mode as no native image was found, use prepare_artsiomkorzun.sh to generate." 1>&2
     java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_artsiomkorzun
 fi
\ No newline at end of file
diff --git a/prepare_artsiomkorzun.sh b/prepare_artsiomkorzun.sh
index 9ae693a79..984048691 100755
--- a/prepare_artsiomkorzun.sh
+++ b/prepare_artsiomkorzun.sh
@@ -16,7 +16,7 @@
 #
 
 source "$HOME/.sdkman/bin/sdkman-init.sh"
-sdk use java 21.0.1-graal 1>&2
+sdk use java 21.0.2-graal 1>&2
 
 if [ ! -f target/CalculateAverage_artsiomkorzun_image ]; then
     NATIVE_IMAGE_OPTS="--gc=epsilon -O3 -march=native -R:MaxHeapSize=64m --enable-preview --initialize-at-build-time=dev.morling.onebrc.CalculateAverage_artsiomkorzun"
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java b/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java
index ca76d10ea..40b8db05a 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java
@@ -17,12 +17,14 @@
 
 import sun.misc.Unsafe;
 
+import java.lang.foreign.Arena;
+import java.lang.foreign.MemorySegment;
 import java.lang.reflect.Field;
-import java.nio.Buffer;
-import java.nio.ByteBuffer;
 import java.nio.channels.FileChannel;
-import java.nio.file.Files;
 import java.nio.file.Path;
+import java.nio.file.StandardOpenOption;
+import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.Map;
 import java.util.TreeMap;
 import java.util.concurrent.atomic.AtomicInteger;
@@ -31,21 +33,19 @@
 public class CalculateAverage_artsiomkorzun {
 
     private static final Path FILE = Path.of("./measurements.txt");
-    private static final int SEGMENT_SIZE = 4 * 1024 * 1024;
-    private static final int SEGMENT_OVERLAP = 128;
+    private static final long SEGMENT_SIZE = 4 * 1024 * 1024;
+    private static final long SEGMENT_OVERLAP = 128;
     private static final long COMMA_PATTERN = 0x3B3B3B3B3B3B3B3BL;
     private static final long DOT_BITS = 0x10101000;
     private static final long MAGIC_MULTIPLIER = (100 * 0x1000000 + 10 * 0x10000 + 1);
 
     private static final Unsafe UNSAFE;
-    private static final long ADDRESS_OFFSET;
 
     static {
         try {
             Field unsafe = Unsafe.class.getDeclaredField("theUnsafe");
             unsafe.setAccessible(true);
             UNSAFE = (Unsafe) unsafe.get(Unsafe.class);
-            ADDRESS_OFFSET = UNSAFE.objectFieldOffset(Buffer.class.getDeclaredField("address"));
         }
         catch (Throwable e) {
             throw new RuntimeException(e);
@@ -60,11 +60,42 @@ public static void main(String[] args) throws Exception {
         // System.err.println("Time: " + (end - start));
         // }
 
+        if (isSpawn(args)) {
+            spawn();
+            return;
+        }
+
         execute();
     }
 
+    private static boolean isSpawn(String[] args) {
+        for (String arg : args) {
+            if ("--worker".equals(arg)) {
+                return false;
+            }
+        }
+
+        return true;
+    }
+
+    private static void spawn() throws Exception {
+        ProcessHandle.Info info = ProcessHandle.current().info();
+        ArrayList<String> commands = new ArrayList<>();
+        info.command().ifPresent(commands::add);
+        info.arguments().ifPresent(args -> commands.addAll(Arrays.asList(args)));
+        commands.add("--worker");
+
+        new ProcessBuilder()
+                .command(commands)
+                .start()
+                .getInputStream()
+                .transferTo(System.out);
+    }
+
     private static void execute() throws Exception {
-        long fileSize = Files.size(FILE);
+        MemorySegment fileMemory = map(FILE);
+        long fileAddress = fileMemory.address();
+        long fileSize = fileMemory.byteSize();
         int segmentCount = (int) ((fileSize + SEGMENT_SIZE - 1) / SEGMENT_SIZE);
 
         AtomicInteger counter = new AtomicInteger();
@@ -74,7 +105,7 @@ private static void execute() throws Exception {
         Aggregator[] aggregators = new Aggregator[parallelism];
 
         for (int i = 0; i < aggregators.length; i++) {
-            aggregators[i] = new Aggregator(counter, result, segmentCount);
+            aggregators[i] = new Aggregator(counter, result, fileAddress, fileSize, segmentCount);
             aggregators[i].start();
         }
 
@@ -84,18 +115,17 @@ private static void execute() throws Exception {
 
         Map<String, Aggregate> aggregates = result.get().aggregate();
         System.out.println(text(aggregates));
+        System.out.close();
     }
 
-    private static long address(ByteBuffer buffer) {
-        return UNSAFE.getLong(buffer, ADDRESS_OFFSET);
-    }
-
-    private static ByteBuffer allocate(int size) {
-        ByteBuffer buffer = ByteBuffer.allocateDirect(size + 4096);
-        long address = address(buffer);
-        long aligned = (address + 4095) & (~4095);
-        int padding = (int) (aligned - address);
-        return buffer.position(padding).limit(padding + size).slice();
+    private static MemorySegment map(Path file) { // maps the whole file read-only and returns the mapped segment
+        try (FileChannel channel = FileChannel.open(file, StandardOpenOption.READ)) {
+            long size = channel.size();
+            return channel.map(FileChannel.MapMode.READ_ONLY, 0, size, Arena.global()); // global arena: the mapping is never explicitly released
+        }
+        catch (Throwable e) {
+            throw new RuntimeException(e); // rethrow unchecked, keeping the cause
+        }
     }
 
     private static long word(long address) {
@@ -142,8 +172,13 @@ private static class Aggregates {
         private static final int SIZE = 128 * ENTRIES;
         private static final int MASK = (ENTRIES - 1) << 7;
 
-        private final ByteBuffer buffer = allocate(SIZE);
-        private final long pointer = address(buffer);
+        private final long pointer;
+
+        public Aggregates() { // allocates a zero-filled table of SIZE bytes whose start is 4096-byte aligned
+            long address = UNSAFE.allocateMemory(SIZE + 4096); // 4096 spare bytes leave room to round the start up
+            pointer = (address + 4095) & (~4095); // round up to the next multiple of 4096
+            UNSAFE.setMemory(pointer, SIZE, (byte) 0); // allocateMemory returns uninitialized memory
+        }
 
         public long find(long word, int hash) {
             long address = pointer + offset(hash);
@@ -308,39 +343,35 @@ private static class Aggregator extends Thread {
 
         private final AtomicInteger counter;
         private final AtomicReference<Aggregates> result;
-        private final int segments;
+        private final long fileAddress;
+        private final long fileSize;
+        private final int segmentCount;
 
-        public Aggregator(AtomicInteger counter, AtomicReference<Aggregates> result, int segments) {
+        public Aggregator(AtomicInteger counter, AtomicReference<Aggregates> result,
+                          long fileAddress, long fileSize, int segmentCount) {
             super("aggregator");
             this.counter = counter;
             this.result = result;
-            this.segments = segments;
+            this.fileAddress = fileAddress;
+            this.fileSize = fileSize;
+            this.segmentCount = segmentCount;
         }
 
         @Override
         public void run() {
             Aggregates aggregates = new Aggregates();
-            ByteBuffer buffer = allocate(SEGMENT_SIZE + SEGMENT_OVERLAP);
-
-            try (FileChannel channel = FileChannel.open(FILE)) {
-                for (int segment; (segment = counter.getAndIncrement()) < segments;) {
-                    buffer.clear();
 
-                    long position = (long) SEGMENT_SIZE * segment;
-                    int size = channel.read(buffer, position);
+            for (int segment; (segment = counter.getAndIncrement()) < segmentCount;) {
+                long position = SEGMENT_SIZE * segment;
+                long size = Math.min(SEGMENT_SIZE + SEGMENT_OVERLAP, fileSize - position);
+                long address = fileAddress + position;
+                long limit = address + Math.min(SEGMENT_SIZE, size - 1);
 
-                    long address = address(buffer);
-                    long limit = address + Math.min(SEGMENT_SIZE, size - 1);
-
-                    if (segment > 0) {
-                        address = next(address);
-                    }
-
-                    aggregate(aggregates, address, limit);
+                if (segment > 0) {
+                    address = next(address);
                 }
-            }
-            catch (Throwable e) {
-                throw new RuntimeException(e);
+
+                aggregate(aggregates, address, limit);
             }
 
             while (!result.compareAndSet(null, aggregates)) {

From 81963f963cc3efcbb1247de233255deb6119aaa5 Mon Sep 17 00:00:00 2001
From: Gunnar Morling <gunnar.morling@googlemail.com>
Date: Sun, 21 Jan 2024 20:39:20 +0100
Subject: [PATCH 097/268] Leaderboard update

---
 README.md | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index ee357f2d1..9cb9cf9b3 100644
--- a/README.md
+++ b/README.md
@@ -41,11 +41,11 @@ These are the results from running all entries into the challenge on eight cores
 
 | # | Result (m:s.ms) | Implementation     | JDK | Submitter     | Notes     |
 |---|-----------------|--------------------|-----|---------------|-----------|
-| 1 | 00:02.336 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java)| 21.0.1-graal | [Van Phu DO](https://github.com/abeobk) | GraalVM native binary |
-| 2 | 00:02.461 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java)| 21.0.1-graal | [Artsiom Korzun](https://github.com/artsiomkorzun) | GraalVM native binary |
-| 3* | 00:02.552 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_thomaswue.java)| 21.0.1-graal | [Thomas Wuerthinger](https://github.com/thomaswue), [Quan Anh Mai](https://github.com/merykitty), [Alfonso² Peterssen](https://github.com/mukel) | GraalVM native binary |
-| 3* | 00:02.571 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_royvanrijn.java)| 21.0.1-graal | [Roy van Rijn](https://github.com/royvanrijn) | GraalVM native binary |
-| 3* | 00:02.575 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_merykittyunsafe.java)| 21.0.1-open | [Quan Anh Mai](https://github.com/merykitty) | Quan Anh Mai's implementation, using `Unsafe` |
+| 1 | 00:02.195 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_thomaswue.java)| 21.0.1-graal | [Thomas Wuerthinger](https://github.com/thomaswue), [Quan Anh Mai](https://github.com/merykitty), [Alfonso² Peterssen](https://github.com/mukel) | GraalVM native binary |
+| 2 | 00:02.248 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java)| 21.0.1-graal | [Artsiom Korzun](https://github.com/artsiomkorzun) | GraalVM native binary |
+| 3* | 00:02.313 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_royvanrijn.java)| 21.0.1-graal | [Roy van Rijn](https://github.com/royvanrijn) | GraalVM native binary |
+| 3* | 00:02.336 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java)| 21.0.1-graal | [Van Phu DO](https://github.com/abeobk) | GraalVM native binary |
+|   | 00:02.575 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_merykittyunsafe.java)| 21.0.1-open | [Quan Anh Mai](https://github.com/merykitty) | Quan Anh Mai's implementation, using `Unsafe` |
 |   | 00:02.909 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jerrinot.java)| 21.0.1-graal | [Jaromir Hamala](https://github.com/jerrinot) |  |
 |   | 00:03.258 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_merykitty.java)| 21.0.1-open | [Quan Anh Mai](https://github.com/merykitty) |  |
 |   | 00:03.376 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_mtopolnik.java)| 21.0.1-graal | [Marko Topolnik](https://github.com/mtopolnik) |  |
@@ -54,9 +54,9 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:03.902 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jparera.java)| 21.0.1-open | [Juan Parera](https://github.com/jparera) |  |
 |   | 00:03.959 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_gonix.java)| 21.0.1-open | [gonix](https://github.com/gonix) |  |
 |   | 00:03.966 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jincongho.java)| 21.0.1-open | [Jin Cong Ho](https://github.com/jincongho) |  |
+|   | 00:03.990 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ebarlas.java)| 21.0.1-graal | [Elliot Barlas](https://github.com/ebarlas) |  |
 |   | 00:04.066 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_JesseVanRooy.java)| 21.0.1-open | [JesseVanRooy](https://github.com/JesseVanRooy) |  |
 |   | 00:04.154 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_iziamos.java)| 21.0.1-open | [John Ziamos](https://github.com/iziamos) |  |
-|   | 00:04.365 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ebarlas.java)| 21.0.1-graal | [Elliot Barlas](https://github.com/ebarlas) |  |
 |   | 00:04.551 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_roman_r_m.java)| 21.0.1-graal | [Roman Musin](https://github.com/roman-r-m) |  |
 |   | 00:04.741 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_cliffclick.java)| 21.0.1-open | [Cliff Click](https://github.com/cliffclick) |  |
 |   | 00:04.823 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_JamalMulla.java)| 21.0.1-graal | [Jamal Mulla](https://github.com/JamalMulla) |  |

From d2639b7ce144ab453e36a36fb6dd3ecf19ed5123 Mon Sep 17 00:00:00 2001
From: Li Lin <linl33@users.noreply.github.com>
Date: Mon, 22 Jan 2024 04:14:05 +0800
Subject: [PATCH 098/268] Add linl33's implementation (#503)

* Add linl33's implementation

* Update evaluate.sh

---------

Co-authored-by: Gunnar Morling <gunnar.morling@googlemail.com>
---
 calculate_average_linl33.sh                   |  38 ++
 evaluate.sh                                   |   2 +
 pom.xml                                       |  34 +-
 prepare_linl33.sh                             |  46 ++
 .../onebrc/CalculateAverage_linl33.java       | 520 ++++++++++++++++++
 5 files changed, 639 insertions(+), 1 deletion(-)
 create mode 100755 calculate_average_linl33.sh
 create mode 100755 prepare_linl33.sh
 create mode 100644 src/main/java-22/dev/morling/onebrc/CalculateAverage_linl33.java

diff --git a/calculate_average_linl33.sh b/calculate_average_linl33.sh
new file mode 100755
index 000000000..5610895d1
--- /dev/null
+++ b/calculate_average_linl33.sh
@@ -0,0 +1,38 @@
+#!/bin/sh
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+
+JAVA_OPTS="-Xrs --enable-preview --add-modules jdk.incubator.vector --enable-native-access=ALL-UNNAMED" # the class uses preview syntax, the incubating Vector API and restricted foreign-memory calls
+JAVA_OPTS="${JAVA_OPTS} -XX:+UnlockDiagnosticVMOptions -XX:+UnlockExperimentalVMOptions"
+JAVA_OPTS="${JAVA_OPTS} -Xms128m -XX:+AlwaysPreTouch -XX:+AlwaysPreTouchStacks -XX:-UseTransparentHugePages"
+JAVA_OPTS="${JAVA_OPTS} -XX:-UseCompressedClassPointers -XX:+ForceUnreachable -XX:-CompactStrings"
+JAVA_OPTS="${JAVA_OPTS} -XX:CodeEntryAlignment=64 -XX:OptoLoopAlignment=64 -XX:MaxLoopPad=16 -XX:ObjectAlignmentInBytes=64"
+JAVA_OPTS="${JAVA_OPTS} -XX:-UseLoopPredicate -XX:LoopStripMiningIter=0 -XX:LoopStripMiningIterShortLoop=0"
+JAVA_OPTS="${JAVA_OPTS} -XX:-UseCountedLoopSafepoints -XX:GuaranteedSafepointInterval=0 -XX:AllocatePrefetchStyle=0"
+JAVA_OPTS="${JAVA_OPTS} -XX:+TrustFinalNonStaticFields -XX:LockingMode=2 -XX:+UseSystemMemoryBarrier"
+JAVA_OPTS="${JAVA_OPTS} -XX:-UseDynamicNumberOfCompilerThreads -XX:-UseDynamicNumberOfGCThreads"
+JAVA_OPTS="${JAVA_OPTS} -XX:ArchiveRelocationMode=0 -XX:-UsePerfData -XX:-UseNotificationThread -XX:-CheckIntrinsics"
+#JAVA_OPTS="${JAVA_OPTS} -XX:+UseZGC -XX:-ZProactive -XX:+ZCollectionIntervalOnly -XX:ZCollectionInterval=0 -XX:-ZUncommit -XX:-ZBufferStoreBarriers -XX:ZIndexDistributorStrategy=1"
+JAVA_OPTS="${JAVA_OPTS} -XX:+UseEpsilonGC -XX:-UseCompressedOops" # active GC choice; the commented-out lines around it are alternatives
+#JAVA_OPTS="${JAVA_OPTS} -XX:+UseParallelGC -XX:-UseCompressedOops"
+#JAVA_OPTS="${JAVA_OPTS} -XX:+UseG1GC -XX:-UseCompressedOops"
+JAVA_OPTS="${JAVA_OPTS} -Djdk.incubator.vector.VECTOR_ACCESS_OOB_CHECK=0 -Djava.lang.invoke.VarHandle.VAR_HANDLE_GUARDS=false -Djava.lang.invoke.MethodHandle.DONT_INLINE_THRESHOLD=-1"
+JAVA_OPTS="${JAVA_OPTS} -Dfile.encoding=UTF-8 -Dsun.stdout.encoding=UTF-8 -Dsun.stderr.encoding=UTF-8"
+
+JAVA_OPTS="${JAVA_OPTS} -Xlog:all=off -Xverify:none -XX:SharedArchiveFile=target/CalculateAverage_linl33_dynamic.jsa" # CDS archive written by prepare_linl33.sh
+
+MALLOC_ARENA_MAX=1 java ${JAVA_OPTS} --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_linl33 2>/dev/null # JAVA_OPTS is left unquoted so it splits into separate options; stderr is discarded
diff --git a/evaluate.sh b/evaluate.sh
index f92a4562f..caafbb15c 100755
--- a/evaluate.sh
+++ b/evaluate.sh
@@ -36,6 +36,7 @@ RESET='\033[0m' # No Color
 MEASUREMENTS_FILE="measurements_1B.txt"
 RUNS=5
 DEFAULT_JAVA_VERSION="21.0.1-open"
+: "${BUILD_JAVA_VERSION:=21.0.1-open}"
 RUN_TIME_LIMIT=300 # seconds
 
 TIMEOUT=""
@@ -115,6 +116,7 @@ if [ -f "/sys/devices/system/cpu/cpufreq/boost" ]; then
   fi
 fi
 
+print_and_execute sdk use java $BUILD_JAVA_VERSION
 print_and_execute java --version
 print_and_execute ./mvnw --quiet clean verify
 
diff --git a/pom.xml b/pom.xml
index a2c91e3bd..79354bd92 100644
--- a/pom.xml
+++ b/pom.xml
@@ -83,7 +83,11 @@
           <artifactId>formatter-maven-plugin</artifactId>
           <version>2.16.0</version>
           <configuration>
-              <configFile>etc/eclipse-formatter-config.xml</configFile>
+            <configFile>etc/eclipse-formatter-config.xml</configFile>
+            <directories>
+              <directory>${project.build.sourceDirectory}</directory>
+              <directory>${project.basedir}/src/main/java-22</directory>
+            </directories>
           </configuration>
         </plugin>
         <plugin>
@@ -287,5 +291,33 @@
         <skipTests>true</skipTests>
       </properties>
     </profile>
+    <profile>
+      <id>jdk22</id>
+      <activation>
+        <jdk>22</jdk>
+      </activation>
+      <properties>
+      </properties>
+      <build>
+        <pluginManagement>
+          <plugins>
+            <plugin>
+              <artifactId>maven-compiler-plugin</artifactId>
+              <configuration>
+                <release>22</release>
+                <compileSourceRoots>
+                  <compileSourceRoot>${project.basedir}/src/main/java-22</compileSourceRoot>
+                  <!--
+                    Uncomment the next line to build the entire project with jdk22.
+                    Currently, some classes fail to compile under jdk22.
+                  -->
+                  <!-- <compileSourceRoot>${project.build.sourceDirectory}</compileSourceRoot> -->
+                </compileSourceRoots>
+              </configuration>
+            </plugin>
+          </plugins>
+        </pluginManagement>
+      </build>
+    </profile>
   </profiles>
 </project>
diff --git a/prepare_linl33.sh b/prepare_linl33.sh
new file mode 100755
index 000000000..5fdf640a6
--- /dev/null
+++ b/prepare_linl33.sh
@@ -0,0 +1,46 @@
+#!/bin/bash
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+
+source "$HOME/.sdkman/bin/sdkman-init.sh" # defines the sdk command used on the next line
+# TODO: bump to ea 32 when available
+sdk use java 22.ea.31-open 1>&2 # sdk's own output is redirected to stderr
+
+CLASS_NAME="CalculateAverage_linl33"
+
+JAVA_OPTS="-Xrs --enable-preview --add-modules jdk.incubator.vector --enable-native-access=ALL-UNNAMED" # same option set as calculate_average_linl33.sh, minus its -Xlog/-Xverify/-XX:SharedArchiveFile line
+JAVA_OPTS="${JAVA_OPTS} -XX:+UnlockDiagnosticVMOptions -XX:+UnlockExperimentalVMOptions"
+JAVA_OPTS="${JAVA_OPTS} -Xms128m -XX:+AlwaysPreTouch -XX:+AlwaysPreTouchStacks -XX:-UseTransparentHugePages"
+JAVA_OPTS="${JAVA_OPTS} -XX:-UseCompressedClassPointers -XX:+ForceUnreachable -XX:-CompactStrings"
+JAVA_OPTS="${JAVA_OPTS} -XX:CodeEntryAlignment=64 -XX:OptoLoopAlignment=64 -XX:MaxLoopPad=16 -XX:ObjectAlignmentInBytes=64"
+JAVA_OPTS="${JAVA_OPTS} -XX:-UseLoopPredicate -XX:LoopStripMiningIter=0 -XX:LoopStripMiningIterShortLoop=0"
+JAVA_OPTS="${JAVA_OPTS} -XX:-UseCountedLoopSafepoints -XX:GuaranteedSafepointInterval=0 -XX:AllocatePrefetchStyle=0"
+JAVA_OPTS="${JAVA_OPTS} -XX:+TrustFinalNonStaticFields -XX:LockingMode=2 -XX:+UseSystemMemoryBarrier"
+JAVA_OPTS="${JAVA_OPTS} -XX:-UseDynamicNumberOfCompilerThreads -XX:-UseDynamicNumberOfGCThreads"
+JAVA_OPTS="${JAVA_OPTS} -XX:ArchiveRelocationMode=0 -XX:-UsePerfData -XX:-UseNotificationThread -XX:-CheckIntrinsics"
+#JAVA_OPTS="${JAVA_OPTS} -XX:+UseZGC -XX:-ZProactive -XX:+ZCollectionIntervalOnly -XX:ZCollectionInterval=0 -XX:-ZUncommit -XX:-ZBufferStoreBarriers -XX:ZIndexDistributorStrategy=1"
+JAVA_OPTS="${JAVA_OPTS} -XX:+UseEpsilonGC -XX:-UseCompressedOops"
+#JAVA_OPTS="${JAVA_OPTS} -XX:+UseParallelGC -XX:-UseCompressedOops"
+#JAVA_OPTS="${JAVA_OPTS} -XX:+UseG1GC -XX:-UseCompressedOops"
+JAVA_OPTS="${JAVA_OPTS} -Djdk.incubator.vector.VECTOR_ACCESS_OOB_CHECK=0 -Djava.lang.invoke.VarHandle.VAR_HANDLE_GUARDS=false -Djava.lang.invoke.MethodHandle.DONT_INLINE_THRESHOLD=-1"
+JAVA_OPTS="${JAVA_OPTS} -Dfile.encoding=UTF-8 -Dsun.stdout.encoding=UTF-8 -Dsun.stderr.encoding=UTF-8"
+JAVA_OPTS="${JAVA_OPTS} -Ddev.morling.onebrc.CalculateAverage_linl33.measurementsPath=src/test/resources/samples/measurements-10000-unique-keys.txt" # the runs below read this sample file instead of ./measurements.txt
+
+# create CDS archive
+java ${JAVA_OPTS} -Xshare:off -XX:DumpLoadedClassList=target/${CLASS_NAME}.classlist --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.${CLASS_NAME} # step 1: run with sharing off and record the loaded classes
+java ${JAVA_OPTS} -Xshare:dump -XX:SharedClassListFile=target/${CLASS_NAME}.classlist -XX:SharedArchiveFile=target/${CLASS_NAME}.jsa --class-path target/average-1.0.0-SNAPSHOT.jar # step 2: dump a static archive from that class list
+java ${JAVA_OPTS} -Xshare:on -XX:SharedArchiveFile=target/${CLASS_NAME}.jsa -XX:ArchiveClassesAtExit=target/${CLASS_NAME}_dynamic.jsa --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.${CLASS_NAME} # step 3: run on the static archive and write the dynamic archive that calculate_average_linl33.sh loads
diff --git a/src/main/java-22/dev/morling/onebrc/CalculateAverage_linl33.java b/src/main/java-22/dev/morling/onebrc/CalculateAverage_linl33.java
new file mode 100644
index 000000000..62d54546e
--- /dev/null
+++ b/src/main/java-22/dev/morling/onebrc/CalculateAverage_linl33.java
@@ -0,0 +1,520 @@
+/*
+ *  Copyright 2023 The original authors
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package dev.morling.onebrc;
+
+import jdk.incubator.vector.ByteVector;
+import jdk.incubator.vector.VectorSpecies;
+import sun.misc.Unsafe;
+
+import java.io.IOException;
+import java.lang.foreign.*;
+import java.lang.invoke.MethodHandle;
+import java.nio.ByteOrder;
+import java.nio.channels.FileChannel;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Paths;
+import java.util.Arrays;
+import java.util.concurrent.CompletableFuture;
+import java.util.concurrent.Executors;
+import java.util.stream.IntStream;
+
+public class CalculateAverage_linl33 {
+    private static final String FILE_PATH_PROPERTY = "dev.morling.onebrc.CalculateAverage_linl33.measurementsPath";
+    private static final int WEATHER_STATION_LENGTH_MAX = 100;
+    private static final long WEATHER_STATION_DISTINCT_MAX = 10_000L;
+    private static final int N_THREADS = Runtime.getRuntime().availableProcessors();
+
+    private static final MemorySegment ALL = MemorySegment.NULL.reinterpret(Long.MAX_VALUE);
+    private static final VectorSpecies<Byte> BYTE_SPECIES = ByteVector.SPECIES_PREFERRED;
+
+    private static final Thread.Builder THREAD_BUILDER = Thread
+            .ofPlatform()
+            .name("1brc-CalculateAverage-", 0)
+            .inheritInheritableThreadLocals(false);
+
+    private static final Unsafe UNSAFE;
+
+    static { // rejects unsupported platforms up front, then fetches the Unsafe singleton reflectively
+        if (ByteOrder.nativeOrder() != ByteOrder.LITTLE_ENDIAN) {
+            throw new UnsupportedOperationException("Error: BE JVMs are not supported");
+        }
+        if ((BYTE_SPECIES.vectorByteSize() & (BYTE_SPECIES.vectorByteSize() - 1)) != 0) { // non-zero unless the size is a power of two; the "& -size" masks used later rely on that
+            throw new UnsupportedOperationException(STR."Unsupported vectorByteSize \{BYTE_SPECIES.vectorByteSize()}");
+        }
+
+        try {
+            var f = Unsafe.class.getDeclaredField("theUnsafe");
+            f.setAccessible(true);
+            UNSAFE = (Unsafe) f.get(null); // static field, so no receiver object is needed
+        } catch (NoSuchFieldException | IllegalAccessException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    public static void main() throws InterruptedException, IOException { // maps the input, aggregates it in N_THREADS chunks, merges everything into maps[0] and prints it
+        final var filePath = Paths.get(System.getProperty(FILE_PATH_PROPERTY, "./measurements.txt")); // the system property overrides the default path
+
+        try (final var channel = FileChannel.open(filePath)) {
+            final var inputMapped = channel.map(FileChannel.MapMode.READ_ONLY, 0, channel.size(), Arena.global());
+
+            final var chunkBounds = calcChunkBounds(inputMapped.address(), inputMapped.byteSize());
+            final var maps = new SparseMap[N_THREADS]; // slot t is filled in by task t when it starts running
+
+            try (final var threadPool = Executors.newFixedThreadPool(N_THREADS, THREAD_BUILDER.factory());
+                    final var singleThreadExecutor = Executors.newSingleThreadExecutor(Thread.ofVirtual().factory())) {
+                final var rootTask = CompletableFuture.runAsync(new CalculateAverageTask(maps, chunkBounds, 0), threadPool);
+
+                final var futures = IntStream
+                        .range(1, N_THREADS)
+                        .mapToObj(t -> CompletableFuture
+                                .runAsync(new CalculateAverageTask(maps, chunkBounds, t), threadPool)
+                                .runAfterBothAsync(rootTask, () -> maps[0].merge(maps[t]), singleThreadExecutor)) // merge only after both task 0 and task t finished; merges share one executor thread
+                        .toArray(CompletableFuture[]::new);
+
+                CompletableFuture.allOf(futures).join(); // waits for every merge future (each one also depends on rootTask)
+            }
+
+            printSorted(maps[0]);
+        }
+    }
+
+    private static long[] calcChunkBounds(final long mappedAddr, final long fileSizeBytes) { // returns N_THREADS + 1 absolute addresses; task t works on [bounds[t], bounds[t + 1])
+        final var chunkBounds = new long[N_THREADS + 1];
+        chunkBounds[0] = mappedAddr;
+        chunkBounds[chunkBounds.length - 1] = mappedAddr + fileSizeBytes;
+
+        final var chunkSize = (fileSizeBytes / N_THREADS) & -CalculateAverageTask.BATCH_SIZE_BYTES; // even share, rounded down to a multiple of the vector size
+        for (int i = 1; i < chunkBounds.length - 1; i++) {
+            chunkBounds[i] = chunkBounds[i - 1] + chunkSize; // boundaries are not aligned to line starts; each task locates its own first line
+        }
+
+        return chunkBounds; // the last chunk additionally covers whatever the rounding left over
+    }
+
+    private static void printSorted(final SparseMap temperatureMeasurements) { // prints "{name=min/mean/max, ...}" in sorted order; prints "{}" when the map is empty
+        final var weatherStations = new AggregatedMeasurement[(int) temperatureMeasurements.size];
+        final var nameBuffer = new byte[WEATHER_STATION_LENGTH_MAX]; // reused scratch buffer for decoding each name
+        var offset = temperatureMeasurements.denseAddress;
+        for (int i = 0; i < weatherStations.length; i++, offset += SparseMap.DATA_SCALE * Long.BYTES) {
+            final var nameAddr = UNSAFE.getLong(offset); // entry bytes 0-7: address of the name bytes
+            final var nameLength = UNSAFE.getInt(offset + Integer.BYTES * 7); // entry bytes 28-31: name length in bytes
+            MemorySegment.copy(ALL, ValueLayout.JAVA_BYTE, nameAddr, nameBuffer, 0, nameLength);
+            final var nameStr = new String(nameBuffer, 0, nameLength, StandardCharsets.UTF_8);
+            weatherStations[i] = new AggregatedMeasurement(nameStr, i); // i doubles as the index of the dense entry
+        }
+
+        Arrays.sort(weatherStations); // uses AggregatedMeasurement's natural ordering
+
+        System.out.print('{');
+        for (int i = 0; i < weatherStations.length; i++) {
+            if (i > 0) { // separator precedes every entry but the first, so an empty map no longer indexes element -1
+                System.out.print(", ");
+            }
+            printAggMeasurement(weatherStations[i], temperatureMeasurements);
+        }
+        System.out.println('}');
+    }
+
+    private static void printAggMeasurement(final AggregatedMeasurement aggMeasurement,
+                                            final SparseMap temperatureMeasurements) { // prints "name=min/mean/max" for one station, without a trailing separator
+        final var offset = temperatureMeasurements.denseAddress + SparseMap.DATA_SCALE * Long.BYTES * aggMeasurement.id(); // address of the station's dense entry
+
+        // name
+        System.out.print(aggMeasurement.name());
+        System.out.print('=');
+
+        // min
+        printAsDouble(offset + Integer.BYTES * 5); // int at entry byte offset 20
+        System.out.print('/');
+
+        // mean
+        final double total = UNSAFE.getLong(offset + Integer.BYTES * 2); // long at entry byte offset 8, widened to double for the division
+        final var count = UNSAFE.getInt(offset + Integer.BYTES * 4); // int at entry byte offset 16
+        System.out.print(round(total / count / 10d)); // stored values are in tenths, hence the division by 10
+        System.out.print('/');
+
+        // max
+        printAsDouble(offset + Integer.BYTES * 6); // int at entry byte offset 24
+    }
+
+    private static void printAsDouble(final long addr) { // reads the int at addr and prints it divided by 10 (stored values are in tenths)
+        final var val = (double) UNSAFE.getInt(addr);
+        System.out.print(val / 10d);
+    }
+
+    private static double round(final double d) { // rounds d to one decimal place via Math.round
+        return Math.round(d * 10d) / 10d;
+    }
+
+    private static class CalculateAverageTask implements Runnable { // finds the '\n' bytes inside one chunk and feeds every line ending there into this task's own SparseMap
+        public static final int BATCH_SIZE_BYTES = BYTE_SPECIES.vectorByteSize();
+
+        private final SparseMap[] maps;
+        private final long[] chunkBounds;
+        private final long chunkStart;
+        private final long chunkEnd;
+        private final int t;
+
+        private SparseMap map;
+
+        public CalculateAverageTask(SparseMap[] maps, long[] chunkBounds, int t) { // t selects both the chunk [chunkBounds[t], chunkBounds[t + 1]) and the slot in maps
+            this.maps = maps;
+            this.chunkBounds = chunkBounds;
+            this.chunkStart = chunkBounds[t];
+            this.chunkEnd = chunkBounds[t + 1];
+            this.t = t;
+        }
+
+        @Override
+        public void run() {
+            this.maps[this.t] = new SparseMap(); // created here, i.e. on the thread that runs the task
+            this.map = this.maps[this.t];
+
+            var lineStart = this.chunkBounds[0]; // fallback when no earlier '\n' is found: the start of the mapped file
+            // walk back to find the previous '\n' and use it as lineStart
+            for (long i = this.chunkStart - 1; i > this.chunkBounds[0]; i--) {
+                if (UNSAFE.getByte(i) == (byte) '\n') {
+                    lineStart = i + 1L;
+                    break;
+                }
+            }
+
+            final var vectorLimit = this.chunkStart + ((this.chunkEnd - this.chunkStart) & -BYTE_SPECIES.vectorByteSize()); // end of the last whole vector inside the chunk
+            for (long i = this.chunkStart; i < vectorLimit; i += BYTE_SPECIES.vectorByteSize()) {
+                var lfMask = ByteVector.fromMemorySegment(BYTE_SPECIES, ALL, i, ByteOrder.nativeOrder())
+                        .eq((byte) '\n')
+                        .toLong(); // bit k is set when byte i + k is '\n'
+
+                final var lfCount = Long.bitCount(lfMask);
+                for (int j = 0; j < lfCount; j++) {
+                    final var lfPosRelative = Long.numberOfTrailingZeros(lfMask);
+                    final var lfAddress = i + lfPosRelative;
+                    processLine(lineStart, lfAddress);
+
+                    lineStart = lfAddress + 1L;
+                    // unset the lowest set bit, should compile to BLSR
+                    lfMask &= lfMask - 1L;
+                }
+            }
+
+            if (vectorLimit != this.chunkEnd) { // only non-empty for the last chunk when the file size is not a multiple of the vector size
+                processTrailingBytes(lineStart, vectorLimit, this.chunkEnd);
+            }
+        }
+
+        private void processTrailingBytes(long lineStart,
+                                          final long start,
+                                          final long end) { // byte-by-byte scan of [start, end); bytes after the last '\n' are not processed
+            for (long i = start; i < end; i++) {
+                final var b = UNSAFE.getByte(i);
+                if (b != (byte) '\n') {
+                    continue;
+                }
+
+                processLine(lineStart, i);
+                lineStart = i + 1;
+            }
+        }
+
+        private void processLine(final long lineStart, final long lfAddress) { // parses the temperature that ends at lfAddress (in tenths) and stores it under the name starting at lineStart
+            // read 5 bytes before '\n'
+            // the temperature is formatted to 1 decimal place
+            // therefore the shortest temperature value is 0.0
+            // so there are always at least 5 bytes between the location name and '\n'
+            final var trailing5Bytes = UNSAFE.getLong(lfAddress - 5); // 8-byte load: also picks up '\n' and the two bytes after it, which are ignored
+            final int trailingDWordRaw = (int) (trailing5Bytes >>> 8); // the 4 bytes directly before '\n'; the lowest byte is the one at lfAddress - 4
+
+            // select the low nibble for each byte, '0'-'9' -> 0-9, ';' -> 11, '-' -> 13
+            final var trailingDWordLowNibble = trailingDWordRaw & 0x0f_0f_0f_0f;
+            // parse the 2 digits around the decimal point (note that these 2 digits must be present)
+            final var trailingDigitsParsed = (trailingDWordLowNibble * 0x00_0a_00_01) >>> 24; // top byte becomes 10 * units digit + tenths digit
+
+            // this byte must be ('-' & 0xf), (';' & 0xf), or a valid digit (0-9)
+            final var secondHighestByte = trailingDWordLowNibble & 0xf; // low nibble of the byte at lfAddress - 4
+
+            var temperature = trailingDigitsParsed;
+            var lineLength = lfAddress - lineStart - 4; // name length if the value is the shortest form ";d.d"; reduced below for a sign or a tens digit
+
+            if (secondHighestByte > 9) { // not a digit, so the byte is ';' or '-'
+                if (secondHighestByte == ('-' & 0xf)) {
+                    lineLength--;
+                    temperature = -temperature;
+                }
+            }
+            else { // a tens digit is present
+                lineLength--;
+                temperature += secondHighestByte * 100;
+
+                final var isNegative = (trailing5Bytes & 0xffL) == '-'; // lowest byte of the load is the byte at lfAddress - 5
+                if (isNegative) {
+                    lineLength--;
+                    temperature = -temperature;
+                }
+            }
+
+            this.map.putEntry(lineStart, (int) lineLength, temperature);
+        }
+    }
+
+    /**
+     * Open addressing, linear probing hash map backed by off-heap memory
+     */
+    private static class SparseMap {
+        private static final int TRUNCATED_HASH_BITS = 26;
+        // max # of unique keys
+        private static final long DENSE_SIZE = WEATHER_STATION_DISTINCT_MAX;
+        // max hash code (exclusive)
+        private static final long SPARSE_SIZE = 1L << (TRUNCATED_HASH_BITS + 1);
+        private static final long DATA_SCALE = 4;
+
+        public final long sparseAddress;
+        public final long denseAddress;
+        public long size;
+
+        public SparseMap() { // allocates the sparse index (SPARSE_SIZE longs) and the dense entry storage (DENSE_SIZE * DATA_SCALE longs)
+            var arena = new MallocArena(Arena.global()); // NOTE(review): MallocArena and CallocArena are defined outside this view
+            var callocArena = new CallocArena(Arena.global());
+
+            this.size = 0L;
+
+            final var sparse = callocArena.allocate(ValueLayout.JAVA_LONG, SPARSE_SIZE); // presumably zero-filled; putEntryInternal treats a 0 slot as empty - confirm in CallocArena
+            this.sparseAddress = (sparse.address() + MallocArena.MAX_ALIGN) & -MallocArena.MAX_ALIGN; // moves up to the next MAX_ALIGN multiple, even when already aligned
+
+            final var dense = arena.allocate(ValueLayout.JAVA_LONG, DENSE_SIZE * DATA_SCALE);
+            this.denseAddress = (dense.address() + MallocArena.MAX_ALIGN) & -MallocArena.MAX_ALIGN; // assumes the arenas over-allocate by at least MAX_ALIGN bytes - TODO confirm
+        }
+
+        public void putEntry(final long keyAddress, final int keyLength, final int value) {
+            final var hash = hash(keyAddress, keyLength);
+            this.putEntryInternal(hash, keyAddress, keyLength, value, 1, value, value);
+        }
+
+        private void putEntryInternal(final long hash,
+                                      final long keyAddress,
+                                      final int keyLength,
+                                      final long temperature,
+                                      final int count,
+                                      final int temperatureMin,
+                                      final int temperatureMax) {
+            final var sparseOffset = this.sparseAddress + truncateHash(hash) * Long.BYTES;
+
+            for (long n = 0, sparseLinearOffset = sparseOffset; n < WEATHER_STATION_DISTINCT_MAX; n++, sparseLinearOffset += Long.BYTES) {
+                final var denseOffset = UNSAFE.getLong(sparseLinearOffset);
+                if (denseOffset == 0L) {
+                    this.add(sparseLinearOffset, keyAddress, keyLength, temperature, count, temperatureMin, temperatureMax);
+                    this.size++;
+                    return;
+                }
+
+                if (isCollision(keyAddress, keyLength, denseOffset)) {
+                    continue;
+                }
+
+                final var currTotal = UNSAFE.getLong(denseOffset + Integer.BYTES * 2);
+                UNSAFE.putLong(denseOffset + Integer.BYTES * 2, currTotal + temperature); // total
+
+                final var currCount = UNSAFE.getInt(denseOffset + Integer.BYTES * 4);
+                UNSAFE.putInt(denseOffset + Integer.BYTES * 4, currCount + count); // count
+
+                final var currMin = UNSAFE.getInt(denseOffset + Integer.BYTES * 5);
+                if (temperatureMin < currMin) {
+                    UNSAFE.putInt(denseOffset + Integer.BYTES * 5, temperatureMin); // min
+                }
+
+                final var currMax = UNSAFE.getInt(denseOffset + Integer.BYTES * 6);
+                if (temperatureMax > currMax) {
+                    UNSAFE.putInt(denseOffset + Integer.BYTES * 6, temperatureMax); // max
+                }
+
+                return;
+            }
+        }
+
+        public void merge(final SparseMap other) {
+            final var otherSize = other.size;
+            for (long i = 0, offset = other.denseAddress; i < otherSize; i++, offset += DATA_SCALE * Long.BYTES) {
+                final var keyAddress = UNSAFE.getLong(offset);
+                final var keyLength = UNSAFE.getInt(offset + Integer.BYTES * 7);
+                final var hash = hash(keyAddress, keyLength);
+
+                this.putEntryInternal(
+                        hash,
+                        keyAddress,
+                        keyLength,
+                        UNSAFE.getLong(offset + Integer.BYTES * 2),
+                        UNSAFE.getInt(offset + Integer.BYTES * 4),
+                        UNSAFE.getInt(offset + Integer.BYTES * 5),
+                        UNSAFE.getInt(offset + Integer.BYTES * 6));
+            }
+        }
+
+        private void add(final long sparseOffset,
+                         final long keyAddress,
+                         final int keyLength,
+                         final long temperature,
+                         final int count,
+                         final int temperatureMin,
+                         final int temperatureMax) {
+            // new entry, initialize sparse and dense
+            final var denseOffset = this.denseAddress + this.size * DATA_SCALE * Long.BYTES;
+            UNSAFE.putLong(sparseOffset, denseOffset);
+
+            UNSAFE.putLong(denseOffset, keyAddress);
+            UNSAFE.putLong(denseOffset + Integer.BYTES * 2, temperature);
+            UNSAFE.putInt(denseOffset + Integer.BYTES * 4, count);
+            UNSAFE.putInt(denseOffset + Integer.BYTES * 5, temperatureMin);
+            UNSAFE.putInt(denseOffset + Integer.BYTES * 6, temperatureMax);
+            UNSAFE.putInt(denseOffset + Integer.BYTES * 7, keyLength);
+        }
+
+        private static boolean isCollision(final long keyAddress, final int keyLength, final long denseOffset) {
+            // key length compare is unnecessary
+
+            final var entryKeyAddress = UNSAFE.getLong(denseOffset);
+            return mismatch(keyAddress, entryKeyAddress, keyLength);
+        }
+
+        private static boolean mismatch(final long leftAddr, final long rightAddr, final int length) {
+            // key length compare is unnecessary
+            // strings compared through delimiter byte ';'
+
+            final var loopBound = length >= (BYTE_SPECIES.vectorByteSize() - 1) ? ((length + 1) & -BYTE_SPECIES.vectorByteSize()) : 0;
+            for (long i = 0; i < loopBound; i += BYTE_SPECIES.vectorByteSize()) {
+                final var l = ByteVector.fromMemorySegment(BYTE_SPECIES, ALL, leftAddr + i, ByteOrder.nativeOrder());
+                final var r = ByteVector.fromMemorySegment(BYTE_SPECIES, ALL, rightAddr + i, ByteOrder.nativeOrder());
+                if (!l.eq(r).allTrue()) {
+                    return true;
+                }
+            }
+
+            final var l = ByteVector.fromMemorySegment(BYTE_SPECIES, ALL, leftAddr + loopBound, ByteOrder.nativeOrder());
+            final var r = ByteVector.fromMemorySegment(BYTE_SPECIES, ALL, rightAddr + loopBound, ByteOrder.nativeOrder());
+            final var eqMask = l.eq(r).toLong();
+
+            // LE compare to add 1 to length
+            return Long.numberOfTrailingZeros(~eqMask) <= (length - loopBound);
+            // to support platforms without TZCNT, the check can be replaced with
+            // a comparison to lowestZero = ~eqMask & (eqMask + 1)
+        }
+
+        // Use the leading and trailing few bytes as hash
+        // this performs better than computing a good hash
+        private static long hash(final long keyAddress, final int keyLength) {
+            final var leadingQWord = UNSAFE.getLong(keyAddress);
+            // the constant is the 64 bit FNV-1 offset basis
+            final var hash = -3750763034362895579L ^ leadingQWord;
+            if (keyLength < Integer.BYTES) {
+                // the key is at least 2 bytes (if you count the delimiter)
+                return hash & 0xffffL;
+            }
+            else {
+                final var trailingDWord = UNSAFE.getLong(keyAddress + keyLength - Integer.BYTES) & 0xffffffffL;
+                // only the lower dword in hash is guaranteed to exist so shift left 32
+                return (hash << Integer.SIZE) ^ trailingDWord;
+            }
+        }
+
+        private static long truncateHash(final long hash) {
+            return ((hash >>> TRUNCATED_HASH_BITS) ^ hash) & ((1L << TRUNCATED_HASH_BITS) - 1L);
+        }
+    }
+
+    /**
+     * {@link Arena} whose allocations are served by the C library's {@code malloc}.
+     * <p>
+     * Every request is over-allocated by {@link #MAX_ALIGN} bytes so that the caller can round the
+     * returned address up to a {@code MAX_ALIGN} boundary itself; the {@code byteAlignment} argument
+     * of {@link #allocate(long, long)} is not otherwise honoured. Memory returned by this arena is not
+     * zeroed. Scope and closing are delegated to the wrapped arena, and each block is registered to
+     * be released with {@code free} when that scope ends.
+     */
+    private static class MallocArena implements Arena {
+        /** Extra bytes added to every allocation, and the alignment callers round up to (2 MiB). */
+        public static final long MAX_ALIGN = 1L << 21;
+
+        protected static final Linker LINKER = Linker.nativeLinker();
+        protected static final AddressLayout C_POINTER = (AddressLayout) LINKER.canonicalLayouts().get("void*");
+        protected static final ValueLayout C_SIZE_T = (ValueLayout) LINKER.canonicalLayouts().get("size_t");
+        // void *malloc(size_t size)
+        private static final MethodHandle MALLOC = LINKER.downcallHandle(
+                LINKER.defaultLookup().find("malloc").orElseThrow(),
+                FunctionDescriptor.of(C_POINTER, C_SIZE_T),
+                Linker.Option.critical(false));
+        // void free(void *ptr)
+        private static final MethodHandle FREE = LINKER.downcallHandle(
+                LINKER.defaultLookup().find("free").orElseThrow(),
+                FunctionDescriptor.ofVoid(C_POINTER),
+                Linker.Option.critical(false));
+        // void *calloc(size_t nmemb, size_t size); shared with subclasses
+        protected static final MethodHandle CALLOC = LINKER.downcallHandle(
+                LINKER.defaultLookup().find("calloc").orElseThrow(),
+                FunctionDescriptor.of(C_POINTER, C_SIZE_T, C_SIZE_T),
+                Linker.Option.critical(false));
+
+        private final Arena arena;
+
+        /**
+         * @param arena arena whose scope bounds the lifetime of every segment handed out
+         */
+        public MallocArena(Arena arena) {
+            this.arena = arena;
+        }
+
+        /**
+         * Allocates {@code byteSize + MAX_ALIGN} uninitialised bytes with {@code malloc}.
+         *
+         * @param byteSize      number of usable bytes the caller needs after aligning
+         * @param byteAlignment ignored; callers align the address manually within the slack
+         * @return a segment of {@code byteSize + MAX_ALIGN} bytes bound to this arena's scope
+         * @throws OutOfMemoryError if {@code malloc} cannot satisfy the request
+         */
+        @Override
+        public MemorySegment allocate(final long byteSize, final long byteAlignment) {
+            return malloc(byteSize + MAX_ALIGN).reinterpret(this, MallocArena::free);
+        }
+
+        @Override
+        public MemorySegment.Scope scope() {
+            return arena.scope();
+        }
+
+        @Override
+        public void close() {
+            arena.close();
+        }
+
+        /**
+         * Calls {@code malloc} and sizes the resulting segment to {@code byteSize}.
+         *
+         * @throws OutOfMemoryError if {@code malloc} returns NULL
+         */
+        private static MemorySegment malloc(final long byteSize) {
+            final MemorySegment block;
+            try {
+                block = (MemorySegment) MALLOC.invokeExact(byteSize);
+            }
+            catch (Throwable e) {
+                throw new RuntimeException(e);
+            }
+            // malloc reports failure by returning NULL. Without this check the NULL address would be
+            // passed on to raw memory accesses and crash the JVM instead of raising an error.
+            // The check sits outside the try block so the error is not wrapped by the catch above.
+            if (block.address() == 0L) {
+                throw new OutOfMemoryError("malloc failed to allocate " + byteSize + " bytes");
+            }
+            return block.reinterpret(byteSize);
+        }
+
+        /**
+         * Releases a block previously obtained from {@code malloc} or {@code calloc}.
+         */
+        protected static void free(final MemorySegment address) {
+            try {
+                FREE.invokeExact(address);
+            }
+            catch (Throwable e) {
+                throw new RuntimeException(e);
+            }
+        }
+    }
+
+    /**
+     * Variant of {@link MallocArena} that allocates with {@code calloc}, so the returned memory is
+     * zero-initialised. Like the parent, each request is over-allocated by {@code MAX_ALIGN} bytes
+     * and the {@code byteAlignment} argument is left to the caller to apply.
+     */
+    private static class CallocArena extends MallocArena {
+        /**
+         * @param arena arena whose scope bounds the lifetime of every segment handed out
+         */
+        public CallocArena(Arena arena) {
+            super(arena);
+        }
+
+        /**
+         * Allocates {@code byteSize + MAX_ALIGN} zeroed bytes with {@code calloc}.
+         *
+         * @param byteSize      number of usable bytes the caller needs after aligning
+         * @param byteAlignment ignored; callers align the address manually within the slack
+         * @return a zero-filled segment of {@code byteSize + MAX_ALIGN} bytes bound to this arena's scope
+         * @throws OutOfMemoryError if {@code calloc} cannot satisfy the request
+         */
+        @Override
+        public MemorySegment allocate(final long byteSize, final long byteAlignment) {
+            return calloc(byteSize + MAX_ALIGN).reinterpret(this, MallocArena::free);
+        }
+
+        /**
+         * Calls {@code calloc(1, byteSize)} and sizes the resulting segment to {@code byteSize}.
+         *
+         * @throws OutOfMemoryError if {@code calloc} returns NULL
+         */
+        private static MemorySegment calloc(final long byteSize) {
+            final MemorySegment block;
+            try {
+                block = (MemorySegment) MallocArena.CALLOC.invokeExact(1L, byteSize);
+            }
+            catch (Throwable e) {
+                throw new RuntimeException(e);
+            }
+            // calloc reports failure by returning NULL. Without this check the NULL address would be
+            // passed on to raw memory accesses and crash the JVM instead of raising an error.
+            // The check sits outside the try block so the error is not wrapped by the catch above.
+            if (block.address() == 0L) {
+                throw new OutOfMemoryError("calloc failed to allocate " + byteSize + " bytes");
+            }
+            return block.reinterpret(byteSize);
+        }
+    }
+
+    /**
+     * Pairs a name with a numeric id and sorts by name.
+     */
+    private record AggregatedMeasurement(String name, long id) implements Comparable<AggregatedMeasurement> {
+
+        /**
+         * Orders two measurements by the natural ordering of their names; {@code id} plays no part
+         * in the comparison.
+         */
+        @Override
+        public int compareTo(final AggregatedMeasurement other) {
+            final String otherName = other.name();
+            return name().compareTo(otherName);
+        }
+    }
+}

From b374643a1f2ca938063c006f5c37668383ebb088 Mon Sep 17 00:00:00 2001
From: Jason Nochlin <91577+hundredwatt@users.noreply.github.com>
Date: Mon, 22 Jan 2024 01:20:26 -0700
Subject: [PATCH 099/268] evaluate.sh: Add note for "using Unsafe" (#547)

* fix typo

* automatically label entries using Unsafe

* fix for entry in src/main/java-22/

* backfill leaderboard

---------

Co-authored-by: Jason Nochlin <hundredwatt@users.noreply.github.com>
---
 README.md   | 54 ++++++++++++++++++++++++++---------------------------
 evaluate.sh |  8 +++++++-
 2 files changed, 34 insertions(+), 28 deletions(-)

diff --git a/README.md b/README.md
index 9cb9cf9b3..0fa040072 100644
--- a/README.md
+++ b/README.md
@@ -41,51 +41,51 @@ These are the results from running all entries into the challenge on eight cores
 
 | # | Result (m:s.ms) | Implementation     | JDK | Submitter     | Notes     |
 |---|-----------------|--------------------|-----|---------------|-----------|
-| 1 | 00:02.195 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_thomaswue.java)| 21.0.1-graal | [Thomas Wuerthinger](https://github.com/thomaswue), [Quan Anh Mai](https://github.com/merykitty), [Alfonso² Peterssen](https://github.com/mukel) | GraalVM native binary |
-| 2 | 00:02.248 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java)| 21.0.1-graal | [Artsiom Korzun](https://github.com/artsiomkorzun) | GraalVM native binary |
-| 3* | 00:02.313 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_royvanrijn.java)| 21.0.1-graal | [Roy van Rijn](https://github.com/royvanrijn) | GraalVM native binary |
-| 3* | 00:02.336 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java)| 21.0.1-graal | [Van Phu DO](https://github.com/abeobk) | GraalVM native binary |
-|   | 00:02.575 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_merykittyunsafe.java)| 21.0.1-open | [Quan Anh Mai](https://github.com/merykitty) | Quan Anh Mai's implementation, using `Unsafe` |
-|   | 00:02.909 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jerrinot.java)| 21.0.1-graal | [Jaromir Hamala](https://github.com/jerrinot) |  |
+| 1 | 00:02.195 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_thomaswue.java)| 21.0.1-graal | [Thomas Wuerthinger](https://github.com/thomaswue), [Quan Anh Mai](https://github.com/merykitty), [Alfonso² Peterssen](https://github.com/mukel) | GraalVM native binary, uses Unsafe |
+| 2 | 00:02.248 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java)| 21.0.1-graal | [Artsiom Korzun](https://github.com/artsiomkorzun) | GraalVM native binary, uses Unsafe |
+| 3* | 00:02.313 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_royvanrijn.java)| 21.0.1-graal | [Roy van Rijn](https://github.com/royvanrijn) | GraalVM native binary, uses Unsafe |
+| 3* | 00:02.336 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java)| 21.0.1-graal | [Van Phu DO](https://github.com/abeobk) | GraalVM native binary, uses Unsafe |
+|   | 00:02.575 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_merykittyunsafe.java)| 21.0.1-open | [Quan Anh Mai](https://github.com/merykitty) | uses Unsafe |
+|   | 00:02.909 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jerrinot.java)| 21.0.1-graal | [Jaromir Hamala](https://github.com/jerrinot) | uses Unsafe |
 |   | 00:03.258 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_merykitty.java)| 21.0.1-open | [Quan Anh Mai](https://github.com/merykitty) |  |
-|   | 00:03.376 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_mtopolnik.java)| 21.0.1-graal | [Marko Topolnik](https://github.com/mtopolnik) |  |
+|   | 00:03.376 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_mtopolnik.java)| 21.0.1-graal | [Marko Topolnik](https://github.com/mtopolnik) | uses Unsafe |
 |   | 00:03.714 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_hundredwatt.java)| 21.0.1-graal | [Jason Nochlin](https://github.com/hundredwatt) |  |
-|   | 00:03.718 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_zerninv.java)| 21.0.1-graal | [zerninv](https://github.com/zerninv) |  |
+|   | 00:03.718 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_zerninv.java)| 21.0.1-graal | [zerninv](https://github.com/zerninv) | uses Unsafe |
 |   | 00:03.902 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jparera.java)| 21.0.1-open | [Juan Parera](https://github.com/jparera) |  |
 |   | 00:03.959 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_gonix.java)| 21.0.1-open | [gonix](https://github.com/gonix) |  |
-|   | 00:03.966 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jincongho.java)| 21.0.1-open | [Jin Cong Ho](https://github.com/jincongho) |  |
-|   | 00:03.990 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ebarlas.java)| 21.0.1-graal | [Elliot Barlas](https://github.com/ebarlas) |  |
-|   | 00:04.066 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_JesseVanRooy.java)| 21.0.1-open | [JesseVanRooy](https://github.com/JesseVanRooy) |  |
-|   | 00:04.154 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_iziamos.java)| 21.0.1-open | [John Ziamos](https://github.com/iziamos) |  |
-|   | 00:04.551 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_roman_r_m.java)| 21.0.1-graal | [Roman Musin](https://github.com/roman-r-m) |  |
-|   | 00:04.741 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_cliffclick.java)| 21.0.1-open | [Cliff Click](https://github.com/cliffclick) |  |
-|   | 00:04.823 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_JamalMulla.java)| 21.0.1-graal | [Jamal Mulla](https://github.com/JamalMulla) |  |
+|   | 00:03.966 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jincongho.java)| 21.0.1-open | [Jin Cong Ho](https://github.com/jincongho) | uses Unsafe |
+|   | 00:03.990 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ebarlas.java)| 21.0.1-graal | [Elliot Barlas](https://github.com/ebarlas) | uses Unsafe |
+|   | 00:04.066 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_JesseVanRooy.java)| 21.0.1-open | [JesseVanRooy](https://github.com/JesseVanRooy) | uses Unsafe |
+|   | 00:04.154 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_iziamos.java)| 21.0.1-open | [John Ziamos](https://github.com/iziamos) | uses Unsafe |
+|   | 00:04.551 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_roman_r_m.java)| 21.0.1-graal | [Roman Musin](https://github.com/roman-r-m) | uses Unsafe |
+|   | 00:04.741 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_cliffclick.java)| 21.0.1-open | [Cliff Click](https://github.com/cliffclick) | uses Unsafe |
+|   | 00:04.823 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_JamalMulla.java)| 21.0.1-graal | [Jamal Mulla](https://github.com/JamalMulla) | uses Unsafe |
 |   | 00:04.920 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_vemana.java)| 21.0.1-graal | [Subrahmanyam](https://github.com/vemana) |  |
-|   | 00:04.959 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_yavuztas.java)| 21.0.1-graal | [Yavuz Tas](https://github.com/yavuztas) |  |
+|   | 00:04.959 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_yavuztas.java)| 21.0.1-graal | [Yavuz Tas](https://github.com/yavuztas) | uses Unsafe |
 |   | 00:05.142 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_arjenw.java)| 21.0.1-open | [Arjen Wisse](https://github.com/arjenw) |  |
 |   | 00:05.235 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_unbounded.java)| 21.0.1-open | [unbounded](https://github.com/unbounded) |  |
 |   | 00:05.336 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_plevart.java)| 21.0.1-tem | [Peter Levart](https://github.com/plevart) |  |
-|   | 00:05.478 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_obourgain.java)| 21.0.1-open | [Olivier Bourgain](https://github.com/obourgain) |  |
-|   | 00:05.887 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_charlibot.java)| 21.0.1-graal | [Charlie Evans](https://github.com/charlibot) |  |
-|   | 00:05.960 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_vaidhy.java)| 21.0.1-graal | [Vaidhy Mayilrangam](https://github.com/vaidhy) |  |
+|   | 00:05.478 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_obourgain.java)| 21.0.1-open | [Olivier Bourgain](https://github.com/obourgain) | uses Unsafe |
+|   | 00:05.887 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_charlibot.java)| 21.0.1-graal | [Charlie Evans](https://github.com/charlibot) | uses Unsafe |
+|   | 00:05.960 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_vaidhy.java)| 21.0.1-graal | [Vaidhy Mayilrangam](https://github.com/vaidhy) | uses Unsafe |
 |   | 00:05.979 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_spullara.java)| 21.0.1-graal | [Sam Pullara](https://github.com/spullara) |  |
 |   | 00:06.166 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_isolgpus.java)| 21.0.1-open | [Jamie Stansfield](https://github.com/isolgpus) |  |
-|   | 00:06.257 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_flippingbits.java)| 21.0.1-graal | [Stefan Sprenger](https://github.com/flippingbits) |  |
-|   | 00:06.415 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_armandino.java)| 21.0.1-open | [Arman Sharif](https://github.com/armandino) |  |
+|   | 00:06.257 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_flippingbits.java)| 21.0.1-graal | [Stefan Sprenger](https://github.com/flippingbits) | uses Unsafe |
+|   | 00:06.415 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_armandino.java)| 21.0.1-open | [Arman Sharif](https://github.com/armandino) | uses Unsafe |
 |   | 00:06.654 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jbachorik.java)| 21.0.1-graal | [Jaroslav Bachorik](https://github.com/jbachorik) |  |
-|   | 00:06.576 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_as-com.java)| 21.0.1-open | [Andrew Sun](https://github.com/as-com) |  |
+|   | 00:06.576 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_as-com.java)| 21.0.1-open | [Andrew Sun](https://github.com/as-com) | uses Unsafe |
 |   | 00:06.715 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_algirdasrascius.java)| 21.0.1-open | [Algirdas Raščius](https://github.com/algirdasrascius) |  |
 |   | 00:06.872 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ianopolousfast.java)| 21.0.1-open | [Dr Ian Preston](https://github.com/ianopolousfast) |  |
 |   | 00:07.240 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_giovannicuccu.java)| java | [giovannicuccu](https://github.com/giovannicuccu) |  |
-|   | 00:07.680 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_C5H12O5.java)| 21.0.1-graal | [Xylitol](https://github.com/C5H12O5) |  |
+|   | 00:07.680 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_C5H12O5.java)| 21.0.1-graal | [Xylitol](https://github.com/C5H12O5) | uses Unsafe |
 |   | 00:07.730 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jotschi.java)| 21.0.1-open | [Johannes Schüth](https://github.com/jotschi) |  |
 |   | 00:07.925 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ricardopieper.java)| 21.0.1-graal | [Ricardo Pieper](https://github.com/ricardopieper) |  |
 |   | 00:07.913 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_parkertimmins.java)| 21.0.1-open | [parkertimmins](https://github.com/parkertimmins) |  |
 |   | 00:08.167 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ddimtirov.java)| 21.0.1-tem | [Dimitar Dimitrov](https://github.com/ddimtirov) |  |
 |   | 00:08.214 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_deemkeen.java)| 21.0.1-open | [deemkeen](https://github.com/deemkeen) |  |
-|   | 00:08.398 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_artpar.java)| 21.0.1-open | [Parth Mudgal](https://github.com/artpar) |  |
+|   | 00:08.398 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_artpar.java)| 21.0.1-open | [Parth Mudgal](https://github.com/artpar) | uses Unsafe |
 |   | 00:08.489 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_gnabyl.java)| 21.0.1-graal | [Bang NGUYEN](https://github.com/gnabyl) |  |
-|   | 00:08.517 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ags313.java)| 21.0.1-graal | [ags](https://github.com/ags313) |  |
+|   | 00:08.517 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ags313.java)| 21.0.1-graal | [ags](https://github.com/ags313) | uses Unsafe |
 |   | 00:08.557 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_adriacabeza.java)| 21.0.1-graal | [Adrià Cabeza](https://github.com/adriacabeza) |  |
 |   | 00:08.622 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_kuduwa-keshavram.java)| 21.0.1-graal | [Keshavram Kuduwa](https://github.com/kuduwa-keshavram) |  |
 |   | 00:08.689 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_gamlerhart.java)| 21.0.1-open | [Roman Stoffel](https://github.com/gamlerhart) |  |
@@ -97,7 +97,7 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:09.867 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ricardopieper.java)| 21.0.1-graal | [Ricardo Pieper](https://github.com/ricardopieper) |  |
 |   | 00:09.945 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_japplis.java)| 21.0.1-open | [Anthony Goubard](https://github.com/japplis) |  |
 |   | 00:10.092 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_phd3.java)| 21.0.1-graal | [Pratham](https://github.com/phd3) |  |
-|   | 00:10.127 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_artpar.java)| 21.0.1-open | [Parth Mudgal](https://github.com/artpar) |  |
+|   | 00:10.127 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_artpar.java)| 21.0.1-open | [Parth Mudgal](https://github.com/artpar) | uses Unsafe |
 |   | 00:11.577 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_netrunnereve.java)| 21.0.1-open | [Eve](https://github.com/netrunnereve) |  |
 |   | 00:10.473 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_raipc.java)| 21.0.1-open | [Anton Rybochkin](https://github.com/raipc) |  |
 |   | 00:11.119 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_lawrey.java)| 21.0.1-open | [lawrey](https://github.com/lawrey) |  |
@@ -105,7 +105,7 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:11.405 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_imrafaelmerino.java)| 21.0.1-graal | [Rafael Merino García](https://github.com/imrafaelmerino) |  |
 |   | 00:11.433 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jatingala.java)| 21.0.1-graal | [Jatin Gala](https://github.com/jatingala) |  |
 |   | 00:11.805 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_coolmineman.java)| 21.0.1-graal | [Cool_Mineman](https://github.com/coolmineman) |  |
-|   | 00:11.878 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_karthikeyan97.java)| 21.0.1-open | [karthikeyan97](https://github.com/karthikeyan97) |  |
+|   | 00:11.878 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_karthikeyan97.java)| 21.0.1-open | [karthikeyan97](https://github.com/karthikeyan97) | uses Unsafe |
 |   | 00:11.934 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_arjenvaneerde.java)| 21.0.1-open | [arjenvaneerde](https://github.com/arjenvaneerde) |  |
 |   | 00:12.051 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_dmitry-midokura.java)| 21.0.1-open | [Dmitry Bufistov](https://github.com/dmitry-midokura) |  |
 |   | 00:12.102 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_YannMoisan.java)| java | [Yann Moisan](https://github.com/YannMoisan) |  |
diff --git a/evaluate.sh b/evaluate.sh
index caafbb15c..c0be8b50b 100755
--- a/evaluate.sh
+++ b/evaluate.sh
@@ -19,7 +19,7 @@ set -eo pipefail
 
 if [ -z "$1" ]
   then
-    echo "Usage: evaluate2.sh <fork name> (<fork name 2> ...)"
+    echo "Usage: evaluate.sh <fork name> (<fork name 2> ...)"
     echo " for each fork, there must be a 'calculate_average_<fork name>.sh' script and an optional 'prepare_<fork name>.sh'."
     exit 1
 fi
@@ -271,6 +271,12 @@ for fork in "$@"; do
     fi
   fi
 
+  # check if Java source file uses Unsafe
+  if grep -F "theUnsafe" -q ./src/main/java*/dev/morling/onebrc/CalculateAverage_$fork.java ; then
+    # if notes is not empty, append a comma and space before the unsafe note
+    notes="${notes:+$notes, }uses Unsafe"
+  fi
+
   echo -n "$trimmed_mean;" >> $leaderboard_temp_file # for sorting
   echo -n "| # " >> $leaderboard_temp_file
   echo -n "| $trimmed_mean_formatted " >> $leaderboard_temp_file

From e16ad7660d051972bac7fcdcec3fd92f6c4445da Mon Sep 17 00:00:00 2001
From: Sarkie <Sarkie@users.noreply.github.com>
Date: Mon, 22 Jan 2024 10:00:01 +0000
Subject: [PATCH 100/268] Update README.md to add the _baseline (#552)

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 0fa040072..3ff82220b 100644
--- a/README.md
+++ b/README.md
@@ -318,7 +318,7 @@ To submit your own implementation to 1BRC, follow these steps:
 
 * Create a fork of the [onebrc](https://github.com/gunnarmorling/onebrc/) GitHub repository.
 * Run `./create_fork.sh <your_GH_user>` to copy the baseline implementation to your personal files, or do this manually:
-  * Create a copy of _CalculateAverage.java_, named _CalculateAverage\_<your_GH_user>.java_, e.g. _CalculateAverage\_doloreswilson.java_.
+  * Create a copy of _CalculateAverage\_baseline.java_, named _CalculateAverage\_<your_GH_user>.java_, e.g. _CalculateAverage\_doloreswilson.java_.
   * Create a copy of _calculate\_average\_baseline.sh_, named _calculate\_average\_<your_GH_user>.sh_, e.g. _calculate\_average\_doloreswilson.sh_.
   * Adjust that script so that it references your implementation class name. If needed, provide any JVM arguments via the `JAVA_OPTS` variable in that script.
     Make sure that script does not write anything to standard output other than calculation results.

From df891354d303c0d8e7e6c2558d34544065318d41 Mon Sep 17 00:00:00 2001
From: Elliot Barlas <elliotbarlas@gmail.com>
Date: Tue, 23 Jan 2024 07:31:04 -0800
Subject: [PATCH 101/268] Inline and optimize value parsing code for each of
 the four semicolon position processing branches. This provides a small but
 noticeable speed-up. It also expands and obfuscates the code, unfortunately.
 (#563)

---
 .../onebrc/CalculateAverage_ebarlas.java      | 159 ++++++++++++------
 1 file changed, 112 insertions(+), 47 deletions(-)

diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_ebarlas.java b/src/main/java/dev/morling/onebrc/CalculateAverage_ebarlas.java
index 87bba124b..3d8ad900d 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_ebarlas.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_ebarlas.java
@@ -185,48 +185,114 @@ private static long doProcessSegment(MemorySegment ms, long offset, Stats[] stat
             long keyAddr = keyBaseAddr; // address for next int
             int keyArrLen = 0; // number of key 4-byte ints
             int keyLastBytes; // occupancy in last byte (1, 2, 3, or 4)
-            byte b0, b1, b2, b3;
+            int val;
             while (true) {
                 int n = UNSAFE.getInt(cursor);
                 cursor += 4;
-                b0 = (byte) (n & 0xFF);
-                b1 = (byte) ((n >> 8) & 0xFF);
-                b2 = (byte) ((n >> 16) & 0xFF);
-                b3 = (byte) ((n >> 24) & 0xFF);
-                if (b0 == ';') { // ...;1.1
+                if ((n & 0xFF) == ';') { // ;vvv
                     UNSAFE.putInt(keyAddr, 0); // always pad with extra int to facilitate 8-byte aligned comparisons
                     keyLastBytes = 4;
-                    b0 = b1;
-                    b1 = b2;
-                    b2 = b3;
-                    b3 = (byte) (UNSAFE.getByte(cursor++) & 0xFF);
+                    byte b0 = (byte) ((n >> 8) & 0xFF);
+                    byte b1 = (byte) ((n >> 16) & 0xFF);
+                    byte b2 = (byte) ((n >> 24) & 0xFF);
+                    if (b0 == '-') {
+                        if (b2 != '.') { // 6 bytes: -dd.dn
+                            cursor++; // decimal point
+                            byte b4 = UNSAFE.getByte(cursor);
+                            cursor += 2; // adv beyond digit and newline
+                            val = -(((b1 - '0') * 10 + (b2 - '0')) * 10 + (b4 - '0'));
+                        }
+                        else { // 5 bytes: -d.dn
+                            byte b3 = UNSAFE.getByte(cursor);
+                            cursor += 2; // digit and newline
+                            val = -((b1 - '0') * 10 + (b3 - '0'));
+                        }
+                    }
+                    else {
+                        if (b1 != '.') { // 5 bytes: dd.dn
+                            var b3 = UNSAFE.getByte(cursor);
+                            cursor += 2; // digit and newline
+                            val = ((b0 - '0') * 10 + (b1 - '0')) * 10 + (b3 - '0');
+                        }
+                        else { // 4 bytes: d.dn
+                            cursor++; // newline
+                            val = (b0 - '0') * 10 + (b2 - '0');
+                        }
+                    }
                     break;
                 }
-                else if (b1 == ';') { // ...a;1.1
+                else if ((n & 0xFF00) == 0x3b00) { // k;vv
                     int k = n & 0xFF;
                     UNSAFE.putLong(keyAddr, k); // pad with extra int for comparison alignment
                     keyLastBytes = 1;
                     keyArrLen++;
                     keyHash += k;
-                    b0 = b2;
-                    b1 = b3;
-                    b2 = (byte) (UNSAFE.getByte(cursor++) & 0xFF);
-                    b3 = (byte) (UNSAFE.getByte(cursor++) & 0xFF);
+                    byte b0 = (byte) ((n >> 16) & 0xFF);
+                    byte b1 = (byte) ((n >> 24) & 0xFF);
+                    byte b2 = UNSAFE.getByte(cursor++);
+                    if (b0 == '-') {
+                        if (b2 != '.') { // 6 bytes: -dd.dn
+                            cursor++; // decimal point
+                            byte b4 = UNSAFE.getByte(cursor);
+                            cursor += 2; // adv beyond digit and newline
+                            val = -(((b1 - '0') * 10 + (b2 - '0')) * 10 + (b4 - '0'));
+                        }
+                        else { // 5 bytes: -d.dn
+                            byte b3 = UNSAFE.getByte(cursor);
+                            cursor += 2; // digit newline
+                            val = -((b1 - '0') * 10 + (b3 - '0'));
+                        }
+                    }
+                    else {
+                        if (b1 != '.') { // 5 bytes: dd.dn
+                            byte b3 = UNSAFE.getByte(cursor);
+                            cursor += 2; // newline
+                            val = ((b0 - '0') * 10 + (b1 - '0')) * 10 + (b3 - '0');
+                        }
+                        else { // 4 bytes: d.dn
+                            cursor++;
+                            val = (b0 - '0') * 10 + (b2 - '0');
+                        }
+                    }
                     break;
                 }
-                else if (b2 == ';') { // ...ab;1.1
+                else if ((n & 0xFF0000) == 0x3b0000) { // kk;v
                     int k = n & 0xFFFF;
                     UNSAFE.putLong(keyAddr, k); // pad with extra int for comparison alignment
                     keyLastBytes = 2;
                     keyArrLen++;
                     keyHash += k;
-                    b0 = b3;
-                    b1 = (byte) (UNSAFE.getByte(cursor++) & 0xFF);
-                    b2 = (byte) (UNSAFE.getByte(cursor++) & 0xFF);
-                    b3 = (byte) (UNSAFE.getByte(cursor++) & 0xFF);
+                    byte b0 = (byte) ((n >> 24) & 0xFF);
+                    if (b0 == '-') {
+                        n = UNSAFE.getInt(cursor);
+                        cursor += 4;
+                        byte b1 = (byte) (n & 0xFF);
+                        byte b2 = (byte) ((n >> 8) & 0xFF);
+                        byte b3 = (byte) ((n >> 16) & 0xFF);
+                        if (b2 != '.') { // 6 bytes: -dd.dn
+                            byte b4 = (byte) ((n >> 24) & 0xFF);
+                            cursor++; // newline
+                            val = -(((b1 - '0') * 10 + (b2 - '0')) * 10 + (b4 - '0'));
+                        }
+                        else { // 5 bytes: -d.dn
+                            val = -((b1 - '0') * 10 + (b3 - '0'));
+                        }
+                    }
+                    else {
+                        byte b1 = UNSAFE.getByte(cursor++);
+                        byte b2 = UNSAFE.getByte(cursor++);
+                        byte b3 = UNSAFE.getByte(cursor++);
+                        if (b1 != '.') { // 5 bytes: dd.dn
+                            cursor++; // newline
+                            val = ((b0 - '0') * 10 + (b1 - '0')) * 10 + (b3 - '0');
+                        }
+                        else { // 4 bytes: d.dn
+                            val = (b0 - '0') * 10 + (b2 - '0');
+                        }
+                    }
                     break;
                 }
-                else if (b3 == ';') { // ...abc;1.1
+                else if ((n & 0xFF000000) == 0x3b000000) { // kkk;
                     int k = n & 0xFFFFFF;
                     UNSAFE.putLong(keyAddr, k); // pad with extra int for comparison alignment
                     keyLastBytes = 3;
@@ -234,13 +300,33 @@ private static long doProcessSegment(MemorySegment ms, long offset, Stats[] stat
                     keyHash += k;
                     n = UNSAFE.getInt(cursor);
                     cursor += 4;
-                    b0 = (byte) (n & 0xFF);
-                    b1 = (byte) ((n >> 8) & 0xFF);
-                    b2 = (byte) ((n >> 16) & 0xFF);
-                    b3 = (byte) ((n >> 24) & 0xFF);
+                    byte b0 = (byte) (n & 0xFF);
+                    byte b1 = (byte) ((n >> 8) & 0xFF);
+                    byte b2 = (byte) ((n >> 16) & 0xFF);
+                    byte b3 = (byte) ((n >> 24) & 0xFF);
+                    if (b0 == '-') {
+                        if (b2 != '.') { // 6 bytes: -dd.dn
+                            byte b4 = UNSAFE.getByte(cursor);
+                            cursor += 2; // adv beyond digit and newline
+                            val = -(((b1 - '0') * 10 + (b2 - '0')) * 10 + (b4 - '0'));
+                        }
+                        else { // 5 bytes: -d.dn
+                            cursor++; // newline
+                            val = -((b1 - '0') * 10 + (b3 - '0'));
+                        }
+                    }
+                    else {
+                        if (b1 != '.') { // 5 bytes: dd.dn
+                            cursor++; // newline
+                            val = ((b0 - '0') * 10 + (b1 - '0')) * 10 + (b3 - '0');
+                        }
+                        else { // 4 bytes: d.dn
+                            val = (b0 - '0') * 10 + (b2 - '0');
+                        }
+                    }
                     break;
                 }
-                else {
+                else { // kkkk
                     UNSAFE.putInt(keyAddr, n);
                     keyArrLen++;
                     keyAddr += 4;
@@ -256,27 +342,6 @@ private static long doProcessSegment(MemorySegment ms, long offset, Stats[] stat
             else if (!equals(st.keyAddr, st.keyLen, keyBaseAddr, keyArrLen)) {
                 st = findInTable(stats, keyHash, keyBaseAddr, keyArrLen, keyLastBytes);
             }
-            int val;
-            if (b0 == '-') {
-                if (b2 != '.') { // 6 bytes: -dd.dn
-                    var b = UNSAFE.getByte(cursor);
-                    cursor += 2; // adv beyond digit and newline
-                    val = -(((b1 - '0') * 10 + (b2 - '0')) * 10 + (b - '0'));
-                }
-                else { // 5 bytes: -d.dn
-                    cursor++; // newline
-                    val = -((b1 - '0') * 10 + (b3 - '0'));
-                }
-            }
-            else {
-                if (b1 != '.') { // 5 bytes: dd.dn
-                    cursor++; // newline
-                    val = ((b0 - '0') * 10 + (b1 - '0')) * 10 + (b3 - '0');
-                }
-                else { // 4 bytes: d.dn
-                    val = (b0 - '0') * 10 + (b2 - '0');
-                }
-            }
             st.min = Math.min(st.min, val);
             st.max = Math.max(st.max, val);
             st.sum += val;

From f7febea2f6277263665365a4cbd0b36343159245 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Antonio=20Mu=C3=B1oz?= <antoniogmc@gmail.com>
Date: Tue, 23 Jan 2024 16:35:05 +0100
Subject: [PATCH 102/268] tonivade implementation (try 2) (#541)

* tonivade implementation

* synchronized block performs better than ReentrantLock

* remove ConcurrentHashMap

* refactor

* use HashMap.newHashMap

* change double to int

* minor refactor

* fix
---
 calculate_average_tonivade.sh                 |  19 ++
 prepare_tonivade.sh                           |  20 ++
 .../onebrc/CalculateAverage_tonivade.java     | 268 ++++++++++++++++++
 3 files changed, 307 insertions(+)
 create mode 100755 calculate_average_tonivade.sh
 create mode 100755 prepare_tonivade.sh
 create mode 100644 src/main/java/dev/morling/onebrc/CalculateAverage_tonivade.java

diff --git a/calculate_average_tonivade.sh b/calculate_average_tonivade.sh
new file mode 100755
index 000000000..5e160f9ff
--- /dev/null
+++ b/calculate_average_tonivade.sh
@@ -0,0 +1,19 @@
+#!/bin/sh
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+JAVA_OPTS="-Xmx1G -Xms1G -XX:+AlwaysPreTouch --enable-preview"
+java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_tonivade
diff --git a/prepare_tonivade.sh b/prepare_tonivade.sh
new file mode 100755
index 000000000..66b23f679
--- /dev/null
+++ b/prepare_tonivade.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+# Select the JDK for this entry via SDKMAN; its output is redirected to stderr
+source "$HOME/.sdkman/bin/sdkman-init.sh"
+sdk use java 21.0.1-tem 1>&2
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_tonivade.java b/src/main/java/dev/morling/onebrc/CalculateAverage_tonivade.java
new file mode 100644
index 000000000..bd284888a
--- /dev/null
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_tonivade.java
@@ -0,0 +1,268 @@
+/*
+ *  Copyright 2023 The original authors
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package dev.morling.onebrc;
+
+import static java.util.Comparator.comparing;
+import static java.util.stream.Collectors.joining;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.channels.FileChannel;
+import java.nio.channels.FileChannel.MapMode;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Paths;
+import java.nio.file.StandardOpenOption;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.StructuredTaskScope;
+import java.util.concurrent.StructuredTaskScope.Subtask;
+
+public class CalculateAverage_tonivade {
+
+    private static final String FILE = "./measurements.txt";
+
+    private static final int EOL = 10;
+    private static final int MINUS = 45;
+    private static final int SEMICOLON = 59;
+    // Entry point: aggregates FILE per station and prints the formatted result to standard output.
+    public static void main(String[] args) throws IOException, InterruptedException, ExecutionException {
+        var result = readFile();
+
+        var measurements = getMeasurements(result);
+
+        System.out.println(measurements);
+    }
+    // Outcome of parsing one chunk: end is the buffer position where parsing stopped, map the stations seen.
+    static record PartialResult(int end, Map<Name, Station> map) {
+        // Folds this chunk's stations into result, combining entries with the same name via Station::merge.
+        void merge(Map<Name, Station> result) {
+            map.forEach((name, station) -> result.merge(name, station, Station::merge));
+        }
+    }
+    // Renders every station via Station::asString, sorted by name, joined with ", " and wrapped in "{" and "}".
+    private static String getMeasurements(Map<Name, Station> result) {
+        return result.values().stream().sorted(comparing(Station::getName))
+                .map(Station::asString).collect(joining(", ", "{", "}"));
+    }
+    // Maps FILE in windows of at most Integer.MAX_VALUE bytes, parses each window and merges the per-chunk results.
+    private static Map<Name, Station> readFile() throws IOException, InterruptedException, ExecutionException {
+        Map<Name, Station> result = HashMap.newHashMap(10_000);
+        try (var channel = FileChannel.open(Paths.get(FILE), StandardOpenOption.READ)) {
+            long consumed = 0;
+            long remaining = channel.size();
+            while (remaining > 0) {
+                var buffer = channel.map(
+                        MapMode.READ_ONLY, consumed, Math.min(remaining, Integer.MAX_VALUE));
+                // windows of at most 1024 bytes are parsed directly on the calling thread
+                if (buffer.remaining() <= 1024) {
+                    var partialResult = readChunk(buffer, 0, buffer.remaining());
+
+                    consumed += partialResult.end();
+                    remaining -= partialResult.end();
+
+                    partialResult.merge(result);
+                }
+                else {
+                    var chunks = Runtime.getRuntime().availableProcessors();
+                    var chunksSize = buffer.remaining() / chunks;
+                    var leftover = buffer.remaining() % chunks;
+                    // only the last chunk absorbs the division remainder, so chunk ranges never overlap
+                    try (var scope = new StructuredTaskScope.ShutdownOnFailure()) {
+                        var tasks = new ArrayList<Subtask<PartialResult>>(chunks);
+                        for (int i = 0; i < chunks; i++) {
+                            int start = i * chunksSize;
+                            int length = chunksSize + (i == chunks - 1 ? leftover : 0);
+                            tasks.add(scope.fork(() -> readChunk(
+                                    buffer, findStart(buffer, start), start + length)));
+                        }
+                        scope.join();
+                        scope.throwIfFailed();
+                        // merge on the calling thread; the last chunk's end() is where the next window resumes
+                        for (var subtask : tasks) {
+                            subtask.get().merge(result);
+                        }
+                        consumed += tasks.getLast().get().end();
+                        remaining -= tasks.getLast().get().end();
+                    }
+                }
+            }
+        }
+        return result;
+    }
+    // Parses the complete "name;temperature" lines found in [start, end) of buffer and aggregates them per station.
+    private static PartialResult readChunk(ByteBuffer buffer, int start, int end) {
+        final byte[] name = new byte[128];
+        final byte[] temp = new byte[8];
+        final Map<Name, Station> map = HashMap.newHashMap(1000);
+        int position = start;
+        while (position < end) {
+            int semicolon = readName(buffer, position, end - position, name);
+            if (semicolon < 0) {
+                break;
+            }
+            // a line cut off by the chunk end is left unparsed; position then stays at that line's start
+            int endOfLine = readTemp(buffer, semicolon + 1, end - semicolon - 1, temp);
+            if (endOfLine < 0) {
+                break;
+            }
+            // name holds the bytes before the semicolon, temp the bytes between the semicolon and the end of line
+            map.computeIfAbsent(new Name(name, semicolon - position), Station::new)
+                    .add(parseTemp(temp, endOfLine - semicolon - 1));
+
+            // skip end of line
+            position = endOfLine + 1;
+        }
+        return new PartialResult(position, map);
+    }
+    // Returns the index of the first byte of the line containing start (start itself if it already begins a line).
+    private static int findStart(ByteBuffer buffer, int start) {
+        if (start <= 0 || buffer.get(start - 1) == EOL) {
+            return start; // already on a line boundary
+        }
+        for (int i = start - 2; i >= 0; i--) {
+            if (buffer.get(i) == EOL) {
+                return i + 1;
+            }
+        }
+        return 0; // no EOL before start: the line begins with the buffer
+    }
+    // Copies the bytes before the next ';' into name; returns the index of the ';', or -1 if none within length bytes.
+    private static int readName(ByteBuffer buffer, int offset, int length, byte[] name) {
+        return readUntil(buffer, offset, length, name, SEMICOLON);
+    }
+    // Copies the bytes before the next EOL into percentage; returns the index of the EOL, or -1 if none within length bytes.
+    private static int readTemp(ByteBuffer buffer, int offset, int length, byte[] percentage) {
+        return readUntil(buffer, offset, length, percentage, EOL);
+    }
+    // Copies bytes from buffer, starting at offset, into array until target is met; returns target's buffer index, or -1.
+    private static int readUntil(ByteBuffer buffer, int offset, int length, byte[] array, int target) {
+        for (int i = 0; i < length; i++) {
+            byte b = buffer.get(i + offset);
+            if (b == target) {
+                return i + offset;
+            }
+            array[i] = b; // no capacity check: a field longer than array.length throws
+        }
+        return -1;
+    }
+    // Parses the first length bytes of value and returns the temperature in tenths of a degree ("-12.3" gives -123).
+    // Input: a decimal between -99.9 (inclusive) and 99.9 (inclusive), always with one fractional digit.
+    private static int parseTemp(byte[] value, int length) {
+        int period = length - 2; // index of the '.', which precedes the single fractional digit
+        if (value[0] == MINUS) {
+            int left = parseLeft(value, 1, period - 1);
+            int right = toInt(value[period + 1]);
+            return -(left + right);
+        }
+        int left = parseLeft(value, 0, period);
+        int right = toInt(value[period + 1]);
+        return left + right;
+    }
+    // Returns the integer part starting at value[start], multiplied by 10; any length other than 1 is read as two digits.
+    private static int parseLeft(byte[] value, int start, int length) {
+        if (length == 1) {
+            return toInt(value[start]) * 10;
+        }
+        // two chars
+        int a = toInt(value[start]) * 100;
+        int b = toInt(value[start + 1]) * 10;
+        return a + b;
+    }
+    // Converts an ASCII digit byte to its numeric value (48 is the code of '0').
+    private static int toInt(byte c) {
+        return c - 48;
+    }
+    // Station name kept as a private copy of its raw bytes; equals/hashCode compare the byte content.
+    static final class Name {
+
+        private final byte[] value;
+        // copies the first length bytes of source, so the caller's array can be overwritten afterwards
+        Name(byte[] source, int length) {
+            value = new byte[length];
+            System.arraycopy(source, 0, value, 0, length);
+        }
+
+        @Override
+        public int hashCode() {
+            return Arrays.hashCode(value);
+        }
+
+        @Override
+        public boolean equals(Object obj) {
+            if (obj instanceof Name other) {
+                return Arrays.equals(value, other.value);
+            }
+            return false;
+        }
+
+        @Override
+        public String toString() {
+            return new String(value, StandardCharsets.UTF_8);
+        }
+    }
+    // Running min/max/sum/count of one station's readings; all temperatures are held in tenths of a degree.
+    static final class Station {
+
+        private final Name name;
+        // sum is a long like count: count readings of up to +/-999 tenths can exceed the int range
+        private int min = Integer.MAX_VALUE;
+        private int max = Integer.MIN_VALUE;
+        private long sum;
+        private long count;
+
+        Station(Name name) {
+            this.name = name;
+        }
+
+        String getName() {
+            return name.toString();
+        }
+        // records one reading, given in tenths of a degree
+        void add(int value) {
+            min = Math.min(min, value);
+            max = Math.max(max, value);
+            sum += value;
+            count++;
+        }
+        // folds other into this station and returns this, so it can serve as a Map.merge remapping function
+        Station merge(Station other) {
+            min = Math.min(min, other.min);
+            max = Math.max(max, other.max);
+            sum += other.sum;
+            count += other.count;
+            return this;
+        }
+
+        String asString() {
+            return name + "=" + toDouble(min) + "/" + round(mean()) + "/" + toDouble(max);
+        }
+
+        private double mean() {
+            return (sum / 10.) / count;
+        }
+
+        private double toDouble(int value) {
+            return value / 10.;
+        }
+
+        private double round(double value) {
+            return Math.round(value * 10.) / 10.;
+        }
+    }
+}

From 8bae1b87810f75ddf307ba0b84400d97e3e6f851 Mon Sep 17 00:00:00 2001
From: Dr Ian Preston <157221403+ianopolousfast@users.noreply.github.com>
Date: Tue, 23 Jan 2024 15:37:33 +0000
Subject: [PATCH 103/268] Use simd for name comparison (#568)

Co-authored-by: Ian Preston <ianopolous@protonmail.com>
---
 .../CalculateAverage_ianopolousfast.java      | 119 +++++-------------
 1 file changed, 32 insertions(+), 87 deletions(-)

diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_ianopolousfast.java b/src/main/java/dev/morling/onebrc/CalculateAverage_ianopolousfast.java
index 8944a472f..f1b4e7bf5 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_ianopolousfast.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_ianopolousfast.java
@@ -34,7 +34,7 @@
 /* A fast implementation with no unsafe.
  * Features:
  * * memory mapped file using preview Arena FFI
- * * semicolon finding using incubator vector api
+ * * semicolon finding and name comparison using incubator vector api
  * * read chunks in parallel
  * * minimise allocation
  * * no unsafe
@@ -80,12 +80,11 @@ public static void main(String[] args) throws Exception {
         System.out.println(merged);
     }
 
-    public static boolean matchingStationBytes(long start, long end, int offset, MemorySegment buffer, Stat existing) {
-        int len = (int) (end - start);
-        if (len != existing.name.length)
-            return false;
-        for (int i = offset; i < len; i++) {
-            if (existing.name[i] != buffer.get(JAVA_BYTE, offset + start++))
+    public static boolean matchingStationBytes(long start, long end, MemorySegment buffer, Stat existing) {
+        for (int index = 0; index < end - start; index += BYTE_SPECIES.vectorByteSize()) {
+            ByteVector line = ByteVector.fromMemorySegment(BYTE_SPECIES, buffer, start + index, ByteOrder.nativeOrder(), BYTE_SPECIES.indexInRange(start + index, end));
+            ByteVector found = ByteVector.fromArray(BYTE_SPECIES, existing.name, index);
+            if (!found.eq(line).allTrue())
                 return false;
         }
         return true;
@@ -98,21 +97,19 @@ private static int hashToIndex(long hash, int len) {
         return (finalHash & (len - 1));
     }
 
-    public static Stat parseStation(long start, long end, long first8, long second8,
-                                    MemorySegment buffer) {
+    public static Stat createStation(long start, long end, MemorySegment buffer) {
         byte[] stationBuffer = new byte[(int) (end - start)];
         for (long off = start; off < end; off++)
             stationBuffer[(int) (off - start)] = buffer.get(JAVA_BYTE, off);
-        return new Stat(stationBuffer, first8, second8);
+        return new Stat(stationBuffer);
     }
 
-    public static Stat dedupeStation(long start, long end, long hash, long first8, long second8,
-                                     MemorySegment buffer, List<List<Stat>> stations) {
+    public static Stat dedupeStation(long start, long end, long hash, MemorySegment buffer, List<List<Stat>> stations) {
         int index = hashToIndex(hash, MAX_STATIONS);
         List<Stat> matches = stations.get(index);
         if (matches == null) {
             List<Stat> value = new ArrayList<>();
-            Stat res = parseStation(start, end, first8, second8, buffer);
+            Stat res = createStation(start, end, buffer);
             value.add(res);
             stations.set(index, value);
             return res;
@@ -120,54 +117,10 @@ public static Stat dedupeStation(long start, long end, long hash, long first8, l
         else {
             for (int i = 0; i < matches.size(); i++) {
                 Stat s = matches.get(i);
-                if (first8 == s.first8 && second8 == s.second8 && matchingStationBytes(start, end, 16, buffer, s))
+                if (matchingStationBytes(start, end, buffer, s))
                     return s;
             }
-            Stat res = parseStation(start, end, first8, second8, buffer);
-            matches.add(res);
-            return res;
-        }
-    }
-
-    public static Stat dedupeStation8(long start, long end, long hash, long first8, MemorySegment buffer, List<List<Stat>> stations) {
-        int index = hashToIndex(hash, MAX_STATIONS);
-        List<Stat> matches = stations.get(index);
-        if (matches == null) {
-            List<Stat> value = new ArrayList<>();
-            Stat station = parseStation(start, end, first8, 0, buffer);
-            value.add(station);
-            stations.set(index, value);
-            return station;
-        }
-        else {
-            for (int i = 0; i < matches.size(); i++) {
-                Stat s = matches.get(i);
-                if (first8 == s.first8 && s.name.length <= 8)
-                    return s;
-            }
-            Stat station = parseStation(start, end, first8, 0, buffer);
-            matches.add(station);
-            return station;
-        }
-    }
-
-    public static Stat dedupeStation16(long start, long end, long hash, long first8, long second8, MemorySegment buffer, List<List<Stat>> stations) {
-        int index = hashToIndex(hash, MAX_STATIONS);
-        List<Stat> matches = stations.get(index);
-        if (matches == null) {
-            List<Stat> value = new ArrayList<>();
-            Stat res = parseStation(start, end, first8, second8, buffer);
-            value.add(res);
-            stations.set(index, value);
-            return res;
-        }
-        else {
-            for (int i = 0; i < matches.size(); i++) {
-                Stat s = matches.get(i);
-                if (first8 == s.first8 && second8 == s.second8 && s.name.length <= 16)
-                    return s;
-            }
-            Stat res = parseStation(start, end, first8, second8, buffer);
+            Stat res = createStation(start, end, buffer);
             matches.add(res);
             return res;
         }
@@ -181,32 +134,22 @@ public static Stat parseStation(long lineStart, MemorySegment buffer, List<List<
         ByteVector line = ByteVector.fromMemorySegment(BYTE_SPECIES, buffer, lineStart, ByteOrder.nativeOrder());
         int keySize = line.compare(VectorOperators.EQ, ';').firstTrue();
 
+        long first8 = buffer.get(LONG_LAYOUT, lineStart);
         if (keySize == BYTE_SPECIES.vectorByteSize()) {
             while (buffer.get(JAVA_BYTE, lineStart + keySize) != ';') {
                 keySize++;
             }
-            long first8 = buffer.get(LONG_LAYOUT, lineStart);
-            if (keySize < 8)
-                return dedupeStation8(lineStart, lineStart + keySize, first8, first8, buffer, stations);
             long second8 = buffer.get(LONG_LAYOUT, lineStart + 8);
-            if (keySize < 16)
-                return dedupeStation16(lineStart, lineStart + keySize, first8 ^ second8, first8, second8, buffer, stations);
             long hash = first8 ^ second8; // todo include other bytes
-            return dedupeStation(lineStart, lineStart + keySize, hash, first8, second8, buffer, stations);
+            return dedupeStation(lineStart, lineStart + keySize, hash, buffer, stations);
         }
 
-        long first8 = buffer.get(LONG_LAYOUT, lineStart);
         if (keySize <= 8) {
             first8 = maskHighBytes(first8, keySize & 0x07);
-            return dedupeStation8(lineStart, lineStart + keySize, first8, first8, buffer, stations);
-        }
-        long second8 = buffer.get(LONG_LAYOUT, lineStart + 8);
-        if (keySize < 16) {
-            second8 = maskHighBytes(second8, keySize & 0x07);
-            return dedupeStation16(lineStart, lineStart + keySize, first8 ^ second8, first8, second8, buffer, stations);
         }
+        long second8 = keySize <= 8 ? 0 : maskHighBytes(buffer.get(LONG_LAYOUT, lineStart + 8), keySize & 0x07);
         long hash = first8 ^ second8; // todo include later bytes
-        return dedupeStation(lineStart, lineStart + keySize, hash, first8, second8, buffer, stations);
+        return dedupeStation(lineStart, lineStart + keySize, hash, buffer, stations);
     }
 
     public static int getDot(long d) {
@@ -261,13 +204,10 @@ public static List<List<Stat>> parseStats(long startByte, long endByte, MemorySe
         // in the inner loop (reducing branches)
         // We need at least the vector lane size bytes back
         if (endByte == buffer.byteSize()) {
-            endByte -= 1; // skip final new line
             // reverse at least vector lane width
-            while (endByte > 0 && buffer.byteSize() - endByte < BYTE_SPECIES.vectorByteSize()) {
+            endByte = Math.max(buffer.byteSize() - BYTE_SPECIES.vectorByteSize(), 0);
+            while (endByte > 0 && buffer.get(JAVA_BYTE, endByte) != '\n')
                 endByte--;
-                while (endByte > 0 && buffer.get(JAVA_BYTE, endByte) != '\n')
-                    endByte--;
-            }
 
             if (endByte > 0)
                 endByte++;
@@ -278,28 +218,33 @@ public static List<List<Stat>> parseStats(long startByte, long endByte, MemorySe
             int index = 0;
             while (endByte + index < buffer.byteSize()) {
                 Stat station = parseStation(index, end, stations);
-                index = (int) processTemperature(index + station.name.length + 1, end, station);
+                index = (int) processTemperature(index + station.namelen + 1, end, station);
             }
         }
 
+        innerloop(startByte, endByte, buffer, stations);
+        return stations;
+    }
+
+    private static void innerloop(long startByte, long endByte, MemorySegment buffer, List<List<Stat>> stations) {
         while (startByte < endByte) {
             Stat station = parseStation(startByte, buffer, stations);
-            startByte = processTemperature(startByte + station.name.length + 1, buffer, station);
+            startByte = processTemperature(startByte + station.namelen + 1, buffer, station);
         }
-        return stations;
     }
 
     public static class Stat {
         final byte[] name;
+        final int namelen;
         int count = 0;
         short min = Short.MAX_VALUE, max = Short.MIN_VALUE;
         long total = 0;
-        final long first8, second8;
 
-        public Stat(byte[] name, long first8, long second8) {
-            this.name = name;
-            this.first8 = first8;
-            this.second8 = second8;
+        public Stat(byte[] name) {
+            int vecSize = BYTE_SPECIES.vectorByteSize();
+            int arrayLen = (name.length + vecSize - 1) / vecSize * vecSize;
+            this.name = Arrays.copyOfRange(name, 0, arrayLen);
+            this.namelen = name.length;
         }
 
         public void add(short value) {
@@ -326,7 +271,7 @@ private static double round(double value) {
         }
 
         public String name() {
-            return new String(name);
+            return new String(Arrays.copyOfRange(name, 0, namelen));
         }
 
         public String toString() {

From 98a8279669d0483b59cc40b8809e654758b5ad54 Mon Sep 17 00:00:00 2001
From: Van Phu DO <abeobk@gmail.com>
Date: Wed, 24 Jan 2024 00:41:25 +0900
Subject: [PATCH 104/268] use thomaswue trick, use parallelism, slightly faster
 (#560)

---
 prepare_abeobk.sh                             |   4 +-
 .../onebrc/CalculateAverage_abeobk.java       | 157 +++++++++++-------
 2 files changed, 97 insertions(+), 64 deletions(-)

diff --git a/prepare_abeobk.sh b/prepare_abeobk.sh
index fac7b87b0..d8ed86a1a 100755
--- a/prepare_abeobk.sh
+++ b/prepare_abeobk.sh
@@ -16,10 +16,10 @@
 #
 
 source "$HOME/.sdkman/bin/sdkman-init.sh"
-sdk use java 21.0.1-graal 1>&2
+sdk use java 21.0.2-graal 1>&2
 
 # ./mvnw clean verify removes target/ and will re-trigger native image creation.
 if [ ! -f target/CalculateAverage_abeobk_image ]; then
-    NATIVE_IMAGE_OPTS="--gc=epsilon -O3 -march=native -R:MaxHeapSize=128m --enable-preview --initialize-at-build-time=dev.morling.onebrc.CalculateAverage_abeobk"
+    NATIVE_IMAGE_OPTS="--gc=epsilon -O3 -march=native -R:MaxHeapSize=128m -H:InlineAllBonus=10 -H:-ParseRuntimeOptions --enable-preview --initialize-at-build-time=dev.morling.onebrc.CalculateAverage_abeobk"
     native-image $NATIVE_IMAGE_OPTS -cp target/average-1.0.0-SNAPSHOT.jar -o target/CalculateAverage_abeobk_image dev.morling.onebrc.CalculateAverage_abeobk
 fi
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java b/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java
index 48d9da687..293a88caf 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java
@@ -24,8 +24,12 @@
 import java.nio.charset.StandardCharsets;
 import java.nio.file.Path;
 import java.nio.file.StandardOpenOption;
+import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.List;
 import java.util.TreeMap;
+import java.util.stream.IntStream;
+
 import sun.misc.Unsafe;
 
 public class CalculateAverage_abeobk {
@@ -66,22 +70,23 @@ static class Node {
         long addr;
         long word0;
         long tail;
-        int keylen;
-        int min, max;
-        int count;
         long sum;
+        int count;
+        short min, max;
+        int keylen;
+        String key;
 
-        String key() {
+        void calcKey() {
             byte[] sbuf = new byte[MAX_STR_LEN];
             UNSAFE.copyMemory(null, addr, sbuf, Unsafe.ARRAY_BYTE_BASE_OFFSET, keylen);
-            return new String(sbuf, 0, keylen, StandardCharsets.UTF_8);
+            key = new String(sbuf, 0, keylen, StandardCharsets.UTF_8);
         }
 
         public String toString() {
             return String.format("%.1f/%.1f/%.1f", min * 0.1, sum * 0.1 / count, max * 0.1);
         }
 
-        Node(long a, long t, int val, int kl) {
+        Node(long a, long t, short val, int kl) {
             addr = a;
             tail = t;
             keylen = kl;
@@ -89,12 +94,16 @@ public String toString() {
             count = 1;
         }
 
-        Node(long a, long t, int val, int kl, long w0) {
-            this(a, t, val, kl);
+        Node(long a, long w0, long t, short val, int kl) {
+            addr = a;
             word0 = w0;
+            tail = t;
+            keylen = kl;
+            sum = min = max = val;
+            count = 1;
         }
 
-        void add(int val) {
+        void add(short val) {
             sum += val;
             count++;
             if (val >= max) {
@@ -107,19 +116,23 @@ void add(int val) {
         }
 
         void merge(Node other) {
-            min = Math.min(min, other.min);
-            max = Math.max(max, other.max);
             sum += other.sum;
             count += other.count;
+            if (other.max > max) {
+                max = other.max;
+            }
+            if (other.min < min) {
+                min = other.min;
+            }
         }
 
-        boolean contentEquals(long other_addr, long other_tail) {
-            if (tail != other_tail)
+        boolean contentEquals(long other_addr, long other_word0, long other_tail) {
+            if (tail != other_tail || word0 != other_word0)
                 return false;
             // this is faster than comparision if key is short
             long xsum = 0;
             int n = keylen & 0xF8;
-            for (int i = 0; i < n; i += 8) {
+            for (int i = 8; i < n; i += 8) {
                 xsum |= (UNSAFE.getLong(addr + i) ^ UNSAFE.getLong(other_addr + i));
             }
             return xsum == 0;
@@ -156,29 +169,27 @@ static final int xxh32(long hash) {
     }
 
     // great idea from merykitty (Quan Anh Mai)
-    static final int parseNum(long num_word, int dot_pos) {
+    static final short parseNum(long num_word, int dot_pos) {
         int shift = 28 - dot_pos;
         long signed = (~num_word << 59) >> 63;
         long dsmask = ~(signed & 0xFF);
         long digits = ((num_word & dsmask) << shift) & 0x0F000F0F00L;
         long abs_val = ((digits * 0x640a0001) >>> 32) & 0x3FF;
-        return (int) ((abs_val ^ signed) - signed);
+        return (short) ((abs_val ^ signed) - signed);
     }
 
     // optimize for contest
     // save as much slow memory access as possible
     // about 50% key < 8chars, 25% key bettween 8-10 chars
     // keylength histogram (%) = [0, 0, 0, 0, 4, 10, 21, 15, 13, 11, 6, 6, 4, 2...
-    static final Node[] parse(int thread_id, long start, long end, int[] cls) {
+    static final Node[] parse(int thread_id, long start, long end) {
+        int cls = 0;
         long addr = start;
         var map = new Node[BUCKET_SIZE + 10000]; // extra space for collisions
         // parse loop
         while (addr < end) {
             long row_addr = addr;
-            long tail = 0;
             long hash = 0;
-            int val = 0;
-            int bucket = 0;
 
             long word0 = UNSAFE.getLong(addr);
             long semipos_code = getSemiPosCode(word0);
@@ -191,9 +202,9 @@ static final Node[] parse(int thread_id, long start, long end, int[] cls) {
                 int dot_pos = Long.numberOfTrailingZeros(~num_word & 0x10101000);
                 addr += (dot_pos >>> 3) + 3;
 
-                tail = (word0 & HASH_MASKS[semi_pos]);
-                bucket = xxh32(tail) & BUCKET_MASK;
-                val = parseNum(num_word, dot_pos);
+                long tail = (word0 & HASH_MASKS[semi_pos]);
+                int bucket = xxh32(tail) & BUCKET_MASK;
+                short val = parseNum(num_word, dot_pos);
 
                 while (true) {
                     var node = map[bucket];
@@ -207,7 +218,7 @@ static final Node[] parse(int thread_id, long start, long end, int[] cls) {
                     }
                     bucket++;
                     if (SHOW_ANALYSIS)
-                        cls[thread_id]++;
+                        cls++;
                 }
                 continue;
             }
@@ -225,15 +236,15 @@ static final Node[] parse(int thread_id, long start, long end, int[] cls) {
                 int dot_pos = Long.numberOfTrailingZeros(~num_word & 0x10101000);
                 addr += (dot_pos >>> 3) + 4;
 
-                tail = (word & HASH_MASKS[semi_pos]);
+                long tail = (word & HASH_MASKS[semi_pos]);
                 hash ^= tail;
-                bucket = xxh32(hash) & BUCKET_MASK;
-                val = parseNum(num_word, dot_pos);
+                int bucket = xxh32(hash) & BUCKET_MASK;
+                short val = parseNum(num_word, dot_pos);
 
                 while (true) {
                     var node = map[bucket];
                     if (node == null) {
-                        map[bucket] = new Node(row_addr, tail, val, keylen, word0);
+                        map[bucket] = new Node(row_addr, word0, tail, val, keylen);
                         break;
                     }
                     if (node.word0 == word0 && node.tail == tail) {
@@ -242,7 +253,7 @@ static final Node[] parse(int thread_id, long start, long end, int[] cls) {
                     }
                     bucket++;
                     if (SHOW_ANALYSIS)
-                        cls[thread_id]++;
+                        cls++;
                 }
                 continue;
             }
@@ -261,30 +272,55 @@ static final Node[] parse(int thread_id, long start, long end, int[] cls) {
             int dot_pos = Long.numberOfTrailingZeros(~num_word & 0x10101000);
             addr += (dot_pos >>> 3) + 4;
 
-            tail = (word & HASH_MASKS[semi_pos]);
+            long tail = (word & HASH_MASKS[semi_pos]);
             hash ^= tail;
-            bucket = xxh32(hash) & BUCKET_MASK;
-            val = parseNum(num_word, dot_pos);
+            int bucket = xxh32(hash) & BUCKET_MASK;
+            short val = parseNum(num_word, dot_pos);
 
             while (true) {
                 var node = map[bucket];
                 if (node == null) {
-                    map[bucket] = new Node(row_addr, tail, val, keylen);
+                    map[bucket] = new Node(row_addr, word0, tail, val, keylen);
                     break;
                 }
-                if (node.contentEquals(row_addr, tail)) {
+                if (node.contentEquals(row_addr, word0, tail)) {
                     node.add(val);
                     break;
                 }
                 bucket++;
                 if (SHOW_ANALYSIS)
-                    cls[thread_id]++;
+                    cls++;
             }
         }
+        if (SHOW_ANALYSIS) {
+            debug("Thread %d collision = %d", thread_id, cls);
+        }
         return map;
     }
 
+    // thomaswue trick
+    private static void spawnWorker() throws IOException {
+        ProcessHandle.Info info = ProcessHandle.current().info();
+        ArrayList<String> workerCommand = new ArrayList<>();
+        info.command().ifPresent(workerCommand::add);
+        info.arguments().ifPresent(args -> workerCommand.addAll(Arrays.asList(args)));
+        workerCommand.add("--worker");
+        new ProcessBuilder()
+                .command(workerCommand)
+                .inheritIO()
+                .redirectOutput(ProcessBuilder.Redirect.PIPE)
+                .start()
+                .getInputStream()
+                .transferTo(System.out);
+    }
+
     public static void main(String[] args) throws InterruptedException, IOException {
+        // thomaswue trick
+        if (args.length == 0 || !("--worker".equals(args[0]))) {
+            spawnWorker();
+            return;
+        }
+
         try (var file = FileChannel.open(Path.of(FILE), StandardOpenOption.READ)) {
             long start_addr = file.map(MapMode.READ_ONLY, 0, file.size(), Arena.global()).address();
             long file_size = file.size();
@@ -295,51 +331,48 @@ public static void main(String[] args) throws InterruptedException, IOException
             long chunk_size = Math.ceilDiv(file_size, cpu_cnt);
 
             // processing
-            var threads = new Thread[cpu_cnt];
-            var maps = new Node[cpu_cnt][];
             var ptrs = slice(start_addr, end_addr, chunk_size, cpu_cnt);
 
-            int[] cls = new int[cpu_cnt]; // collision
+            TreeMap<String, Node> ms = new TreeMap<>();
             int[] lenhist = new int[64]; // length histogram
 
-            for (int i = 0; i < cpu_cnt; i++) {
-                int thread_id = i;
-                (threads[thread_id] = new Thread(() -> {
-                    maps[thread_id] = parse(thread_id, ptrs[thread_id], ptrs[thread_id + 1], cls);
-                })).start();
-            }
-
-            // join all
-            for (var thread : threads)
-                thread.join();
-
-            // collect results
-            TreeMap<String, Node> ms = new TreeMap<>();
-            for (var map : maps) {
-                for (var node : map) {
-                    if (node == null)
-                        continue;
+            List<List<Node>> maps = IntStream.range(0, cpu_cnt)
+                    .mapToObj(thread_id -> parse(thread_id, ptrs[thread_id], ptrs[thread_id + 1]))
+                    .map(map -> {
+                        List<Node> nodes = new ArrayList<>();
+                        for (var node : map) {
+                            if (node == null)
+                                continue;
+                            node.calcKey();
+                            nodes.add(node);
+                        }
+                        return nodes;
+                    })
+                    .parallel()
+                    .toList();
+
+            for (var nodes : maps) {
+                for (var node : nodes) {
                     if (SHOW_ANALYSIS) {
                         int kl = node.keylen & (lenhist.length - 1);
                         lenhist[kl] += node.count;
                     }
-                    var stat = ms.putIfAbsent(node.key(), node);
+                    var stat = ms.putIfAbsent(node.key, node);
                     if (stat != null)
                         stat.merge(node);
                 }
             }
 
             if (SHOW_ANALYSIS) {
-                debug("Collision stat: ");
-                for (int i = 0; i < cpu_cnt; i++) {
-                    debug("thread-" + i + " collision = " + cls[i]);
-                }
                 debug("Total = " + Arrays.stream(lenhist).sum());
                 debug("Length_histogram = "
                         + Arrays.toString(Arrays.stream(lenhist).map(x -> (int) (x * 1.0e-7)).toArray()));
+                return;
             }
-            else
-                System.out.println(ms);
+
+            // print result
+            System.out.println(ms);
+            System.out.close();
         }
     }
 }
\ No newline at end of file

From 95c0fb06b0177fb286fa29a594401b9c2c347433 Mon Sep 17 00:00:00 2001
From: Gunnar Morling <gunnar.morling@googlemail.com>
Date: Tue, 23 Jan 2024 18:16:12 +0100
Subject: [PATCH 105/268] Leaderboard update

---
 README.md | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 3ff82220b..c432fc7dd 100644
--- a/README.md
+++ b/README.md
@@ -43,18 +43,18 @@ These are the results from running all entries into the challenge on eight cores
 |---|-----------------|--------------------|-----|---------------|-----------|
 | 1 | 00:02.195 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_thomaswue.java)| 21.0.1-graal | [Thomas Wuerthinger](https://github.com/thomaswue), [Quan Anh Mai](https://github.com/merykitty), [Alfonso² Peterssen](https://github.com/mukel) | GraalVM native binary, uses Unsafe |
 | 2 | 00:02.248 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java)| 21.0.1-graal | [Artsiom Korzun](https://github.com/artsiomkorzun) | GraalVM native binary, uses Unsafe |
+| 3* | 00:02.305 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java)| 21.0.2-graal | [Van Phu DO](https://github.com/abeobk) | GraalVM native binary, uses Unsafe |
 | 3* | 00:02.313 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_royvanrijn.java)| 21.0.1-graal | [Roy van Rijn](https://github.com/royvanrijn) | GraalVM native binary, uses Unsafe |
-| 3* | 00:02.336 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java)| 21.0.1-graal | [Van Phu DO](https://github.com/abeobk) | GraalVM native binary, uses Unsafe |
 |   | 00:02.575 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_merykittyunsafe.java)| 21.0.1-open | [Quan Anh Mai](https://github.com/merykitty) | uses Unsafe |
 |   | 00:02.909 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jerrinot.java)| 21.0.1-graal | [Jaromir Hamala](https://github.com/jerrinot) | uses Unsafe |
 |   | 00:03.258 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_merykitty.java)| 21.0.1-open | [Quan Anh Mai](https://github.com/merykitty) |  |
 |   | 00:03.376 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_mtopolnik.java)| 21.0.1-graal | [Marko Topolnik](https://github.com/mtopolnik) | uses Unsafe |
 |   | 00:03.714 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_hundredwatt.java)| 21.0.1-graal | [Jason Nochlin](https://github.com/hundredwatt) |  |
 |   | 00:03.718 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_zerninv.java)| 21.0.1-graal | [zerninv](https://github.com/zerninv) | uses Unsafe |
+|   | 00:03.854 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ebarlas.java)| 21.0.1-graal | [Elliot Barlas](https://github.com/ebarlas) | uses Unsafe |
 |   | 00:03.902 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jparera.java)| 21.0.1-open | [Juan Parera](https://github.com/jparera) |  |
 |   | 00:03.959 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_gonix.java)| 21.0.1-open | [gonix](https://github.com/gonix) |  |
 |   | 00:03.966 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jincongho.java)| 21.0.1-open | [Jin Cong Ho](https://github.com/jincongho) | uses Unsafe |
-|   | 00:03.990 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ebarlas.java)| 21.0.1-graal | [Elliot Barlas](https://github.com/ebarlas) | uses Unsafe |
 |   | 00:04.066 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_JesseVanRooy.java)| 21.0.1-open | [JesseVanRooy](https://github.com/JesseVanRooy) | uses Unsafe |
 |   | 00:04.154 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_iziamos.java)| 21.0.1-open | [John Ziamos](https://github.com/iziamos) | uses Unsafe |
 |   | 00:04.551 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_roman_r_m.java)| 21.0.1-graal | [Roman Musin](https://github.com/roman-r-m) | uses Unsafe |
@@ -72,10 +72,10 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:06.166 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_isolgpus.java)| 21.0.1-open | [Jamie Stansfield](https://github.com/isolgpus) |  |
 |   | 00:06.257 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_flippingbits.java)| 21.0.1-graal | [Stefan Sprenger](https://github.com/flippingbits) | uses Unsafe |
 |   | 00:06.415 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_armandino.java)| 21.0.1-open | [Arman Sharif](https://github.com/armandino) | uses Unsafe |
-|   | 00:06.654 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jbachorik.java)| 21.0.1-graal | [Jaroslav Bachorik](https://github.com/jbachorik) |  |
 |   | 00:06.576 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_as-com.java)| 21.0.1-open | [Andrew Sun](https://github.com/as-com) | uses Unsafe |
+|   | 00:06.654 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jbachorik.java)| 21.0.1-graal | [Jaroslav Bachorik](https://github.com/jbachorik) |  |
+|   | 00:06.670 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ianopolousfast.java)| 21.0.1-open | [Dr Ian Preston](https://github.com/ianopolousfast) |  |
 |   | 00:06.715 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_algirdasrascius.java)| 21.0.1-open | [Algirdas Raščius](https://github.com/algirdasrascius) |  |
-|   | 00:06.872 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ianopolousfast.java)| 21.0.1-open | [Dr Ian Preston](https://github.com/ianopolousfast) |  |
 |   | 00:07.240 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_giovannicuccu.java)| java | [giovannicuccu](https://github.com/giovannicuccu) |  |
 |   | 00:07.680 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_C5H12O5.java)| 21.0.1-graal | [Xylitol](https://github.com/C5H12O5) | uses Unsafe |
 |   | 00:07.730 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jotschi.java)| 21.0.1-open | [Johannes Schüth](https://github.com/jotschi) |  |
@@ -115,6 +115,7 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:12.800 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_yonatang.java)| java | [Yonatan Graber](https://github.com/yonatang) |  |
 |   | 00:13.013 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_thanhtrinity.java)| 21.0.1-graal | [Thanh Duong](https://github.com/thanhtrinity) |  |
 |   | 00:13.071 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ianopolous.java)| 21.0.1-open | [Dr Ian Preston](https://github.com/ianopolous) |  |
+|   | 00:13.498 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_tonivade.java)| 21.0.1-tem | [Antonio Muñoz](https://github.com/tonivade) |  |
 |   | 00:13.817 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_entangled90.java)| 21.0.1-open | [Carlo](https://github.com/entangled90) |  |
 |   | 00:14.502 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_eriklumme.java)| 21.0.1-graal | [eriklumme](https://github.com/eriklumme) |  |
 |   | 00:14.772 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_kevinmcmurtrie.java)| 21.0.1-open | [Kevin McMurtrie](https://github.com/kevinmcmurtrie) |  |

From 6cf6da0c17510e9fc5b057530506f83114429b87 Mon Sep 17 00:00:00 2001
From: Rene Schwietzke <r.schwietzke@xceptance.com>
Date: Tue, 23 Jan 2024 18:21:36 +0100
Subject: [PATCH 106/268] Added environment docs (#555)

* Create ENVIRONMENT.md

* More intel added
---
 ENVIRONMENT.md | 92 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 92 insertions(+)
 create mode 100644 ENVIRONMENT.md

diff --git a/ENVIRONMENT.md b/ENVIRONMENT.md
new file mode 100644
index 000000000..5df4e69fe
--- /dev/null
+++ b/ENVIRONMENT.md
@@ -0,0 +1,92 @@
+# Environment
+This file documents the environment in use and the steps that were taken to get it into that state.
+
+## Machine Type
+
+* Hetzner AX161, Dedicated Hosted Hardware
+* CPU: AMD EPYC 7502P 32 cores / 64 threads @ 2.5 GHz
+* Memory: 128 GB ECC DDR4 RAM
+* 2x SAMSUNG MZQL2960HCJR-00A07, 1 TB, Software RAID-1
+* CentOS 9, Linux 5.14.0-378.el9.x86_64
+
+## Configuration
+
+* SMT off
+* Turbo Boost Off
+* Filesystem EXT4 
+
+## Details
+
+### CPU
+``` 
+$ cat /proc/cpuinfo 
+processor	: 0
+vendor_id	: AuthenticAMD
+cpu family	: 23
+model		: 49
+model name	: AMD EPYC 7502P 32-Core Processor
+stepping	: 0
+microcode	: 0x8301055
+cpu MHz		: 2500.000
+cache size	: 512 KB
+physical id	: 0
+siblings	: 32
+core id		: 0
+cpu cores	: 32
+apicid		: 0
+initial apicid	: 0
+fpu		: yes
+fpu_exception	: yes
+cpuid level	: 16
+wp		: yes
+flags		: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es
+bugs		: sysret_ss_attrs spectre_v1 spectre_v2 spec_store_bypass retbleed smt_rsb
+bogomips	: 4990.70
+TLB size	: 3072 4K pages
+clflush size	: 64
+cache_alignment	: 64
+address sizes	: 43 bits physical, 48 bits virtual
+power management: ts ttp tm hwpstate cpb eff_freq_ro [13] [14]
+... more for all other cores
+```
+
+## Setup
+
+### Turn SMT off
+Disable SMT during boot via a kernel boot parameter; it can be switched back on later if needed.
+
+Add `nosmt` to the GRUB boot configuration in `/etc/default/grub`:
+
+```
+# Added nosmt to command line
+GRUB_CMDLINE_LINUX="biosdevname=0 crashkernel=auto rd.auto=1 consoleblank=0 nosmt"
+```
+
+Update boot config:
+``` 
+sudo grub2-mkconfig -o /boot/grub2/grub.cfg
+```
+
+### Turbo Off
+Use the legacy `/etc/rc.local` mechanism to apply the following settings during boot:
+
+```
+# Turn SMT off via software as well, already got nosmt in grub
+echo off >  /sys/devices/system/cpu/smt/control
+
+# Turn off turbo boost
+echo 0 |tee /sys/devices/system/cpu/cpufreq/boost
+```
+### Reduce Swapping
+Lower `vm.swappiness` from its default of 60 to 10 by adding `vm.swappiness = 10` to `/etc/sysctl.conf`, so that the kernel swaps less eagerly.
+
+## Verify
+After boot, check that all settings have been applied. The same commands can also be used to verify the settings at any time during runtime.
+
+* SMT off: `cat /sys/devices/system/cpu/smt/active` must be 0
+* SWAP: `cat /proc/sys/vm/swappiness` must be 10
+* Turbo off: `cat /sys/devices/system/cpu/cpufreq/boost` must be 0
+
+
+
+

From 2be84e09ee61aa074084779a25d69c69c8ab6aef Mon Sep 17 00:00:00 2001
From: Marek Kohn <me@makohn.de>
Date: Tue, 23 Jan 2024 18:26:53 +0100
Subject: [PATCH 107/268] Add 1brc solution by @makohn (#544)

---
 calculate_average_makohn.sh                   |  19 ++
 .../onebrc/CalculateAverage_makohn.java       | 287 ++++++++++++++++++
 2 files changed, 306 insertions(+)
 create mode 100755 calculate_average_makohn.sh
 create mode 100644 src/main/java/dev/morling/onebrc/CalculateAverage_makohn.java

diff --git a/calculate_average_makohn.sh b/calculate_average_makohn.sh
new file mode 100755
index 000000000..092bae1c5
--- /dev/null
+++ b/calculate_average_makohn.sh
@@ -0,0 +1,19 @@
+#!/bin/sh
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+JAVA_OPTS="--enable-preview"
+java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_makohn
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_makohn.java b/src/main/java/dev/morling/onebrc/CalculateAverage_makohn.java
new file mode 100644
index 000000000..7b1a08057
--- /dev/null
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_makohn.java
@@ -0,0 +1,287 @@
+/*
+ *  Copyright 2023 The original authors
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package dev.morling.onebrc;
+
+import java.lang.foreign.Arena;
+import java.lang.foreign.MemorySegment;
+import java.lang.foreign.ValueLayout;
+import java.nio.ByteBuffer;
+import java.nio.channels.FileChannel;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Paths;
+import java.nio.file.StandardOpenOption;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.stream.Collectors;
+
+//
+// This implementation is partially inspired by
+//
+// - GavinRay97: 1BRC in Kotlin (memory mapping, chunking) | https://github.com/gunnarmorling/1brc/discussions/154
+// - dannyvankooten: 1BRC in C (integer parsing, linear probing) | https://github.com/gunnarmorling/1brc/discussions/46
+//
+public class CalculateAverage_makohn {
+
+    private static final String FILE = "./measurements.txt";
+
+    private static class Measurement implements Comparable<Measurement> { // per-station aggregate, ordered by city name
+        final String city;
+        int min; // min, max and sum are kept in tenths of a degree, as produced by toInt
+        int max;
+        int count = 1;
+        long sum; // long, not int: the running total over very many readings can exceed the 32-bit range
+
+        Measurement(String city, int val) {
+            this.city = city;
+            this.min = val;
+            this.max = val;
+            this.sum = val;
+        }
+
+        @Override
+        public String toString() {
+            return STR."\{city}=\{round(min)}/\{round((1.0 * sum) / count)}/\{round(max)}";
+        }
+
+        private double round(double value) { // value is in tenths: round to a whole tenth, then scale to degrees
+            return Math.round(value) / 10.0;
+        }
+
+        @Override
+        public int compareTo(Measurement other) {
+            return this.city.compareTo(other.city);
+        }
+    }
+
+    // Convert a given byte array of temperature data to an int value
+    // Since the temperature values only have one decimal, we can use integer arithmetic until the end
+    //
+    // buffer: [..., '-', '1', '9', '.', '7', ...]
+    // -------------> offset
+    // ............ = s
+    //
+    // We initialize a "pointer" s with the offset. Depending on whether the first char is a '-' or not, we set the
+    // sign and increment the pointer.
+    //
+    // Then we only have to distinguish between one-digit and two-digit numbers.
+    // Depending on that, we set an index for the respective parts of the number.
+    //
+    private static int toInt(byte[] in, int offset) {
+        int sign = 1;
+        int s = offset;
+        if (in[s] == '-') {
+            sign = -1;
+            s++;
+        }
+
+        if (in[s + 1] == '.')
+            return sign * ((in[s] - '0') * 10 + (in[s + 2] - '0'));
+
+        return sign * ((in[s] - '0') * 100 + (in[s + 1] - '0') * 10 + (in[s + 3] - '0'));
+    }
+
+    // 10_000 distinct station names as per specification
+    // We round up to a power of two to allow for bit-masking our hash (instead of using modulo); note that 2 << 14 is 2^15 = 32768
+    private static final int MAX_STATIONS = 2 << 14;
+
+    // Twice as big as the maximum number of stations
+    private static final int MAP_CAPACITY = MAX_STATIONS * 2;
+
+    // We start at 1 to allow for checking our hash-index map for > 0
+    private static final int RES_FIRST_INDEX = 1;
+
+    private static class ResultMap {
+        final int[] map = new int[MAP_CAPACITY]; // hash -> index
+        final Measurement[] measurements = new Measurement[MAX_STATIONS]; // index -> measurement
+        private int lastIndex = 0;
+
+        private void put(int hash, Measurement measurement) {
+            lastIndex++;
+            measurements[lastIndex] = measurement;
+            map[hash] = lastIndex;
+        }
+
+        private boolean contains(int hash) {
+            return map[hash] > 0;
+        }
+
+        private Measurement get(int hash) {
+            return measurements[map[hash]];
+        }
+    }
+
+    // We use linear probing as our hash-collision strategy
+    //
+    // We use MAP_CAPACITY - 1 as a bitmask to force the hash to be lower than our capacity
+    // Let's consider a hash 16390. If our capacity is 2^14 = 16384, the hash is out of bounds.
+    //
+    // 16390 : 100000000000110
+    // 16383 : 011111111111111
+    // ....... 000000000000110 = 6
+    private static int linearProbe(ResultMap res, String key) {
+        var hash = key.hashCode() & (MAP_CAPACITY - 1);
+        while (res.map[hash] > 0 && !(res.measurements[res.map[hash]].city.equals(key))) {
+            hash = (hash + 1) & (MAP_CAPACITY - 1);
+        }
+        return hash;
+    }
+
+    // Custom Quicksort implementation, seems to be slightly faster than Arrays.sort
+    private static void quickSort(Measurement[] arr, int begin, int end) {
+        if (begin < end) {
+            final var partitionIndex = partition(arr, begin, end);
+
+            quickSort(arr, begin, partitionIndex - 1);
+            quickSort(arr, partitionIndex + 1, end);
+        }
+    }
+
+    private static int partition(Measurement[] arr, int begin, int end) {
+        final var pivot = arr[end];
+        int i = (begin - 1);
+
+        for (int j = begin; j < end; j++) {
+            if (arr[j].compareTo(pivot) <= 0) {
+                i++;
+                final var tmp = arr[i];
+                arr[i] = arr[j];
+                arr[j] = tmp;
+            }
+        }
+
+        final var tmp = arr[i + 1];
+        arr[i + 1] = arr[end];
+        arr[end] = tmp;
+
+        return i + 1;
+    }
+
+    private static Collection<ByteBuffer> getChunks(MemorySegment memory, long chunkSize, long fileSize) {
+        final var chunks = new ArrayList<ByteBuffer>();
+        var chunkStart = 0L;
+        var chunkEnd = 0L;
+        while (chunkStart < fileSize) {
+            chunkEnd = Math.min((chunkStart + chunkSize), fileSize);
+            // starting from the calculated chunkEnd, seek the next newline to get the real chunkEnd
+            while (chunkEnd < fileSize && (memory.getAtIndex(ValueLayout.JAVA_BYTE, chunkEnd) & 0xFF) != '\n')
+                chunkEnd++;
+            // we have found our chunk boundaries, add a slice of memory with these boundaries to our list of chunks
+            if (chunkEnd < fileSize)
+                chunks.add(memory.asSlice(chunkStart, chunkEnd - chunkStart + 1).asByteBuffer());
+            else
+                // special case: we are at the end of the file
+                chunks.add(memory.asSlice(chunkStart, chunkEnd - chunkStart).asByteBuffer());
+
+            // next chunk
+            chunkStart = chunkEnd + 1;
+        }
+        return chunks;
+    }
+
+    // Station name: <= 100 bytes
+    // Temperature: <= 5 bytes
+    //
+    // Semicolon and new line are ignored
+    private static final int MAX_BYTES_PER_ROW = 105;
+
+    private static ResultMap processChunk(ByteBuffer chunk) {
+        final var map = new ResultMap();
+        final var buffer = new byte[MAX_BYTES_PER_ROW];
+        var i = 0;
+        var delimiter = 0;
+        // Process the chunk byte by byte and store each line in buffer
+        while (chunk.hasRemaining()) {
+            final var c = chunk.get();
+            // System.out.println((char) (c & 0xFF));
+            switch (c & 0xFF) {
+                // Memorize the position of the semicolon, such that we can divide the buffer afterward
+                case ';' -> delimiter = i;
+                // If we encounter newline, we can do the actual calculations for the current line
+                case '\n' -> {
+                    final var key = new String(buffer, 0, delimiter, StandardCharsets.UTF_8);
+                    final var value = toInt(buffer, delimiter);
+                    final var hash = linearProbe(map, key);
+                    if (map.contains(hash)) {
+                        final var current = map.get(hash);
+                        current.min = Math.min(current.min, value);
+                        current.max = Math.max(current.max, value);
+                        current.count++;
+                        current.sum += value;
+                    }
+                    else {
+                        map.put(hash, new Measurement(key, value));
+                    }
+                    i = 0;
+                    delimiter = 0;
+                }
+                default -> {
+                    buffer[i] = c;
+                    i++;
+                }
+            }
+        }
+        return map;
+    }
+
+    // File size is approximately 13 GB, ByteBuffer has a 2 GB limit
+    // Chunks should have a maximum size of approximately 13 GB / 8 = 1.625 GB
+    private static final int MIN_NUMBER_THREADS = 8;
+
+    public static void main(String[] args) throws Exception {
+        final var numProcessors = Math.max(Runtime.getRuntime().availableProcessors(), MIN_NUMBER_THREADS);
+        // memory-map the input file
+        try (final var channel = FileChannel.open(Paths.get(FILE), StandardOpenOption.READ)) {
+            final var fileSize = channel.size();
+            final var chunkSize = (fileSize / numProcessors);
+            final var mappedMemory = channel.map(FileChannel.MapMode.READ_ONLY, 0, fileSize, Arena.global());
+            // process the mapped data concurrently in chunks. Each chunk is processed on a dedicated thread
+            final var chunks = getChunks(mappedMemory, chunkSize, fileSize);
+            final var processed = chunks
+                    .parallelStream()
+                    .map(CalculateAverage_makohn::processChunk)
+                    .collect(Collectors.toList()); // materialize and thus synchronize
+            // merge the results, we can initialize with the first result, to avoid redundant probing
+            final var first = processed.removeFirst();
+            final var res = processed
+                    .stream()
+                    .reduce(first, (acc, partial) -> {
+                        for (int i = RES_FIRST_INDEX; i <= partial.lastIndex; i++) {
+                            final var value = partial.measurements[i];
+                            final var hash = linearProbe(acc, value.city);
+                            if (acc.contains(hash)) {
+                                final var cur = acc.get(hash);
+                                cur.min = Math.min(cur.min, value.min);
+                                cur.max = Math.max(cur.max, value.max);
+                                cur.count += value.count;
+                                cur.sum += value.sum;
+                            }
+                            else {
+                                acc.put(hash, value);
+                            }
+                        }
+                        return acc;
+                    });
+
+            quickSort(res.measurements, RES_FIRST_INDEX, res.lastIndex);
+            final var sb = new StringBuilder("{");
+            for (int i = RES_FIRST_INDEX; i < res.lastIndex; i++) {
+                sb.append(res.measurements[i]).append(',').append(' ');
+            }
+            sb.append(res.measurements[res.lastIndex]).append('}');
+            System.out.println(sb);
+        }
+    }
+}

From 2c432abb964db2a2556a06ff00e12221e4f58995 Mon Sep 17 00:00:00 2001
From: Jaromir Hamala <jaromir.hamala@gmail.com>
Date: Tue, 23 Jan 2024 18:29:22 +0100
Subject: [PATCH 108/268] jerrinot's improvement - fast-path for short keys
 (#545)

* fast-path for keys<16 bytes

* fix off by one error

the mask is wrong for the 2nd word when len == 16

* less chunks per thread

seems like compact code wins. on my test box anyway.
---
 .../onebrc/CalculateAverage_jerrinot.java     | 433 +++++++++++-------
 1 file changed, 273 insertions(+), 160 deletions(-)

diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_jerrinot.java b/src/main/java/dev/morling/onebrc/CalculateAverage_jerrinot.java
index 13e48ae05..2492c0fbd 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_jerrinot.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_jerrinot.java
@@ -38,7 +38,7 @@ public class CalculateAverage_jerrinot {
     // todo: with hyper-threading enable we would be better of with availableProcessors / 2;
     // todo: validate the testing env. params.
     private static final int THREAD_COUNT = Runtime.getRuntime().availableProcessors();
-    // private static final int THREAD_COUNT = 1;
+    // private static final int THREAD_COUNT = 4;
 
     private static final long SEPARATOR_PATTERN = 0x3B3B3B3B3B3B3B3BL;
 
@@ -61,7 +61,7 @@ static void calculate() throws Exception {
         final File file = new File(MEASUREMENTS_TXT);
         final long length = file.length();
         // final int chunkCount = Runtime.getRuntime().availableProcessors();
-        int chunkPerThread = 4;
+        int chunkPerThread = 3;
         final int chunkCount = THREAD_COUNT * chunkPerThread;
         final var chunkStartOffsets = new long[chunkCount + 1];
         try (var raf = new RandomAccessFile(file, "r")) {
@@ -88,10 +88,8 @@ static void calculate() throws Exception {
                 long endB = chunkStartOffsets[i * chunkPerThread + 2];
                 long startC = chunkStartOffsets[i * chunkPerThread + 2];
                 long endC = chunkStartOffsets[i * chunkPerThread + 3];
-                long startD = chunkStartOffsets[i * chunkPerThread + 3];
-                long endD = chunkStartOffsets[i * chunkPerThread + 4];
 
-                Processor processor = new Processor(startA, endA, startB, endB, startC, endC, startD, endD);
+                Processor processor = new Processor(startA, endA, startB, endB, startC, endC);
                 processors[i] = processor;
                 Thread thread = new Thread(processor);
                 threads[i] = thread;
@@ -105,9 +103,7 @@ static void calculate() throws Exception {
             long endB = chunkStartOffsets[ownIndex * chunkPerThread + 2];
             long startC = chunkStartOffsets[ownIndex * chunkPerThread + 2];
             long endC = chunkStartOffsets[ownIndex * chunkPerThread + 3];
-            long startD = chunkStartOffsets[ownIndex * chunkPerThread + 3];
-            long endD = chunkStartOffsets[ownIndex * chunkPerThread + 4];
-            Processor processor = new Processor(startA, endA, startB, endB, startC, endC, startD, endD);
+            Processor processor = new Processor(startA, endA, startB, endB, startC, endC);
             processor.run();
 
             var accumulator = new TreeMap<String, Processor.StationStats>();
@@ -119,27 +115,31 @@ static void calculate() throws Exception {
                 processors[i].accumulateStatus(accumulator);
             }
 
-            var sb = new StringBuilder();
-            boolean first = true;
-            for (Map.Entry<String, Processor.StationStats> statsEntry : accumulator.entrySet()) {
-                if (first) {
-                    sb.append("{");
-                    first = false;
-                }
-                else {
-                    sb.append(", ");
-                }
-                var value = statsEntry.getValue();
-                var name = statsEntry.getKey();
-                int min = value.min;
-                int max = value.max;
-                int count = value.count;
-                long sum2 = value.sum;
-                sb.append(String.format("%s=%.1f/%.1f/%.1f", name, min / 10.0, Math.round((double) sum2 / count) / 10.0, max / 10.0));
+            printResults(accumulator);
+        }
+    }
+
+    private static void printResults(TreeMap<String, Processor.StationStats> accumulator) {
+        var sb = new StringBuilder(10000);
+        boolean first = true;
+        for (Map.Entry<String, Processor.StationStats> statsEntry : accumulator.entrySet()) {
+            if (first) {
+                sb.append("{");
+                first = false;
             }
-            System.out.print(sb);
-            System.out.println('}');
+            else {
+                sb.append(", ");
+            }
+            var value = statsEntry.getValue();
+            var name = statsEntry.getKey();
+            int min = value.min;
+            int max = value.max;
+            int count = value.count;
+            long sum2 = value.sum;
+            sb.append(String.format("%s=%.1f/%.1f/%.1f", name, min / 10.0, Math.round((double) sum2 / count) / 10.0, max / 10.0));
         }
+        sb.append('}');
+        System.out.println(sb);
     }
 
     public static int ceilPow2(int i) {
@@ -154,51 +154,65 @@ public static int ceilPow2(int i) {
 
     private static class Processor implements Runnable {
         private static final int MAX_UNIQUE_KEYS = 10000;
-        private static final int MAP_SLOT_COUNT = ceilPow2(MAX_UNIQUE_KEYS);
+        private static final int MAPS_SLOT_COUNT = ceilPow2(MAX_UNIQUE_KEYS);
         private static final int STATION_MAX_NAME_BYTES = 104;
 
-        private static final long COUNT_OFFSET = 0;
-        private static final long MIN_OFFSET = 4;
-        private static final long MAX_OFFSET = 8;
-        private static final long SUM_OFFSET = 12;
-        private static final long LEN_OFFSET = 20;
-        private static final long NAME_OFFSET = 24;
+        private static final long MAP_COUNT_OFFSET = 0;
+        private static final long MAP_MIN_OFFSET = 4;
+        private static final long MAP_MAX_OFFSET = 8;
+        private static final long MAP_SUM_OFFSET = 12;
+        private static final long MAP_LEN_OFFSET = 20;
+        private static final long SLOW_MAP_NAME_OFFSET = 24;
+
+        // private int longestChain = 0;
 
-        private static final int MAP_ENTRY_SIZE_BYTES = Integer.BYTES // count // 0
+        private static final int SLOW_MAP_ENTRY_SIZE_BYTES = Integer.BYTES // count // 0
                 + Integer.BYTES // min // +4
                 + Integer.BYTES // max // +8
                 + Long.BYTES // sum // +12
                 + Integer.BYTES // station name len // +20
                 + Long.BYTES; // station name ptr // 24
 
-        private static final int MAP_SIZE_BYTES = MAP_SLOT_COUNT * MAP_ENTRY_SIZE_BYTES;
-        private static final int MAP_NAMES_BYTES = MAX_UNIQUE_KEYS * STATION_MAX_NAME_BYTES;
-        private static final long MAP_MASK = MAP_SLOT_COUNT - 1;
+        private static final long FAST_MAP_NAME_PART1 = 24;
+        private static final long FAST_MAP_NAME_PART2 = 32;
 
-        private final long map;
-        private long currentNamesPtr;
-        private final long namesHi;
+        private static final int FAST_MAP_ENTRY_SIZE_BYTES = Integer.BYTES // count // 0
+                + Integer.BYTES // min // +4
+                + Integer.BYTES // max // +8
+                + Long.BYTES // sum // +12
+                + Integer.BYTES // station name len // +20
+                + Long.BYTES // station name part 1 // 24
+                + Long.BYTES; // station name part 2 // 32
+
+        private static final int SLOW_MAP_SIZE_BYTES = MAPS_SLOT_COUNT * SLOW_MAP_ENTRY_SIZE_BYTES;
+        private static final int FAST_MAP_SIZE_BYTES = MAPS_SLOT_COUNT * FAST_MAP_ENTRY_SIZE_BYTES;
+        private static final int SLOW_MAP_MAP_NAMES_BYTES = MAX_UNIQUE_KEYS * STATION_MAX_NAME_BYTES;
+        private static final long MAP_MASK = MAPS_SLOT_COUNT - 1;
+
+        private long slowMap;
+        private long slowMapNamesPtr;
+        private long slowMapNamesLo;
+        private long fastMap;
         private long cursorA;
         private long endA;
         private long cursorB;
         private long endB;
         private long cursorC;
         private long endC;
-        private long cursorD;
-        private long endD;
+        private HashMap<String, StationStats> stats = new HashMap<>(1000);
 
         // private long maxClusterLen;
 
         // credit: merykitty
         private long parseAndStoreTemperature(long startCursor, long baseEntryPtr, long word) {
             // long word = UNSAFE.getLong(startCursor);
-            long countPtr = baseEntryPtr + COUNT_OFFSET;
+            long countPtr = baseEntryPtr + MAP_COUNT_OFFSET;
             int cnt = UNSAFE.getInt(countPtr);
             UNSAFE.putInt(countPtr, cnt + 1);
 
-            long minPtr = baseEntryPtr + MIN_OFFSET;
-            long maxPtr = baseEntryPtr + MAX_OFFSET;
-            long sumPtr = baseEntryPtr + SUM_OFFSET;
+            long minPtr = baseEntryPtr + MAP_MIN_OFFSET;
+            long maxPtr = baseEntryPtr + MAP_MAX_OFFSET;
+            long sumPtr = baseEntryPtr + MAP_SUM_OFFSET;
 
             int min = UNSAFE.getInt(minPtr);
             int max = UNSAFE.getInt(maxPtr);
@@ -232,69 +246,90 @@ private static long getDelimiterMask(final long word) {
         // todo: immutability cost us in allocations, but that's probably peanuts in the grand scheme of things. still worth checking
         // maybe JVM trusting Final in Records offsets it ..a test is needed
         record StationStats(int min, int max, int count, long sum) {
+            StationStats mergeWith(StationStats other) {
+                return new StationStats(Math.min(min, other.min), Math.max(max, other.max), count + other.count, sum + other.sum);
+            }
         }
 
         void accumulateStatus(TreeMap<String, StationStats> accumulator) {
-            for (long baseAddress = map; baseAddress < map + MAP_SIZE_BYTES; baseAddress += MAP_ENTRY_SIZE_BYTES) {
-                long len = UNSAFE.getInt(baseAddress + LEN_OFFSET);
-                if (len == 0) {
-                    continue;
-                }
-                byte[] nameArr = new byte[(int) len];
-                long baseNameAddr = UNSAFE.getLong(baseAddress + NAME_OFFSET);
-                for (int i = 0; i < len; i++) {
-                    nameArr[i] = UNSAFE.getByte(baseNameAddr + i);
-                }
-                String name = new String(nameArr);
-                int min = UNSAFE.getInt(baseAddress + MIN_OFFSET);
-                int max = UNSAFE.getInt(baseAddress + MAX_OFFSET);
-                int count = UNSAFE.getInt(baseAddress + COUNT_OFFSET);
-                long sum = UNSAFE.getLong(baseAddress + SUM_OFFSET);
+            for (Map.Entry<String, StationStats> entry : stats.entrySet()) {
+                String name = entry.getKey();
+                StationStats localStats = entry.getValue();
 
-                var v = accumulator.get(name);
-                if (v == null) {
-                    accumulator.put(name, new StationStats(min, max, count, sum));
+                StationStats globalStats = accumulator.get(name);
+                if (globalStats == null) {
+                    accumulator.put(name, localStats);
                 }
                 else {
-                    accumulator.put(name, new StationStats(Math.min(v.min, min), Math.max(v.max, max), v.count + count, v.sum + sum));
+                    accumulator.put(name, globalStats.mergeWith(localStats));
                 }
             }
         }
 
-        Processor(long startA, long endA, long startB, long endB, long startC, long endC, long startD, long endD) {
+        Processor(long startA, long endA, long startB, long endB, long startC, long endC) {
             this.cursorA = startA;
             this.cursorB = startB;
             this.cursorC = startC;
-            this.cursorD = startD;
             this.endA = endA;
             this.endB = endB;
             this.endC = endC;
-            this.endD = endD;
-            this.map = UNSAFE.allocateMemory(MAP_SIZE_BYTES);
-            this.currentNamesPtr = UNSAFE.allocateMemory(MAP_NAMES_BYTES);
-            this.namesHi = currentNamesPtr + MAP_NAMES_BYTES;
-
-            int i;
-            for (i = 0; i < MAP_SIZE_BYTES; i += 8) {
-                UNSAFE.putLong(map + i, 0);
-            }
-            for (i = i - 8; i < MAP_SIZE_BYTES; i++) {
-                UNSAFE.putByte(map + i, (byte) 0);
-            }
-            UNSAFE.setMemory(currentNamesPtr, MAP_NAMES_BYTES, (byte) 0);
         }
 
         private void doTail() {
-            // todo: we would be probably better of without all that code dup. ("compilers hates him!")
-            // System.out.println("done ILP");
             doOne(cursorA, endA);
-            // System.out.println("done A");
             doOne(cursorB, endB);
-            // System.out.println("done B");
             doOne(cursorC, endC);
-            // System.out.println("done C");
-            doOne(cursorD, endD);
-            // System.out.println("done D");
+
+            transferToHeap();
+            UNSAFE.freeMemory(fastMap);
+            UNSAFE.freeMemory(slowMap);
+            UNSAFE.freeMemory(slowMapNamesLo);
+        }
+
+        private void transferToHeap() {
+            for (long baseAddress = slowMap; baseAddress < slowMap + SLOW_MAP_SIZE_BYTES; baseAddress += SLOW_MAP_ENTRY_SIZE_BYTES) {
+                long len = UNSAFE.getInt(baseAddress + MAP_LEN_OFFSET);
+                if (len == 0) {
+                    continue;
+                }
+                byte[] nameArr = new byte[(int) len];
+                long baseNameAddr = UNSAFE.getLong(baseAddress + SLOW_MAP_NAME_OFFSET);
+                for (int i = 0; i < len; i++) {
+                    nameArr[i] = UNSAFE.getByte(baseNameAddr + i);
+                }
+                String name = new String(nameArr);
+                int min = UNSAFE.getInt(baseAddress + MAP_MIN_OFFSET);
+                int max = UNSAFE.getInt(baseAddress + MAP_MAX_OFFSET);
+                int count = UNSAFE.getInt(baseAddress + MAP_COUNT_OFFSET);
+                long sum = UNSAFE.getLong(baseAddress + MAP_SUM_OFFSET);
+
+                stats.put(name, new StationStats(min, max, count, sum));
+            }
+
+            for (long baseAddress = fastMap; baseAddress < fastMap + FAST_MAP_SIZE_BYTES; baseAddress += FAST_MAP_ENTRY_SIZE_BYTES) {
+                long len = UNSAFE.getInt(baseAddress + MAP_LEN_OFFSET);
+                if (len == 0) {
+                    continue;
+                }
+                byte[] nameArr = new byte[(int) len];
+                long baseNameAddr = baseAddress + FAST_MAP_NAME_PART1;
+                for (int i = 0; i < len; i++) {
+                    nameArr[i] = UNSAFE.getByte(baseNameAddr + i);
+                }
+                String name = new String(nameArr);
+                int min = UNSAFE.getInt(baseAddress + MAP_MIN_OFFSET);
+                int max = UNSAFE.getInt(baseAddress + MAP_MAX_OFFSET);
+                int count = UNSAFE.getInt(baseAddress + MAP_COUNT_OFFSET);
+                long sum = UNSAFE.getLong(baseAddress + MAP_SUM_OFFSET);
+
+                var v = stats.get(name);
+                if (v == null) {
+                    stats.put(name, new StationStats(min, max, count, sum));
+                }
+                else {
+                    stats.put(name, new StationStats(Math.min(v.min, min), Math.max(v.max, max), v.count + count, v.sum + sum));
+                }
+            }
         }
 
         private void doOne(long cursor, long endA) {
@@ -302,7 +337,12 @@ private void doOne(long cursor, long endA) {
                 long start = cursor;
                 long currentWord = UNSAFE.getLong(cursor);
                 long mask = getDelimiterMask(currentWord);
-                long maskedFirstWord = currentWord & ((mask - 1) ^ mask) >>> 8;
+                long firstWordMask = ((mask - 1) ^ mask) >>> 8;
+                final long isMaskZeroA = ((mask | -mask) >>> 63) ^ 1;
+                long ext = -isMaskZeroA & 0xFF00_0000_0000_0000L;
+                firstWordMask |= ext;
+
+                long maskedFirstWord = currentWord & firstWordMask;
                 long hash = hash(maskedFirstWord);
                 while (mask == 0) {
                     cursor += 8;
@@ -312,7 +352,9 @@ private void doOne(long cursor, long endA) {
                 final int delimiterByte = Long.numberOfTrailingZeros(mask);
                 final long semicolon = cursor + (delimiterByte >> 3);
                 final long maskedWord = currentWord & ((mask - 1) ^ mask) >>> 8;
-                long baseEntryPtr = getOrCreateEntryBaseOffset(semicolon, start, (int) hash, maskedWord);
+
+                long len = semicolon - start;
+                long baseEntryPtr = getOrCreateEntryBaseOffsetSlow(len, start, (int) hash, maskedWord);
                 long temperatureWord = UNSAFE.getLong(semicolon + 1);
                 cursor = parseAndStoreTemperature(semicolon + 1, baseEntryPtr, temperatureWord);
             }
@@ -331,133 +373,204 @@ private static long hash(long word1) {
 
         @Override
         public void run() {
-            while (cursorA < endA && cursorB < endB && cursorC < endC && cursorD < endD) {
+            this.slowMap = UNSAFE.allocateMemory(SLOW_MAP_SIZE_BYTES);
+            this.slowMapNamesPtr = UNSAFE.allocateMemory(SLOW_MAP_MAP_NAMES_BYTES);
+            this.slowMapNamesLo = slowMapNamesPtr;
+            this.fastMap = UNSAFE.allocateMemory(FAST_MAP_SIZE_BYTES);
+            UNSAFE.setMemory(slowMap, SLOW_MAP_SIZE_BYTES, (byte) 0);
+            UNSAFE.setMemory(fastMap, FAST_MAP_SIZE_BYTES, (byte) 0);
+            UNSAFE.setMemory(slowMapNamesPtr, SLOW_MAP_MAP_NAMES_BYTES, (byte) 0);
+
+            while (cursorA < endA && cursorB < endB && cursorC < endC) {
                 long startA = cursorA;
                 long startB = cursorB;
                 long startC = cursorC;
-                long startD = cursorD;
 
                 long currentWordA = UNSAFE.getLong(startA);
                 long currentWordB = UNSAFE.getLong(startB);
                 long currentWordC = UNSAFE.getLong(startC);
-                long currentWordD = UNSAFE.getLong(startD);
 
-                // credits for the hashing idea: mtopolnik
                 long maskA = getDelimiterMask(currentWordA);
-                long maskedFirstWordA = currentWordA & ((maskA - 1) ^ maskA) >>> 8;
+                long maskB = getDelimiterMask(currentWordB);
+                long maskC = getDelimiterMask(currentWordC);
+
+                long firstWordMaskA = (maskA ^ (maskA - 1)) >>> 8;
+                long firstWordMaskB = (maskB ^ (maskB - 1)) >>> 8;
+                long firstWordMaskC = (maskC ^ (maskC - 1)) >>> 8;
+
+                final long isMaskZeroA = ((maskA | -maskA) >>> 63) ^ 1;
+                final long isMaskZeroB = ((maskB | -maskB) >>> 63) ^ 1;
+                final long isMaskZeroC = ((maskC | -maskC) >>> 63) ^ 1;
+
+                long extA = -isMaskZeroA & 0xFF00_0000_0000_0000L;
+                long extB = -isMaskZeroB & 0xFF00_0000_0000_0000L;
+                long extC = -isMaskZeroC & 0xFF00_0000_0000_0000L;
+
+                firstWordMaskA |= extA;
+                firstWordMaskB |= extB;
+                firstWordMaskC |= extC;
+
+                long maskedFirstWordA = currentWordA & firstWordMaskA;
+                long maskedFirstWordB = currentWordB & firstWordMaskB;
+                long maskedFirstWordC = currentWordC & firstWordMaskC;
+
+                // assertMasks(isMaskZeroA, maskA);
+
                 long hashA = hash(maskedFirstWordA);
+                long hashB = hash(maskedFirstWordB);
+                long hashC = hash(maskedFirstWordC);
+
+                cursorA += isMaskZeroA * 8;
+                cursorB += isMaskZeroB * 8;
+                cursorC += isMaskZeroC * 8;
+
+                currentWordA = UNSAFE.getLong(cursorA);
+                currentWordB = UNSAFE.getLong(cursorB);
+                currentWordC = UNSAFE.getLong(cursorC);
+
+                maskA = getDelimiterMask(currentWordA);
                 while (maskA == 0) {
                     cursorA += 8;
                     currentWordA = UNSAFE.getLong(cursorA);
                     maskA = getDelimiterMask(currentWordA);
                 }
-                final int delimiterByteA = Long.numberOfTrailingZeros(maskA);
-                final long semicolonA = cursorA + (delimiterByteA >> 3);
-                long temperatureWordA = UNSAFE.getLong(semicolonA + 1);
-                final long maskedWordA = currentWordA & ((maskA - 1) ^ maskA) >>> 8;
-
-                long maskB = getDelimiterMask(currentWordB);
-                long maskedFirstWordB = currentWordB & ((maskB - 1) ^ maskB) >>> 8;
-                long hashB = hash(maskedFirstWordB);
+                maskB = getDelimiterMask(currentWordB);
                 while (maskB == 0) {
                     cursorB += 8;
                     currentWordB = UNSAFE.getLong(cursorB);
                     maskB = getDelimiterMask(currentWordB);
                 }
-                final int delimiterByteB = Long.numberOfTrailingZeros(maskB);
-                final long semicolonB = cursorB + (delimiterByteB >> 3);
-                long temperatureWordB = UNSAFE.getLong(semicolonB + 1);
-                final long maskedWordB = currentWordB & ((maskB - 1) ^ maskB) >>> 8;
-
-                long maskC = getDelimiterMask(currentWordC);
-                long maskedFirstWordC = currentWordC & ((maskC - 1) ^ maskC) >>> 8;
-                long hashC = hash(maskedFirstWordC);
+                maskC = getDelimiterMask(currentWordC);
                 while (maskC == 0) {
                     cursorC += 8;
                     currentWordC = UNSAFE.getLong(cursorC);
                     maskC = getDelimiterMask(currentWordC);
                 }
+
+                final int delimiterByteA = Long.numberOfTrailingZeros(maskA);
+                final int delimiterByteB = Long.numberOfTrailingZeros(maskB);
                 final int delimiterByteC = Long.numberOfTrailingZeros(maskC);
+
+                final long semicolonA = cursorA + (delimiterByteA >> 3);
+                final long semicolonB = cursorB + (delimiterByteB >> 3);
                 final long semicolonC = cursorC + (delimiterByteC >> 3);
-                long temperatureWordC = UNSAFE.getLong(semicolonC + 1);
+
+                long digitStartA = semicolonA + 1;
+                long digitStartB = semicolonB + 1;
+                long digitStartC = semicolonC + 1;
+                long temperatureWordA = UNSAFE.getLong(digitStartA);
+                long temperatureWordB = UNSAFE.getLong(digitStartB);
+                long temperatureWordC = UNSAFE.getLong(digitStartC);
+
+                final long maskedWordA = currentWordA & ((maskA - 1) ^ maskA) >>> 8;
+                final long maskedWordB = currentWordB & ((maskB - 1) ^ maskB) >>> 8;
                 final long maskedWordC = currentWordC & ((maskC - 1) ^ maskC) >>> 8;
 
-                long maskD = getDelimiterMask(currentWordD);
-                long maskedFirstWordD = currentWordD & ((maskD - 1) ^ maskD) >>> 8;
-                long hashD = hash(maskedFirstWordD);
-                while (maskD == 0) {
-                    cursorD += 8;
-                    currentWordD = UNSAFE.getLong(cursorD);
-                    maskD = getDelimiterMask(currentWordD);
+                long lenA = semicolonA - startA;
+                long lenB = semicolonB - startB;
+                long lenC = semicolonC - startC;
+
+                long baseEntryPtrA;
+                if (lenA > 15) {
+                    baseEntryPtrA = getOrCreateEntryBaseOffsetSlow(lenA, startA, (int) hashA, maskedWordA);
+                }
+                else {
+                    baseEntryPtrA = getOrCreateEntryBaseOffsetFast(lenA, (int) hashA, maskedWordA, maskedFirstWordA);
                 }
-                final int delimiterByteD = Long.numberOfTrailingZeros(maskD);
-                final long semicolonD = cursorD + (delimiterByteD >> 3);
-                long temperatureWordD = UNSAFE.getLong(semicolonD + 1);
-                final long maskedWordD = currentWordD & ((maskD - 1) ^ maskD) >>> 8;
-
-                long baseEntryPtrA = getOrCreateEntryBaseOffset(semicolonA, startA, (int) hashA, maskedWordA);
-                long baseEntryPtrB = getOrCreateEntryBaseOffset(semicolonB, startB, (int) hashB, maskedWordB);
-                long baseEntryPtrC = getOrCreateEntryBaseOffset(semicolonC, startC, (int) hashC, maskedWordC);
-                long baseEntryPtrD = getOrCreateEntryBaseOffset(semicolonD, startD, (int) hashD, maskedWordD);
-
-                cursorA = parseAndStoreTemperature(semicolonA + 1, baseEntryPtrA, temperatureWordA);
-                cursorB = parseAndStoreTemperature(semicolonB + 1, baseEntryPtrB, temperatureWordB);
-                cursorC = parseAndStoreTemperature(semicolonC + 1, baseEntryPtrC, temperatureWordC);
-                cursorD = parseAndStoreTemperature(semicolonD + 1, baseEntryPtrD, temperatureWordD);
+
+                long baseEntryPtrB;
+                if (lenB > 15) {
+                    baseEntryPtrB = getOrCreateEntryBaseOffsetSlow(lenB, startB, (int) hashB, maskedWordB);
+                }
+                else {
+                    baseEntryPtrB = getOrCreateEntryBaseOffsetFast(lenB, (int) hashB, maskedWordB, maskedFirstWordB);
+                }
+
+                long baseEntryPtrC;
+                if (lenC > 15) {
+                    baseEntryPtrC = getOrCreateEntryBaseOffsetSlow(lenC, startC, (int) hashC, maskedWordC);
+                }
+                else {
+                    baseEntryPtrC = getOrCreateEntryBaseOffsetFast(lenC, (int) hashC, maskedWordC, maskedFirstWordC);
+                }
+
+                cursorA = parseAndStoreTemperature(digitStartA, baseEntryPtrA, temperatureWordA);
+                cursorB = parseAndStoreTemperature(digitStartB, baseEntryPtrB, temperatureWordB);
+                cursorC = parseAndStoreTemperature(digitStartC, baseEntryPtrC, temperatureWordC);
             }
             doTail();
+            // System.out.println("Longest chain: " + longestChain);
         }
 
-        private long getOrCreateEntryBaseOffset(long semicolonPtr, long startPtr, int hash, long maskedWord) {
-            long lenLong = semicolonPtr - startPtr;
+        private long getOrCreateEntryBaseOffsetFast(long lenLong, int hash, long maskedLastWord, long maskedFirstWord) {
             int lenA = (int) lenLong;
-
             long mapIndexA = hash & MAP_MASK;
             for (;;) {
-                long basePtr = mapIndexA * MAP_ENTRY_SIZE_BYTES + map;
-                long lenPtr = basePtr + LEN_OFFSET;
-                long namePtr = basePtr + NAME_OFFSET;
+                long basePtr = mapIndexA * FAST_MAP_ENTRY_SIZE_BYTES + fastMap;
+                long lenPtr = basePtr + MAP_LEN_OFFSET;
                 int len = UNSAFE.getInt(lenPtr);
                 if (len == lenA) {
-                    namePtr = UNSAFE.getLong(basePtr + NAME_OFFSET);
-                    if (nameMatch(startPtr, maskedWord, namePtr, lenLong)) {
+                    long namePart1 = UNSAFE.getLong(basePtr + FAST_MAP_NAME_PART1);
+                    long namePart2 = UNSAFE.getLong(basePtr + FAST_MAP_NAME_PART2);
+                    if (namePart1 == maskedFirstWord && namePart2 == maskedLastWord) {
                         return basePtr;
                     }
                 }
                 else if (len == 0) {
-                    // todo: uncommon branch maybe?
-                    // empty slot
-                    UNSAFE.putLong(namePtr, currentNamesPtr);
                     UNSAFE.putInt(lenPtr, lenA);
                     // todo: this could be a single putLong()
-                    UNSAFE.putInt(basePtr + MAX_OFFSET, Integer.MIN_VALUE);
-                    UNSAFE.putInt(basePtr + MIN_OFFSET, Integer.MAX_VALUE);
-                    UNSAFE.copyMemory(startPtr, currentNamesPtr, lenA);
-                    long consume = (lenLong & ~7L) + 8;
-                    currentNamesPtr += consume;
-                    assert currentNamesPtr <= namesHi;
+                    UNSAFE.putInt(basePtr + MAP_MAX_OFFSET, Integer.MIN_VALUE);
+                    UNSAFE.putInt(basePtr + MAP_MIN_OFFSET, Integer.MAX_VALUE);
+                    UNSAFE.putLong(basePtr + FAST_MAP_NAME_PART1, maskedFirstWord);
+                    UNSAFE.putLong(basePtr + FAST_MAP_NAME_PART2, maskedLastWord);
                     return basePtr;
                 }
                 mapIndexA = ++mapIndexA & MAP_MASK;
             }
         }
 
-        private static boolean nameMatch(long startA, long maskedWordA, long namePtr, long len) {
-            // long namePtr = basePtr + NAME_OFFSET;
-            long fullLen = len & ~7L;
-            long offset;
+        private long getOrCreateEntryBaseOffsetSlow(long lenLong, long startPtr, int hash, long maskedLastWord) { // returns the slot address for this name, claiming an empty slot if needed
+            long fullLen = lenLong & ~7L; // name length rounded down to a multiple of 8
+            int lenA = (int) lenLong;
+            long mapIndexA = hash & MAP_MASK;
+            for (;;) { // probe slot after slot until the name is found or an empty slot is claimed
+                long basePtr = mapIndexA * SLOW_MAP_ENTRY_SIZE_BYTES + slowMap;
+                long lenPtr = basePtr + MAP_LEN_OFFSET;
+                long namePtr = basePtr + SLOW_MAP_NAME_OFFSET; // address of the slot's name-pointer field
+                int len = UNSAFE.getInt(lenPtr);
+                if (len == lenA) {
+                    namePtr = UNSAFE.getLong(basePtr + SLOW_MAP_NAME_OFFSET); // from here on: address of the stored name bytes
+                    if (nameMatch(startPtr, maskedLastWord, namePtr, fullLen)) {
+                        return basePtr;
+                    }
+                }
+                else if (len == 0) { // stored length 0: slot not yet used (run() zero-fills slowMap)
+                    UNSAFE.putLong(namePtr, slowMapNamesPtr); // the slot records where its name will be stored
+                    UNSAFE.putInt(lenPtr, lenA);
+                    UNSAFE.putInt(basePtr + MAP_MAX_OFFSET, Integer.MIN_VALUE);
+                    UNSAFE.putInt(basePtr + MAP_MIN_OFFSET, Integer.MAX_VALUE);
+                    UNSAFE.copyMemory(startPtr, slowMapNamesPtr, lenA);
+                    long alignedLen = (lenLong & ~7L) + 8; // whole 8-byte words, always more than the name's length
+                    slowMapNamesPtr += alignedLen;
+                    return basePtr;
+                }
+                mapIndexA = ++mapIndexA & MAP_MASK; // next slot index, masked with MAP_MASK
+            }
+        }
 
-            // todo: this is worth exploring further.
-            // @mtopolnik has an interesting algo with 2 unconditioned long loads: this is sufficient
-            // for majority of names. so we would be left with just a single branch which is almost never taken?
+        private static boolean nameMatch(long start, long maskedLastWord, long namePtr, long fullLen) {
+            return nameMatchSlow(start, namePtr, fullLen, maskedLastWord); // pure delegate; note the different argument order
+        }
+
+        private static boolean nameMatchSlow(long start, long namePtr, long fullLen, long maskedLastWord) {
+            long offset;
             for (offset = 0; offset < fullLen; offset += 8) {
-                if (UNSAFE.getLong(startA + offset) != UNSAFE.getLong(namePtr + offset)) {
+                if (UNSAFE.getLong(start + offset) != UNSAFE.getLong(namePtr + offset)) {
                     return false;
                 }
             }
-
             long maskedWordInMap = UNSAFE.getLong(namePtr + fullLen);
-            return (maskedWordInMap == maskedWordA);
+            return (maskedWordInMap == maskedLastWord);
         }
     }
 

From 415b3eb5c5dbaf6f7228898e1f6b84fa366b373e Mon Sep 17 00:00:00 2001
From: Jeevjyot Singh Chhabda <jeevjyotchhabda@gmail.com>
Date: Tue, 23 Jan 2024 09:43:58 -0800
Subject: [PATCH 109/268] b1rc challenge by @jeevjyot (#551)

* b1rc challenge

* fixed a rounding error

* added the file back

* fixed file

* removed a file

---------

Co-authored-by: Jeevjyot Singh Chhabda <jeevjyotsinghchhabda@Jeevjyots-MBP.hsd1.ca.comcast.net>
---
 calculate_average_jeevjyot.sh                 |  19 ++++
 .../onebrc/CalculateAverage_jeevjyot.java     | 107 ++++++++++++++++++
 2 files changed, 126 insertions(+)
 create mode 100755 calculate_average_jeevjyot.sh
 create mode 100644 src/main/java/dev/morling/onebrc/CalculateAverage_jeevjyot.java

diff --git a/calculate_average_jeevjyot.sh b/calculate_average_jeevjyot.sh
new file mode 100755
index 000000000..215eeff14
--- /dev/null
+++ b/calculate_average_jeevjyot.sh
@@ -0,0 +1,19 @@
+#!/bin/sh
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+JAVA_OPTS="" # no extra JVM flags are set for this entry
+java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_jeevjyot # $JAVA_OPTS is unquoted, so flags added above are split on whitespace
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_jeevjyot.java b/src/main/java/dev/morling/onebrc/CalculateAverage_jeevjyot.java
new file mode 100644
index 000000000..191e407bc
--- /dev/null
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_jeevjyot.java
@@ -0,0 +1,107 @@
+/*
+ *  Copyright 2023 The original authors
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package dev.morling.onebrc;
+
+import static java.lang.Math.round;
+import static java.util.stream.Collectors.*;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.Map;
+import java.util.TreeMap;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.stream.Collector;
+
+/** Prints min/mean/max temperature per station, sorted by station name, from the rows of {@link #MEAUREMENT_FILE}. */
+public class CalculateAverage_jeevjyot {
+
+    public static final String MEAUREMENT_FILE = "./measurements.txt";
+
+    public static void main(String[] args) throws IOException {
+        Map<String, tempMeasurement> result = new ConcurrentHashMap<>();
+        // Files.lines keeps the file open until the stream is closed, hence the try-with-resources.
+        try (var lines = Files.lines(Path.of(MEAUREMENT_FILE))) {
+            lines.parallel().forEach(s -> {
+                var separatorIndex = s.indexOf(';');
+                var stationName = s.substring(0, separatorIndex);
+                var temp = parseDoubleFast(s.substring(separatorIndex + 1));
+                result.computeIfAbsent(stationName, d -> new tempMeasurement(temp)).recordTemp(temp);
+            });
+        }
+        TreeMap<String, tempMeasurement> sortedStats = new TreeMap<>(result);
+        System.out.println(sortedStats);
+    }
+
+    /** Parses a plain decimal such as {@code -12.3} (optional '-', digits, optional '.' and digits) without validation. */
+    public static double parseDoubleFast(String str) {
+        boolean negative = false;
+        double result = 0;
+        int length = str.length();
+        int i = 0;
+        if (str.charAt(0) == '-') {
+            negative = true;
+            i++;
+        }
+        for (; i < length; i++) {
+            char c = str.charAt(i);
+            if (c == '.') {
+                int divisor = 1;
+                for (i++; i < length; i++) {
+                    result += (double) (str.charAt(i) - '0') / (divisor *= 10);
+                }
+                break;
+            }
+            result = result * 10 + (c - '0');
+        }
+        return negative ? -result : result;
+    }
+
+    private static double round(double value) {
+        return Math.round(value * 10.0) / 10.0;
+    }
+
+    /** Running min/max/sum/count of one station; recordTemp is synchronized as the parallel stream shares instances. */
+    public static class tempMeasurement {
+        double minTemp;
+        double maxTemp;
+        double sum;
+        int count;
+
+        public tempMeasurement(double temString) {
+            this.minTemp = temString;
+            this.maxTemp = temString;
+        }
+
+        public synchronized void recordTemp(double temp) {
+            this.minTemp = Math.min(minTemp, temp);
+            this.maxTemp = Math.max(maxTemp, temp);
+            sum += temp;
+            count++;
+        }
+
+        double getAverage() {
+            return round(sum) / count;
+        }
+
+        @Override
+        public String toString() {
+            // Locale.ROOT pins '.' as the decimal separator whatever the JVM's default locale is.
+            return String.format(java.util.Locale.ROOT, "%.1f/%.1f/%.1f", round(minTemp), round(getAverage()), round(maxTemp));
+        }
+    }
+}

From c232346e876120b78790cbe73f0ffa891ef6f941 Mon Sep 17 00:00:00 2001
From: Gerd Aschemann <github@aschemann.net>
Date: Tue, 23 Jan 2024 19:13:13 +0100
Subject: [PATCH 110/268] #540 Cache SDKman (#554)

IMPORTANT: Only use SDKman provided Java - System JDK no longer installed!
---
 .github/workflows/maven.yml | 34 ++++++++++++++++++----------------
 1 file changed, 18 insertions(+), 16 deletions(-)

diff --git a/.github/workflows/maven.yml b/.github/workflows/maven.yml
index 9c0bc3b85..2014739f5 100644
--- a/.github/workflows/maven.yml
+++ b/.github/workflows/maven.yml
@@ -17,6 +17,8 @@
 name: Build
 
 on:
+  # Enable manual re-run
+  workflow_dispatch: { }
   push:
     branches: [ main ]
   pull_request:
@@ -32,11 +34,12 @@ jobs:
         with:
           submodules: 'true'
 
-      - name: 'Set up Java'
-        uses: actions/setup-java@v2
+      - name: Cache SDKMan
+        id: cache-sdkman
+        uses: actions/cache@v4
         with:
-          java-version: 21
-          distribution: 'temurin'
+          path: ~/.sdkman
+          key: ${{ runner.os }}-sdkman
 
       - name: 'Cache Maven packages'
         uses: actions/cache@v3
@@ -45,21 +48,20 @@ jobs:
           key: ${{ runner.os }}-m2-${{ hashFiles('**/pom.xml') }}
           restore-keys: ${{ runner.os }}-m2
 
-      #- name: Cache SDKMan
-      #  id: cache-sdkman
-      #  uses: actions/cache@v4
-      #  with:
-      #    path: ~/.sdkman
-      #    key: ${{ runner.os }}-sdkman
-
-      - name: 'Build project'
-        run: mvn -B clean verify -Pci
-
       - name: 'Setup SDKMAN'
         uses: sdkman/sdkman-action@b1f9b696c79148b66d3d3a06f7ea801820318d0f
         id: sdkman
-        if: github.event_name == 'pull_request'
+
+      - name: 'Build project'
+        run: |
+          source "$HOME/.sdkman/bin/sdkman-init.sh"
+          ./mvnw --version
+          ./mvnw -B clean verify -Pci
 
       - name: 'Test submission'
-        run: ./test_ci.sh ${{ github.event.pull_request.user.login }}
+        shell: bash
+        run: |
+          ./test_ci.sh ${{ github.event.pull_request.user.login }}
         if: github.event_name == 'pull_request'
+
+

From 46d2058bd45eeb7ac2f8071bd67e628676545ee6 Mon Sep 17 00:00:00 2001
From: Gaurav Anantrao Deshmukh <gauravdeshmukh42@gmail.com>
Date: Wed, 24 Jan 2024 00:02:03 +0530
Subject: [PATCH 111/268] First optimal solution attempt (#539)

* First optimal attempt

* Removing debug lines

* Using default string equals method

---------

Co-authored-by: Gaurav Deshmukh <deshmgau@amazon.com>
---
 calculate_average_gauravdeshmukh.sh           |  19 ++
 .../CalculateAverage_gauravdeshmukh.java      | 308 ++++++++++++++++++
 2 files changed, 327 insertions(+)
 create mode 100755 calculate_average_gauravdeshmukh.sh
 create mode 100644 src/main/java/dev/morling/onebrc/CalculateAverage_gauravdeshmukh.java

diff --git a/calculate_average_gauravdeshmukh.sh b/calculate_average_gauravdeshmukh.sh
new file mode 100755
index 000000000..4f941e4bd
--- /dev/null
+++ b/calculate_average_gauravdeshmukh.sh
@@ -0,0 +1,19 @@
+#!/bin/sh
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+JAVA_OPTS="" # no extra JVM flags are set for this entry
+java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_gauravdeshmukh # $JAVA_OPTS is unquoted, so flags added above are split on whitespace
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_gauravdeshmukh.java b/src/main/java/dev/morling/onebrc/CalculateAverage_gauravdeshmukh.java
new file mode 100644
index 000000000..def75ecfe
--- /dev/null
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_gauravdeshmukh.java
@@ -0,0 +1,308 @@
+/*
+ *  Copyright 2023 The original authors
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package dev.morling.onebrc;
+
+import java.io.File;
+import java.io.RandomAccessFile;
+import java.nio.ByteBuffer;
+import java.nio.MappedByteBuffer;
+import java.nio.channels.FileChannel;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.TreeMap;
+import java.util.concurrent.Callable;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+
+public class CalculateAverage_gauravdeshmukh {
+
+    private static final String FILE = "./measurements.txt";
+    private static final byte NEGATIVE_SIGN_BYTE = 0x2D;
+    private static final byte DOT_BYTE = 0x2E;
+    private static final int SEARCH_SPACE_BUFFER_SIZE = 140;
+
+    private static final long SEMI_COLON_MASK = 0x3B3B3B3B3B3B3B3BL;
+    private static final long EOL_MASK = 0x0A0A0A0A0A0A0A0AL;
+
+    private static class ByteString { // immutable station name decoded from UTF-8, with its hash computed once
+        private final String string;
+        private final int staticHashCode;
+
+        public ByteString(byte[] bytes) {
+            this.string = new String(bytes, StandardCharsets.UTF_8);
+            this.staticHashCode = this.string.hashCode();
+        }
+
+        public byte[] getBytes() {
+            return string.getBytes(StandardCharsets.UTF_8);
+        }
+
+        @Override
+        public boolean equals(Object bs) { // equal only to another ByteString with the same text; null is never equal
+            return bs instanceof ByteString && this.string.equals(((ByteString) bs).string);
+        }
+
+        @Override
+        public int hashCode() {
+            return staticHashCode;
+        }
+
+        @Override
+        public String toString() {
+            return this.string;
+        }
+    }
+
+    /** Mutable pair of a station key and an integer reading. */
+    private static class Measurement {
+        public ByteString station;
+        public int value;
+
+        public Measurement(ByteString station, int value) {
+            this.station = station;
+            this.value = value;
+        }
+
+        /** Renders the pair as {@code station;value}. */
+        @Override
+        public String toString() {
+            StringBuilder text = new StringBuilder(station.toString());
+            text.append(';').append(value);
+            return text.toString();
+        }
+    }
+
+    private static class MeasurementAggregator { // running min/max/sum/count for one station
+        private double min = Double.POSITIVE_INFINITY;
+        private double max = Double.NEGATIVE_INFINITY;
+        private long sum; // long: an int total wraps once the accumulated readings exceed Integer.MAX_VALUE
+        private long count;
+
+        public String toString() { // min/mean/max, each divided by 10 and rounded to one decimal
+            return round(min / 10.0) + "/" + round(sum * 1.0 / 10.0 / count) + "/" + round(max / 10.0);
+        }
+
+        private double round(double value) {
+            return Math.round(value * 10.0) / 10.0;
+        }
+    }
+
+    /** Cuts the file into newline-aligned chunks, aggregates them on a thread pool, then merges and prints the result. */
+    public static void main(String[] args) throws Exception {
+        int cores = 1;
+
+        File file = new File(FILE);
+        long fileSize = file.length();
+        if (fileSize > 1048576) {
+            cores = Runtime.getRuntime().availableProcessors();
+        }
+        long chunkSize = fileSize / cores;
+
+        ExecutorService executorService = Executors.newFixedThreadPool(cores);
+        List<ParallelFileReaderTask> callableTasks = new ArrayList<>(cores);
+        long end = chunkSize, start = 0;
+        // A mapping stays valid after its channel is closed, so the file handle is only held while slicing.
+        try (RandomAccessFile raf = new RandomAccessFile(file, "r"); FileChannel channel = raf.getChannel()) {
+            for (int i = 0; i < cores; i++) {
+                if (i < cores - 1) {
+                    // Push the tentative boundary just past the next '\n' so that no row straddles two chunks.
+                    MappedByteBuffer mbb = channel.map(FileChannel.MapMode.READ_ONLY, end, Math.min(SEARCH_SPACE_BUFFER_SIZE, fileSize - end));
+                    int eolIndex = -1;
+                    int extraBytes = 0;
+                    while (true) {
+                        long word;
+                        try {
+                            word = mbb.getLong();
+                        }
+                        catch (java.nio.BufferUnderflowException ex) {
+                            // Fewer than 8 bytes remain: copy them into a zero-filled array and read that as the word.
+                            byte[] remainingBytes = ByteBuffer.allocate(8).putLong(0).array();
+                            mbb.get(mbb.position(), remainingBytes, 0, mbb.remaining());
+                            word = ByteBuffer.wrap(remainingBytes).getLong();
+                        }
+                        eolIndex = findEolInLong(word);
+                        if (eolIndex > -1) {
+                            extraBytes = extraBytes + eolIndex + 1;
+                            break;
+                        }
+                        extraBytes += 8;
+                    }
+                    end = end + extraBytes;
+                }
+
+                callableTasks.add(new ParallelFileReaderTask(start, (end - start),
+                        channel.map(FileChannel.MapMode.READ_ONLY, start, (end - start))));
+                start = end;
+                end = Math.min(end + chunkSize, fileSize - 1);
+            }
+        }
+        List<Future<Map<ByteString, MeasurementAggregator>>> futures = executorService.invokeAll(callableTasks);
+        // invokeAll returns only once every task has completed; stopping the pool here lets the JVM exit even if get() throws.
+        executorService.shutdown();
+        List<Map<ByteString, MeasurementAggregator>> resultList = new ArrayList<>(futures.size());
+        for (Future<Map<ByteString, MeasurementAggregator>> future : futures) {
+            resultList.add(future.get());
+        }
+
+        Map<String, MeasurementAggregator> resultMap = new TreeMap<>();
+        for (Map<ByteString, MeasurementAggregator> map : resultList) {
+            for (Map.Entry<ByteString, MeasurementAggregator> entry : map.entrySet()) {
+                MeasurementAggregator agg = resultMap.computeIfAbsent(entry.getKey().toString(), k -> new MeasurementAggregator());
+                agg.min = Math.min(agg.min, entry.getValue().min);
+                agg.max = Math.max(agg.max, entry.getValue().max);
+                agg.sum = agg.sum + entry.getValue().sum;
+                agg.count = agg.count + entry.getValue().count;
+            }
+        }
+        System.out.println(resultMap);
+    }
+
+    private static int findEolInLong(long word) { // -1 when the word holds no '\n'
+        return findPositionInLong(word, EOL_MASK); // EOL_MASK is 0x0A ('\n') repeated in all eight bytes
+    }
+
+    private static int findSemiColonInLong(long word) { // -1 when the word holds no ';'
+        return findPositionInLong(word, SEMI_COLON_MASK); // SEMI_COLON_MASK is 0x3B (';') repeated in all eight bytes
+    }
+
+    private static int findPositionInLong(long word, long searchMask) {
+        long maskedWord = word ^ searchMask; // a byte equal to the searched byte becomes 0x00
+        long tmp = ~(((maskedWord & 0x7F7F7F7F7F7F7F7FL) + 0x7F7F7F7F7F7F7F7FL) | maskedWord | 0x7F7F7F7F7F7F7F7FL); // exact, carry-free: 0x80 only in zero bytes
+        return tmp == 0 ? -1 : (Long.numberOfLeadingZeros(tmp) >>> 3); // byte index counted from the most significant byte, -1 if absent
+    }
+
+    private static class ParallelFileReaderTask implements Callable<Map<ByteString, MeasurementAggregator>> { // scans one mapped chunk and aggregates min/max/sum/count per station
+        private long start; // assigned in the constructor; never read inside this class
+        private int size; // number of bytes of mbf this task scans
+        private MappedByteBuffer mbf; // the chunk to parse; read with both relative and absolute gets
+        byte[] bytes; // allocated in the constructor; not used by call()
+        private static final int BATCH_READ_SIZE = 64;
+        Map<ByteString, MeasurementAggregator> map; // per-task result, returned by call()
+
+        public ParallelFileReaderTask(long start, long size, MappedByteBuffer mbf) {
+            this.start = start;
+            this.size = (int) size; // narrowing cast: a size above Integer.MAX_VALUE would be truncated
+            this.mbf = mbf;
+            this.bytes = new byte[BATCH_READ_SIZE];
+            this.map = new HashMap<>(10000); // initial capacity hint only
+        }
+
+        @Override
+        public Map<ByteString, MeasurementAggregator> call() throws Exception { // parses "station;number<EOL>" records 8 bytes at a time and returns the aggregates
+            int bytesReadTillNow = 0; // offset of the next byte that has not been scanned yet
+            int startOfStation = 0, startOfNumber = -1, endOfStation = -1, endOfNumber = -1; // end offsets are exclusive
+            boolean isLastRead = false; // set once the number scan had to fall back to a partial (tail) read
+            try {
+                while (bytesReadTillNow < this.size) { // one iteration per record
+                    int semiColonIndex = -1;
+                    while (semiColonIndex == -1 && bytesReadTillNow < this.size) { // phase 1: find the ';' that ends the station name
+                        long currentWord;
+                        try {
+                            currentWord = mbf.getLong(); // relative read: advances the buffer position by 8
+                        }
+                        catch (java.nio.BufferUnderflowException ex) { // fewer than 8 bytes remain in the buffer
+                            int remainingBytesCount = this.size - bytesReadTillNow;
+                            byte[] remainingBytes = ByteBuffer.allocate(8).putLong(0).array(); // zero-filled 8-byte scratch array
+                            mbf.get(bytesReadTillNow, remainingBytes, 0, remainingBytesCount); // absolute read of the tail; the rest stays zero
+                            currentWord = ByteBuffer.wrap(remainingBytes).getLong();
+                        }
+                        semiColonIndex = findSemiColonInLong(currentWord);
+                        if (semiColonIndex > -1) {
+                            endOfStation = bytesReadTillNow + semiColonIndex;
+                            startOfNumber = bytesReadTillNow + semiColonIndex + 1;
+                            mbf.position(startOfNumber); // reposition so the next relative read starts right after the separator
+                            bytesReadTillNow += semiColonIndex + 1;
+                        }
+                        else {
+                            bytesReadTillNow += 8; // no separator in this word: skip all 8 bytes
+                        }
+                    }
+
+                    int stationLength = endOfStation - startOfStation;
+                    byte[] stationBytes = new byte[stationLength];
+                    mbf.get(startOfStation, stationBytes, 0, stationLength); // absolute bulk get: does not move the buffer position
+
+                    int eolIndex = -1;
+                    while (eolIndex == -1 && bytesReadTillNow < this.size) { // phase 2: find the end of line that ends the number
+                        long currentWord;
+                        try {
+                            currentWord = mbf.getLong();
+                        }
+                        catch (java.nio.BufferUnderflowException ex) { // same tail handling as in phase 1
+                            int remainingBytesCount = this.size - bytesReadTillNow;
+                            byte[] remainingBytes = ByteBuffer.allocate(8).putLong(0).array();
+                            mbf.get(bytesReadTillNow, remainingBytes, 0, remainingBytesCount);
+                            currentWord = ByteBuffer.wrap(remainingBytes).getLong();
+                            isLastRead = true;
+                        }
+                        eolIndex = findEolInLong(currentWord);
+                        if (eolIndex > -1) {
+                            endOfNumber = bytesReadTillNow + eolIndex;
+                            startOfStation = bytesReadTillNow + eolIndex + 1; // the next record begins right after the terminator
+                            mbf.position(startOfStation);
+                            bytesReadTillNow += eolIndex + 1;
+                        }
+                        else {
+                            bytesReadTillNow += 8;
+                        }
+                        if (isLastRead) {
+                            bytesReadTillNow = this.size; // makes both the inner and the outer loop terminate
+                            if (eolIndex == -1) {
+                                endOfNumber = this.size; // no terminator found: the number runs to the end of the chunk
+                            }
+                        }
+                    }
+
+                    int numberLength = endOfNumber - startOfNumber;
+                    byte[] numberBytes = new byte[numberLength];
+                    mbf.get(startOfNumber, numberBytes, 0, numberLength);
+
+                    Measurement measurement = new Measurement(new ByteString(stationBytes),
+                            getIntegerFromTemperatureBytes(numberBytes));
+                    MeasurementAggregator aggregator = this.map.get(measurement.station);
+                    if (aggregator == null) { // first measurement seen for this station in this chunk
+                        aggregator = new MeasurementAggregator();
+                        this.map.put(measurement.station, aggregator);
+                    }
+                    aggregator.min = Math.min(aggregator.min, measurement.value);
+                    aggregator.max = Math.max(aggregator.max, measurement.value);
+                    aggregator.sum += measurement.value;
+                    aggregator.count++;
+                }
+            }
+            catch (Exception ex) { // NOTE(review): rethrows unchanged, so this try/catch adds no handling
+                throw ex;
+            }
+
+            return this.map;
+        }
+
+        private int getIntegerFromTemperatureBytes(byte[] numberBytes) { // converts the number text to an int with the DOT_BYTE skipped; presumably "-12.3" -> -123 - TODO confirm constants
+            int firstDigitIndex = (numberBytes[0] ^ NEGATIVE_SIGN_BYTE) == 0 ? 1 : 0; // skip a leading sign byte; NOTE(review): an empty array would throw here
+            int ret = 0;
+            for (int i = firstDigitIndex; i < numberBytes.length; i++) {
+                if ((numberBytes[i] ^ DOT_BYTE) != 0) { // every byte except DOT_BYTE is treated as a digit, without validation
+                    ret = (ret << 3) + (ret << 1) + ((int) numberBytes[i] - 48); // ret * 10 + digit; 48 is ASCII '0'
+                }
+            }
+            return (firstDigitIndex > 0) ? -ret : ret;
+        }
+    }
+}

From d8589590979f70f01d19b59ec018042b4ab59fdf Mon Sep 17 00:00:00 2001
From: Artsiom Korzun <72259616+artsiomkorzun@users.noreply.github.com>
Date: Tue, 23 Jan 2024 19:38:32 +0100
Subject: [PATCH 112/268] parse value before going to map (#548)

parse value before going to map
---
 prepare_artsiomkorzun.sh                      |  2 +-
 .../CalculateAverage_artsiomkorzun.java       | 76 ++++++++++++-------
 2 files changed, 50 insertions(+), 28 deletions(-)

diff --git a/prepare_artsiomkorzun.sh b/prepare_artsiomkorzun.sh
index 984048691..d1263addb 100755
--- a/prepare_artsiomkorzun.sh
+++ b/prepare_artsiomkorzun.sh
@@ -19,6 +19,6 @@ source "$HOME/.sdkman/bin/sdkman-init.sh"
 sdk use java 21.0.2-graal 1>&2
 
 if [ ! -f target/CalculateAverage_artsiomkorzun_image ]; then
-    NATIVE_IMAGE_OPTS="--gc=epsilon -O3 -march=native -R:MaxHeapSize=64m --enable-preview --initialize-at-build-time=dev.morling.onebrc.CalculateAverage_artsiomkorzun"
+    NATIVE_IMAGE_OPTS="--gc=epsilon -O3 -march=native -R:MaxHeapSize=64m -H:-GenLoopSafepoints --enable-preview --initialize-at-build-time=dev.morling.onebrc.CalculateAverage_artsiomkorzun"
     native-image $NATIVE_IMAGE_OPTS -cp target/average-1.0.0-SNAPSHOT.jar -o target/CalculateAverage_artsiomkorzun_image dev.morling.onebrc.CalculateAverage_artsiomkorzun
 fi
\ No newline at end of file
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java b/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java
index 40b8db05a..bb2198f5d 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java
@@ -383,6 +383,13 @@ public void run() {
             }
         }
 
+        private static long next(long position) {
+            while (UNSAFE.getByte(position++) != '\n') {
+                // continue
+            }
+            return position;
+        }
+
         private static void aggregate(Aggregates aggregates, long position, long limit) {
             // this parsing can produce seg fault at page boundaries
             // e.g. file size is 4096 and the last entry is X=0.0, which is less than 8 bytes
@@ -392,18 +399,27 @@ private static void aggregate(Aggregates aggregates, long position, long limit)
             while (position <= limit) { // branchy version, credit: thomaswue
                 int length;
                 int hash;
+                int value;
 
                 long word = word(position);
                 long separator = separator(word);
+                long end = position;
 
                 if (separator != 0) {
                     length = length(separator);
                     word = mask(word, separator);
                     hash = mix(word);
+                    end += length;
+
+                    long num = word(end);
+                    int dot = dot(num);
+                    value = value(num, dot);
+                    end += (dot >> 3) + 3;
                     long ptr = aggregates.find(word, hash);
 
                     if (ptr != 0) {
-                        position = update(ptr, position + length);
+                        Aggregates.update(ptr, value);
+                        position = end;
                         continue;
                     }
                 }
@@ -416,10 +432,17 @@ private static void aggregate(Aggregates aggregates, long position, long limit)
                         length = length(separator) + 8;
                         word = mask(word, separator);
                         hash = mix(word ^ word0);
+                        end += length;
+
+                        long num = word(end);
+                        int dot = dot(num);
+                        value = value(num, dot);
+                        end += (dot >> 3) + 3;
                         long ptr = aggregates.find(word0, word, hash);
 
                         if (ptr != 0) {
-                            position = update(ptr, position + length);
+                            Aggregates.update(ptr, value);
+                            position = end;
                             continue;
                         }
                     }
@@ -440,31 +463,23 @@ private static void aggregate(Aggregates aggregates, long position, long limit)
                             length += length(separator);
                             word = mask(word, separator);
                             hash = mix(h ^ word);
+                            end += length;
+
+                            long num = word(end);
+                            int dot = dot(num);
+                            value = value(num, dot);
+                            end += (dot >> 3) + 3;
                             break;
                         }
                     }
                 }
 
                 long ptr = aggregates.put(position, word, length, hash);
-                position = update(ptr, position + length);
+                Aggregates.update(ptr, value);
+                position = end;
             }
         }
 
-        private static long update(long ptr, long position) {
-            // idea: merykitty
-            long word = word(position);
-            long inverted = ~word;
-            int dot = Long.numberOfTrailingZeros(inverted & DOT_BITS);
-            long signed = (inverted << 59) >> 63;
-            long mask = ~(signed & 0xFF);
-            long digits = ((word & mask) << (28 - dot)) & 0x0F000F0F00L;
-            long abs = ((digits * MAGIC_MULTIPLIER) >>> 32) & 0x3FF;
-            int value = (int) ((abs ^ signed) - signed);
-
-            Aggregates.update(ptr, value);
-            return position + (dot >> 3) + 3;
-        }
-
         private static long separator(long word) {
             long match = word ^ COMMA_PATTERN;
             return (match - 0x0101010101010101L) & (~match & 0x8080808080808080L);
@@ -479,17 +494,24 @@ private static int length(long separator) {
             return (Long.numberOfTrailingZeros(separator) >>> 3) + 1;
         }
 
-        private static long next(long position) {
-            while (UNSAFE.getByte(position++) != '\n') {
-                // continue
-            }
-            return position;
-        }
-
         private static int mix(long x) {
             long h = x * -7046029254386353131L;
-            h ^= h >>> 32;
-            return (int) (h ^ h >>> 16);
+            h ^= h >>> 35;
+            return (int) h;
+            // h ^= h >>> 32;
+            // return (int) (h ^ h >>> 16);
+        }
+
+        private static int dot(long num) {
+            return Long.numberOfTrailingZeros(~num & DOT_BITS);
+        }
+
+        private static int value(long w, int dot) {
+            long signed = (~w << 59) >> 63;
+            long mask = ~(signed & 0xFF);
+            long digits = ((w & mask) << (28 - dot)) & 0x0F000F0F00L;
+            long abs = ((digits * MAGIC_MULTIPLIER) >>> 32) & 0x3FF;
+            return (int) ((abs ^ signed) - signed);
         }
     }
 }

From c886aaba3498fbc381a009d0f0a466a20194992b Mon Sep 17 00:00:00 2001
From: Parker Timmins <45302127+parkertimmins@users.noreply.github.com>
Date: Tue, 23 Jan 2024 12:43:34 -0600
Subject: [PATCH 113/268] Deploy v2 for parkertimmins (#524)

* Deploy v2 for parkertimmins

Main changes:
- fix hash which masked incorrectly
- do station equality check in simd
- make station array length multiple of 32
- search for newline rather than semicolon

* Fix bug - entries were being skipped between batches

At the boundary between two batches, the first batch would stop after
crossing a limit with a padding of 200 characters applied. The next
batch should then start looking for the first full entry after the
padding. This padding logic had been removed when starting a batch. For
this reason, entries starting in the 200 character padding between
batches were skipped.
---
 .../CalculateAverage_parkertimmins.java       | 175 ++++++++----------
 1 file changed, 78 insertions(+), 97 deletions(-)

diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_parkertimmins.java b/src/main/java/dev/morling/onebrc/CalculateAverage_parkertimmins.java
index 71412fb78..c689ff1ad 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_parkertimmins.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_parkertimmins.java
@@ -16,28 +16,21 @@
 package dev.morling.onebrc;
 
 import jdk.incubator.vector.ByteVector;
-import jdk.incubator.vector.VectorMask;
-import jdk.incubator.vector.VectorOperators;
 
 import java.lang.foreign.Arena;
 import java.lang.foreign.MemorySegment;
 
 import java.lang.foreign.ValueLayout;
-import java.lang.reflect.Array;
-import java.nio.ByteBuffer;
 import java.nio.ByteOrder;
 import java.nio.charset.StandardCharsets;
 import java.io.IOException;
 import java.io.RandomAccessFile;
-import java.nio.MappedByteBuffer;
 import java.nio.channels.FileChannel;
 import java.util.*;
 import java.util.concurrent.atomic.AtomicLong;
-import java.util.zip.CRC32C;
 
 public class CalculateAverage_parkertimmins {
     private static final String FILE = "./measurements.txt";
-    // private static final String FILE = "./full_measurements.no_license";
 
     private static record ResultRow(double min, double mean, double max) {
         public String toString() {
@@ -51,14 +44,16 @@ private double round(double value) {
 
     static class OpenHashTable {
         static class Entry {
+
+            // key always stored as multiple of 32 bytes
             byte[] key;
-            short min;
-            short max;
+            byte keyLen;
+            short min = Short.MAX_VALUE;
+            short max = Short.MIN_VALUE;
             long sum = 0;
             long count = 0;
-            int hash;
 
-            void merge(OpenHashTable.Entry other) {
+            void merge(Entry other) {
                 min = (short) Math.min(min, other.min);
                 max = (short) Math.max(max, other.max);
                 sum += other.sum;
@@ -80,15 +75,20 @@ void add(byte[] buf, int sLen, short val, int hash) {
                 // key not present, so add it
                 if (entry == null) {
                     entry = entries[idx] = new Entry();
-                    entry.key = Arrays.copyOf(buf, sLen);
+
+                    int rem = sLen % 32;
+                    int arrayLen = rem == 0 ? sLen : sLen + 32 - rem;
+                    entry.key = Arrays.copyOf(buf, arrayLen);
+                    Arrays.fill(entry.key, sLen, arrayLen, (byte) 0);
+                    entry.keyLen = (byte) sLen;
+
                     entry.min = entry.max = val;
                     entry.sum += val;
                     entry.count++;
-                    entry.hash = hash;
                     break;
                 }
                 else {
-                    if (entry.hash == hash && entry.key.length == sLen && Arrays.equals(entry.key, 0, sLen, buf, 0, sLen)) {
+                    if (entry.keyLen == sLen && eq(buf, entry.key, entry.keyLen)) {
                         entry.min = (short) Math.min(entry.min, val);
                         entry.max = (short) Math.max(entry.max, val);
                         entry.sum += val;
@@ -103,6 +103,23 @@ void add(byte[] buf, int sLen, short val, int hash) {
         }
     }
 
+    static boolean eq(byte[] buf, byte[] entryKey, int sLen) {
+        int needed = sLen;
+        for (int offset = 0; offset <= 96; offset += 32) {
+            var a = ByteVector.fromArray(ByteVector.SPECIES_256, buf, offset);
+            var b = ByteVector.fromArray(ByteVector.SPECIES_256, entryKey, offset);
+            int matches = a.eq(b).not().firstTrue();
+            if (needed <= 32) {
+                return matches >= needed;
+            }
+            else if (matches < 32) {
+                return false;
+            }
+            needed -= 32;
+        }
+        return false;
+    }
+
     static long findNextEntryStart(MemorySegment ms, long offset) {
         long curr = offset;
         while (ms.get(ValueLayout.JAVA_BYTE, curr) != '\n') {
@@ -112,8 +129,17 @@ static long findNextEntryStart(MemorySegment ms, long offset) {
         return curr;
     }
 
-    static short[] digits10s = { 0, 100, 200, 300, 400, 500, 600, 700, 800, 900 };
-    static short[] digits1s = { 0, 10, 20, 30, 40, 50, 60, 70, 80, 90 };
+    static short[] digits2s = new short[256];
+    static short[] digits1s = new short[256];
+    static short[] digits0s = new short[256];
+
+    static {
+        for (int i = 0; i < 10; ++i) {
+            digits2s[i + ((int) '0')] = (short) (i * 100);
+            digits1s[i + ((int) '0')] = (short) (i * 10);
+            digits0s[i + ((int) '0')] = (short) i;
+        }
+    }
 
     static void processRangeScalar(MemorySegment ms, long start, long end, final OpenHashTable localAgg) {
         byte[] buf = new byte[128];
@@ -139,9 +165,10 @@ static void processRangeScalar(MemorySegment ms, long start, long end, final Ope
             boolean neg = ms.get(ValueLayout.JAVA_BYTE, tempIdx) == '-';
             boolean twoDig = ms.get(ValueLayout.JAVA_BYTE, tempIdx + 1 + (neg ? 1 : 0)) == '.';
             int len = 3 + (neg ? 1 : 0) + (twoDig ? 0 : 1);
-            int d0 = ((char) ms.get(ValueLayout.JAVA_BYTE, tempIdx + len - 1)) - '0';
-            int d1 = ((char) ms.get(ValueLayout.JAVA_BYTE, tempIdx + len - 3)) - '0';
-            int base = d0 + digits1s[d1] + (twoDig ? 0 : digits10s[((char) ms.get(ValueLayout.JAVA_BYTE, tempIdx + len - 4)) - '0']);
+            int d0 = ((char) ms.get(ValueLayout.JAVA_BYTE, tempIdx + len - 1));
+            int d1 = ((char) ms.get(ValueLayout.JAVA_BYTE, tempIdx + len - 3));
+            int d2 = ((char) ms.get(ValueLayout.JAVA_BYTE, tempIdx + len - 4)); // could be - or \n
+            int base = digits0s[d0] + digits1s[d1] + digits2s[d2];
             short temp = (short) (neg ? -base : base);
 
             localAgg.add(buf, sLen, temp, hash);
@@ -150,100 +177,55 @@ static void processRangeScalar(MemorySegment ms, long start, long end, final Ope
     }
 
     static int hash(byte[] buf, int sLen) {
-        // TODO find a hash that works directly from byte array
-        // if shorter than 8 chars, mask out upper bits
-        long mask = sLen < 8 ? -(1L << ((8 - sLen) << 3)) : 0xFFFFFFFFL;
-        long val = ((buf[0] & 0xffL) << 56) | ((buf[1] & 0xffL) << 48) | ((buf[2] & 0xffL) << 40) | ((buf[3] & 0xffL) << 32) | ((buf[4] & 0xffL) << 24)
-                | ((buf[5] & 0xffL) << 16) | ((buf[6] & 0xFFL) << 8) | (buf[7] & 0xffL);
+        int shift = Math.max(0, 8 - sLen) << 3;
+        long mask = (~0L) >>> shift;
+        long val = ((buf[7] & 0xffL) << 56) | ((buf[6] & 0xffL) << 48) | ((buf[5] & 0xffL) << 40) | ((buf[4] & 0xffL) << 32) | ((buf[3] & 0xffL) << 24)
+                | ((buf[2] & 0xffL) << 16) | ((buf[1] & 0xFFL) << 8) | (buf[0] & 0xffL);
         val &= mask;
-
-        // also worth trying: https://lemire.me/blog/2015/10/22/faster-hashing-without-effort/
         // lemire: https://lemire.me/blog/2023/07/14/recognizing-string-prefixes-with-simd-instructions/
         int hash = (int) (((((val >> 32) ^ val) & 0xffffffffL) * 3523216699L) >> 32);
         return hash;
     }
 
-    static void processRangeSIMD(MemorySegment ms, boolean frontPad, boolean backPad, long start, long end, final OpenHashTable localAgg) {
+    static void processRangeSIMD(MemorySegment ms, boolean isFirst, boolean isLast, long start, long end, final OpenHashTable localAgg) {
         byte[] buf = new byte[128];
 
-        long curr = frontPad ? findNextEntryStart(ms, start) : start;
-        long limit = end - padding;
+        long curr = isFirst ? start : findNextEntryStart(ms, start);
+        long limit = isLast ? end - padding : end;
 
-        var needle = ByteVector.broadcast(ByteVector.SPECIES_256, ';');
         while (curr < limit) {
-
-            int segStart = 0;
-            int sLen;
-
-            while (true) {
-                var section = ByteVector.fromMemorySegment(ByteVector.SPECIES_256, ms, curr + segStart, ByteOrder.LITTLE_ENDIAN);
-                section.intoArray(buf, segStart);
-                VectorMask<Byte> matches = section.compare(VectorOperators.EQ, needle);
-                int idx = matches.firstTrue();
+            int nl = 0;
+            for (int offset = 0; offset < 128; offset += 32) {
+                ByteVector section = ByteVector.fromMemorySegment(ByteVector.SPECIES_256, ms, curr + offset, ByteOrder.LITTLE_ENDIAN);
+                section.intoArray(buf, offset);
+                var idx = section.eq((byte) '\n').firstTrue();
                 if (idx != 32) {
-                    sLen = segStart + idx;
+                    nl = offset + idx;
                     break;
                 }
-                segStart += 32;
             }
 
-            int hash = hash(buf, sLen);
-
-            curr += sLen;
-            curr++; // semicolon
-
-            long tempIdx = curr;
-            boolean neg = ms.get(ValueLayout.JAVA_BYTE, tempIdx) == '-';
-            boolean twoDig = ms.get(ValueLayout.JAVA_BYTE, tempIdx + 1 + (neg ? 1 : 0)) == '.';
-            int len = 3 + (neg ? 1 : 0) + (twoDig ? 0 : 1);
-            int d0 = ((char) ms.get(ValueLayout.JAVA_BYTE, tempIdx + len - 1)) - '0';
-            int d1 = ((char) ms.get(ValueLayout.JAVA_BYTE, tempIdx + len - 3)) - '0';
-            int base = d0 + digits1s[d1] + (twoDig ? 0 : digits10s[((char) ms.get(ValueLayout.JAVA_BYTE, tempIdx + len - 4)) - '0']);
+            int nl1 = buf[nl - 1];
+            int nl3 = buf[nl - 3];
+            int nl4 = buf[nl - 4];
+            int nl5 = buf[nl - 5];
+            int base = (nl1 - '0') + 10 * (nl3 - '0') + digits2s[nl4];
+            boolean neg = nl4 == '-' || (nl4 != ';' && nl5 == '-');
             short temp = (short) (neg ? -base : base);
+            int tempLen = 4 + (neg ? 1 : 0) + (base >= 100 ? 1 : 0);
+            int semi = nl - tempLen;
 
-            localAgg.add(buf, sLen, temp, hash);
-            curr = tempIdx + len + 1;
+            int hash = hash(buf, semi);
+            localAgg.add(buf, semi, temp, hash);
+            curr += (nl + 1);
         }
 
         // last batch is near end of file, process without SIMD to avoid out-of-bounds
-        if (!backPad) {
+        if (isLast) {
             processRangeScalar(ms, curr, end, localAgg);
         }
     }
 
-    /**
-     *  For debugging issues with hash function
-      */
-    static void checkHashDistributionQuality(ArrayList<OpenHashTable> localAggs) {
-        HashSet<Integer> uniquesHashValues = new HashSet<Integer>();
-        HashSet<String> uniqueCities = new HashSet<String>();
-        HashMap<String, HashSet<Integer>> cityToHash = new HashMap<>();
-
-        for (var agg : localAggs) {
-            for (OpenHashTable.Entry entry : agg.entries) {
-                if (entry == null) {
-                    continue;
-                }
-                uniquesHashValues.add(entry.hash);
-                String station = new String(entry.key, StandardCharsets.UTF_8); // for UTF-8 encoding
-                uniqueCities.add(station);
-
-                if (!cityToHash.containsKey(station)) {
-                    cityToHash.put(station, new HashSet<>());
-                }
-                cityToHash.get(station).add(entry.hash);
-            }
-        }
-
-        for (var pair : cityToHash.entrySet()) {
-            if (pair.getValue().size() > 1) {
-                System.err.println("multiple hashes: " + pair.getKey() + " " + pair.getValue());
-            }
-        }
-
-        System.err.println("Unique stations: " + uniqueCities.size() + ", unique hash values: " + uniquesHashValues.size());
-    }
-
     /**
      * Combine thread local values
      */
@@ -254,7 +236,7 @@ static HashMap<String, OpenHashTable.Entry> mergeAggregations(ArrayList<OpenHash
                 if (entry == null) {
                     continue;
                 }
-                String station = new String(entry.key, StandardCharsets.UTF_8); // for UTF-8 encoding
+                String station = new String(entry.key, 0, entry.keyLen, StandardCharsets.UTF_8); // for UTF-8 encoding
                 var currentVal = global.get(station);
                 if (currentVal != null) {
                     currentVal.merge(entry);
@@ -267,8 +249,6 @@ static HashMap<String, OpenHashTable.Entry> mergeAggregations(ArrayList<OpenHash
         return global;
     }
 
-    static final long batchSize = 10_000_000;
-
     static final int padding = 200; // max entry size is 107ish == 100 (station) + 1 (semicolon) + 5 (temp, eg -99.9) + 1 (newline)
 
     public static void main(String[] args) throws IOException, InterruptedException {
@@ -277,7 +257,10 @@ public static void main(String[] args) throws IOException, InterruptedException
 
         int numThreads = Runtime.getRuntime().availableProcessors();
 
+        final long batchSize = 10_000_000;
+
         final long fileSize = channel.size();
+        // final long batchSize = fileSize / numThreads + 1;
         final MemorySegment ms = channel.map(FileChannel.MapMode.READ_ONLY, 0, fileSize, Arena.global());
         final ArrayList<OpenHashTable> localAggs = new ArrayList<>(numThreads);
         Thread[] threads = new Thread[numThreads];
@@ -299,11 +282,9 @@ public void run() {
                         break;
                     }
                     final long endBatch = Math.min(startBatch + batchSize, fileSize);
-                    final boolean first = startBatch == 0;
-                    final boolean frontPad = !first;
-                    final boolean last = endBatch == fileSize;
-                    final boolean backPad = !last;
-                    processRangeSIMD(ms, frontPad, backPad, startBatch, endBatch, localAgg);
+                    final boolean isFirstBatch = startBatch == 0;
+                    final boolean isLastBatch = endBatch == fileSize;
+                    processRangeSIMD(ms, isFirstBatch, isLastBatch, startBatch, endBatch, localAgg);
                 }
             }
         }

From b3420d93483ab47b2df6934f816ce6af254adea7 Mon Sep 17 00:00:00 2001
From: Yann Moisan <yamo93@gmail.com>
Date: Tue, 23 Jan 2024 19:58:10 +0100
Subject: [PATCH 114/268] improvements (#521)

- inline computeIfAbsent
- replace arraycopy by copyOfRange

Co-authored-by: Yann Moisan <yann@zen.ly>
---
 .../onebrc/CalculateAverage_YannMoisan.java     | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_YannMoisan.java b/src/main/java/dev/morling/onebrc/CalculateAverage_YannMoisan.java
index 0e9b5cfc4..03370e67b 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_YannMoisan.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_YannMoisan.java
@@ -104,10 +104,7 @@ private static Map<Location, Stat> parse(ByteBuffer bb) {
                     break;
                 field[fieldCurrentIndex++] = fieldByte;
             }
-            var dst = new byte[fieldCurrentIndex];
-            System.arraycopy(field, 0, dst, 0, fieldCurrentIndex);
-            var fieldStr = new Location(dst);
-            // System.arraycopy(field, 0, dst, 0, fieldCurrentIndex);
+            var fieldStr = new Location(Arrays.copyOfRange(field, 0, fieldCurrentIndex));
             var number = 0;
             var sign = 1;
             while (bb.position() < limit) {
@@ -119,9 +116,15 @@ else if (numberByte == '\n')
                 else if (numberByte != '.')
                     number = number * 10 + (numberByte - '0');
             }
-            stats.computeIfAbsent(fieldStr,
-                    k -> new Stat())
-                    .update(sign * number);
+            var v = stats.get(fieldStr);
+            if (v == null) {
+                var vv = new Stat();
+                vv.update(sign * number);
+                stats.put(fieldStr, vv);
+            }
+            else {
+                v.update(sign * number);
+            }
         }
 
         return stats;

From ba793e88cd3c1b7767e00180c721b85cf5c50e28 Mon Sep 17 00:00:00 2001
From: yourwass <157275797+yourwass@users.noreply.github.com>
Date: Tue, 23 Jan 2024 21:04:55 +0200
Subject: [PATCH 115/268] Add Yourwass take on the challenge (#532)

* Uses vector api for city name parsing and for hash index collision resolution
* Uses lookup tables for temperature parsing
---
 calculate_average_yourwass.sh                 |  23 ++
 .../onebrc/CalculateAverage_yourwass.java     | 288 ++++++++++++++++++
 2 files changed, 311 insertions(+)
 create mode 100755 calculate_average_yourwass.sh
 create mode 100644 src/main/java/dev/morling/onebrc/CalculateAverage_yourwass.java

diff --git a/calculate_average_yourwass.sh b/calculate_average_yourwass.sh
new file mode 100755
index 000000000..07284ba76
--- /dev/null
+++ b/calculate_average_yourwass.sh
@@ -0,0 +1,23 @@
+#!/bin/sh
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+# Uncomment below to use sdk
+# source "$HOME/.sdkman/bin/sdkman-init.sh"
+# sdk use java 21.0.1-graal 1>&2
+
+JAVA_OPTS="--enable-preview --enable-native-access=ALL-UNNAMED --add-modules jdk.incubator.vector" # JVM flags; --add-modules makes the jdk.incubator.vector classes imported by the Java class resolvable
+java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_yourwass # $JAVA_OPTS is deliberately unquoted so it splits into separate arguments
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_yourwass.java b/src/main/java/dev/morling/onebrc/CalculateAverage_yourwass.java
new file mode 100644
index 000000000..0a24b0a7e
--- /dev/null
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_yourwass.java
@@ -0,0 +1,288 @@
+/*
+ *  Copyright 2023 The original authors
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package dev.morling.onebrc;
+
+import java.util.TreeMap;
+import java.io.IOException;
+import java.lang.foreign.Arena;
+import java.lang.foreign.MemorySegment;
+import java.lang.reflect.Field;
+import java.nio.channels.FileChannel;
+import java.nio.file.Path;
+import java.nio.file.StandardOpenOption;
+import java.nio.charset.StandardCharsets;
+import java.nio.ByteOrder;
+import jdk.incubator.vector.ByteVector;
+import jdk.incubator.vector.VectorOperators;
+import jdk.incubator.vector.VectorSpecies;
+import sun.misc.Unsafe;
+
+public class CalculateAverage_yourwass {
+
+    static final class Record {
+        public String city;
+        public long cityAddr;
+        public long cityLength;
+        public int min;
+        public int max;
+        public int count;
+        public long sum;
+
+        Record(final long cityAddr, final long cityLength) {
+            this.city = null;
+            this.cityAddr = cityAddr;
+            this.cityLength = cityLength;
+            this.min = 1000;
+            this.max = -1000;
+            this.sum = 0;
+            this.count = 0;
+        }
+
+        private Record merge(Record r) {
+            if (r.min < this.min)
+                this.min = r.min;
+            if (r.max > this.max)
+                this.max = r.max;
+            this.sum += r.sum;
+            this.count += r.count;
+            return this;
+        }
+    }
+
+    private static short lookupDecimal[];
+    private static byte lookupFraction[];
+    private static byte lookupDotPositive[];
+    private static byte lookupDotNegative[];
+    private static MemorySegment VAS;
+    private static final VectorSpecies<Byte> SPECIES = ByteVector.SPECIES_PREFERRED;
+    private static final int MAXINDEX = (1 << 16) + 10000; // short hash + max allowed cities for collisions at the end :p
+    private static final String FILE = "measurements.txt";
+    private static final Unsafe UNSAFE = getUnsafe();
+
+    private static Unsafe getUnsafe() {
+        try {
+            final Field theUnsafe = Unsafe.class.getDeclaredField("theUnsafe");
+            theUnsafe.setAccessible(true);
+            Unsafe unsafe = (Unsafe) theUnsafe.get(null);
+            return unsafe;
+        }
+        catch (NoSuchFieldException | IllegalAccessException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    public static void main(String[] args) throws IOException, Throwable {
+        // prepare lookup tables
+        // the parsing reads two shorts after possible '-'
+        // first short, the Decimal part, can be N. or NN with N:[0..9]
+        // second short, the Fraction part, can be N\n or .N
+        lookupDecimal = new short[('9' << 8) + '9' + 1];
+        lookupFraction = new byte[('9' << 8) + '.' + 1];
+        lookupDotPositive = new byte[('9' << 8) + '.' + 1];
+        lookupDotNegative = new byte[('9' << 8) + '.' + 1];
+        for (short i = 0; i < 10; i++) {
+            final int ones = i * 10;
+            final int ix256 = i << 8;
+            // case N. i.e. single digit decimals: skip to 11824 = ('.'<<8)+'0'
+            lookupDecimal[11824 + i] = (short) ones;
+            for (short j = 1; j < 10; j++) {
+                // case NN, i.e. double-digit decimals: skip to 12336 = ('0'<<8)+'0'
+                lookupDecimal[12336 + ix256 + j] = (short) (j * 100 + ones);
+            }
+            // case N\n skip to 2608 = ('\n'<<8)+'0'
+            lookupFraction[2608 + i] = (byte) i;
+            lookupDotPositive[2608 + i] = 4;
+            lookupDotNegative[2608 + i] = 5;
+            // case .N skip to 12334 = ('0'<<8)+'.'
+            lookupFraction[12334 + ix256] = (byte) i;
+            lookupDotPositive[12334 + ix256] = 5;
+            lookupDotNegative[12334 + ix256] = 6;
+        }
+
+        // open file
+        final long fileSize, mmapAddr;
+        try (var fileChannel = FileChannel.open(Path.of(FILE), StandardOpenOption.READ)) {
+            fileSize = fileChannel.size();
+            mmapAddr = fileChannel.map(FileChannel.MapMode.READ_ONLY, 0, fileSize, Arena.global()).address();
+        }
+        // VAS: Virtual Address Space, as a MemorySegment upto and including the mmaped file.
+        // If the mmaped MemorySegment is used for Vector creation as is, then there are two problems:
+        // 1) fromMemorySegment takes an offset and not an address, so we have to do arithmetic
+        // this is solved by creating a MemorySegment from Address=0
+        // 2) fromMemorySegment checks bounds for memory segment's size - Vector size
+        // this is solved by adding SPECIES.length() to the size of the segment, but
+        // XXX there lies the possibility for an out of bounds read at the end of file, which is not handled here.
+        VAS = MemorySegment.ofAddress(0).reinterpret(mmapAddr + fileSize + SPECIES.length());
+
+        // start and wait for threads to finish
+        final int nThreads = Runtime.getRuntime().availableProcessors();
+        Thread[] threadList = new Thread[nThreads];
+        final Record[][] results = new Record[nThreads][];
+        final long chunkSize = fileSize / nThreads;
+        for (int i = 0; i < nThreads; i++) {
+            final int threadIndex = i;
+            final long startAddr = mmapAddr + i * chunkSize;
+            final long endAddr = (i == nThreads - 1) ? mmapAddr + fileSize : mmapAddr + (i + 1) * chunkSize;
+            threadList[i] = new Thread(() -> results[threadIndex] = threadMain(threadIndex, startAddr, endAddr, nThreads));
+            threadList[i].start();
+        }
+        for (int i = 0; i < nThreads; i++)
+            threadList[i].join();
+
+        // aggregate results and sort
+        // TODO have to compare with concurrent-parallel stream structures:
+        // * concurrent hashtable that have to sort afterwards
+        // * concurrent skiplist that is sorted but has O(n) insert
+        // * ..other?
+        final TreeMap<String, Record> aggregateResults = new TreeMap<>();
+        for (int thread = 0; thread < nThreads; thread++) {
+            for (int index = 0; index < MAXINDEX; index++) {
+                Record record = results[thread][index];
+                if (record == null)
+                    continue;
+                aggregateResults.compute(record.city, (k, v) -> (v == null) ? record : v.merge(record));
+            }
+        }
+
+        // prepare string and print
+        StringBuilder sb = new StringBuilder();
+        sb.append("{");
+        for (var entry : aggregateResults.entrySet()) {
+            Record record = entry.getValue();
+            float min = record.min;
+            min /= 10.f;
+            float max = record.max;
+            max /= 10.f;
+            double avg = Math.round((record.sum * 1.0) / record.count) / 10.;
+            sb.append(record.city).append("=").append(min).append("/").append(avg).append("/").append(max).append(", ");
+        }
+        int stringLength = sb.length();
+        sb.setCharAt(stringLength - 2, '}');
+        sb.setCharAt(stringLength - 1, '\n');
+        System.out.print(sb.toString());
+    }
+
+    private static final boolean citiesDiffer(final long a, final long b, final long len) {
+        int part = 0;
+        for (; part < (len - 1) >> 3; part++)
+            if (UNSAFE.getLong(a + (part << 3)) != UNSAFE.getLong(b + (part << 3)))
+                return true;
+        if (((UNSAFE.getLong(a + (part << 3)) ^ (UNSAFE.getLong(b + (part << 3)))) << ((8 - (len & 7)) << 3)) != 0)
+            return true;
+        return false;
+    }
+
+    private static Record[] threadMain(int id, long startAddr, long endAddr, long nThreads) {
+        // snap to newlines
+        if (id != 0)
+            while (UNSAFE.getByte(startAddr++) != '\n')
+                ;
+        if (id != nThreads - 1)
+            while (UNSAFE.getByte(endAddr++) != '\n')
+                ;
+
+        final Record[] results = new Record[MAXINDEX];
+        final long VECTORBYTESIZE = SPECIES.length();
+        final ByteOrder BYTEORDER = ByteOrder.nativeOrder();
+        final ByteVector delim = ByteVector.broadcast(SPECIES, ';');
+        long nextCityAddr = startAddr; // XXX from these three variables,
+        long cityAddr = nextCityAddr; // only two are necessary, but if one
+        long ptr = 0; // is eliminated, on my pc the benchmark gets worse..
+        while (nextCityAddr < endAddr) {
+            // parse city
+            long mask = ByteVector.fromMemorySegment(SPECIES, VAS, nextCityAddr + ptr, BYTEORDER)
+                    .compare(VectorOperators.EQ, delim).toLong();
+            if (mask == 0) {
+                ptr += VECTORBYTESIZE;
+                continue;
+            }
+            final long cityLength = ptr + Long.numberOfTrailingZeros(mask);
+            final long tempAddr = cityAddr + cityLength + 1;
+
+            // compute hash table index
+            int index;
+            if (cityLength > 1)
+                index = (UNSAFE.getByte(cityAddr) // mix the first,
+                        ^ (UNSAFE.getByte(cityAddr + 2) << 4) // the third (even if it is the delimiter ';')
+                        ^ (UNSAFE.getByte(tempAddr - 2) << 8) // and the last two bytes of each city's name
+                        ^ (UNSAFE.getByte(tempAddr - 3) << 12))
+                        & 0xFFFF;
+            else
+                index = (UNSAFE.getByte(cityAddr) << 8) & 0xFF00;
+
+            // resolve collisions with linear probing
+            // use vector api here also, but only if city name fits in one vector length, for faster default case
+            Record record = results[index];
+            if (cityLength <= VECTORBYTESIZE) {
+                ByteVector parsed = ByteVector.fromMemorySegment(SPECIES, VAS, cityAddr, BYTEORDER);
+                while (record != null) {
+                    if (cityLength == record.cityLength) {
+                        long sameMask = ByteVector.fromMemorySegment(SPECIES, VAS, record.cityAddr, BYTEORDER)
+                                .compare(VectorOperators.EQ, parsed).toLong();
+                        if (Long.numberOfTrailingZeros(~sameMask) >= cityLength)
+                            break;
+                    }
+                    record = results[++index];
+                }
+            }
+            else { // slower normal case for city names with length > VECTORBYTESIZE
+                while (record != null && (cityLength != record.cityLength || citiesDiffer(record.cityAddr, cityAddr, cityLength)))
+                    record = results[++index];
+            }
+
+            // add record for new keys
+            // TODO have to avoid memory allocations on hot path
+            if (record == null) {
+                results[index] = new Record(cityAddr, cityLength);
+                record = results[index];
+            }
+
+            // parse temp with lookup tables
+            int temp;
+            if (UNSAFE.getByte(tempAddr) == '-') {
+                temp = -lookupDecimal[UNSAFE.getShort(tempAddr + 1)] - lookupFraction[UNSAFE.getShort(tempAddr + 3)];
+                nextCityAddr = tempAddr + lookupDotNegative[UNSAFE.getShort(tempAddr + 3)];
+            }
+            else {
+                temp = lookupDecimal[UNSAFE.getShort(tempAddr)] + lookupFraction[UNSAFE.getShort(tempAddr + 2)];
+                nextCityAddr = tempAddr + lookupDotPositive[UNSAFE.getShort(tempAddr + 2)];
+            }
+            cityAddr = nextCityAddr;
+            ptr = 0;
+
+            // merge record
+            if (temp < record.min)
+                record.min = temp;
+            if (temp > record.max)
+                record.max = temp;
+            record.sum += temp;
+            record.count += 1;
+        }
+
+        // create strings from raw data
+        // TODO should avoid this copy
+        byte b[] = new byte[100];
+        for (int i = 0; i < MAXINDEX; i++) {
+            Record r = results[i];
+            if (r == null)
+                continue;
+            UNSAFE.copyMemory(null, r.cityAddr, b, Unsafe.ARRAY_BYTE_BASE_OFFSET, r.cityLength);
+            r.city = new String(b, 0, (int) r.cityLength, StandardCharsets.UTF_8);
+        }
+        return results;
+    }
+
+}

From 6c0949969a5df71df85397ce3338f0154e6727ac Mon Sep 17 00:00:00 2001
From: Roman Musin <995612+roman-r-m@users.noreply.github.com>
Date: Tue, 23 Jan 2024 19:19:07 +0000
Subject: [PATCH 116/268] Native image + a few smaller optimisations (#564)

* Inline parsing name and station to avoid constantly updating the offset field (-100ms)

* Remove Worker class, inline the logic into lambda

* Accumulate results in an int matrix instead of using result row (-50ms)

* Use native image
---
 calculate_average_roman-r-m.sh                |  13 +-
 prepare_roman-r-m.sh                          |   9 +
 .../onebrc/CalculateAverage_roman_r_m.java    | 247 ++++++++----------
 3 files changed, 135 insertions(+), 134 deletions(-)

diff --git a/calculate_average_roman-r-m.sh b/calculate_average_roman-r-m.sh
index b5d0b3d7a..acf9864ca 100755
--- a/calculate_average_roman-r-m.sh
+++ b/calculate_average_roman-r-m.sh
@@ -21,4 +21,15 @@ JAVA_OPTS="--enable-preview -XX:+UseTransparentHugePages"
 # see https://stackoverflow.com/questions/58087596/why-are-repeated-memory-allocations-observed-to-be-slower-using-epsilon-vs-g1
 JAVA_OPTS="$JAVA_OPTS -XX:+UnlockExperimentalVMOptions -XX:-EnableJVMCI -XX:+UseEpsilonGC -Xmx1G -Xms1G -XX:+AlwaysPreTouch"
 
-java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_roman_r_m
+if [ -f target/CalculateAverage_roman_r_m_image ]; then
+    echo "Picking up existing native image 'target/CalculateAverage_roman_r_m_image', delete the file to select JVM mode." 1>&2
+    target/CalculateAverage_roman_r_m_image
+else
+    JAVA_OPTS="--enable-preview -XX:+UnlockExperimentalVMOptions -XX:+TrustFinalNonStaticFields -dsa -XX:+UseNUMA"
+    if [[ ! "$(uname -s)" = "Darwin" ]]; then
+        # -XX:+UseTransparentHugePages errors on macOS (seen on the author's machine), so only add it elsewhere.
+        JAVA_OPTS="$JAVA_OPTS -XX:+UseTransparentHugePages"
+    fi
+    echo "Choosing to run the app in JVM mode as no native image was found, use prepare_roman-r-m.sh to generate." 1>&2
+    java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_roman_r_m
+fi
diff --git a/prepare_roman-r-m.sh b/prepare_roman-r-m.sh
index f83a3ff69..a0593b2cf 100755
--- a/prepare_roman-r-m.sh
+++ b/prepare_roman-r-m.sh
@@ -17,3 +17,12 @@
 
 source "$HOME/.sdkman/bin/sdkman-init.sh"
 sdk use java 21.0.1-graal 1>&2
+
+# ./mvnw clean verify removes target/ and will re-trigger native image creation.
+if [ ! -f target/CalculateAverage_roman_r_m_image ]; then
+
+    JAVA_OPTS="--enable-preview -dsa"
+    NATIVE_IMAGE_OPTS="--initialize-at-build-time=dev.morling.onebrc.CalculateAverage_roman_r_m --gc=epsilon -Ob -O3 -march=native --strict-image-heap $JAVA_OPTS"
+
+    native-image $NATIVE_IMAGE_OPTS -cp target/average-1.0.0-SNAPSHOT.jar -o target/CalculateAverage_roman_r_m_image dev.morling.onebrc.CalculateAverage_roman_r_m
+fi
\ No newline at end of file
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_roman_r_m.java b/src/main/java/dev/morling/onebrc/CalculateAverage_roman_r_m.java
index 1a43ae5ef..896616d02 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_roman_r_m.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_roman_r_m.java
@@ -64,119 +64,6 @@ static long nextNewline(long from, MemorySegment ms) {
         return start + Long.numberOfTrailingZeros(i) / 8;
     }
 
-    static class Worker {
-        private final MemorySegment ms;
-        private final long end;
-        private long offset;
-
-        public Worker(FileChannel channel, long start, long end) {
-            try {
-                this.ms = channel.map(FileChannel.MapMode.READ_ONLY, start, end - start, Arena.ofConfined());
-                this.offset = ms.address();
-                this.end = ms.address() + end - start;
-            }
-            catch (Exception e) {
-                throw new RuntimeException(e);
-            }
-        }
-
-        private void parseName(ByteString station) {
-            long start = offset;
-            long next = UNSAFE.getLong(offset);
-            long pattern = applyPattern(next, SEMICOLON_MASK);
-            int bytes;
-            if (pattern != 0) {
-                bytes = Long.numberOfTrailingZeros(pattern) / 8;
-                offset += bytes;
-                long h = Long.reverseBytes(next) >>> (8 * (8 - bytes));
-                station.hash = (int) (h ^ (h >>> 32));
-            }
-            else {
-                long h = next;
-                station.hash = (int) (h ^ (h >>> 32));
-                while (pattern == 0) {
-                    offset += 8;
-                    next = UNSAFE.getLong(offset);
-                    pattern = applyPattern(next, SEMICOLON_MASK);
-                }
-                bytes = Long.numberOfTrailingZeros(pattern) / 8;
-                offset += bytes;
-            }
-
-            int len = (int) (offset - start);
-            station.offset = start;
-            station.len = len;
-            station.tail = next & ((1L << (8 * bytes)) - 1);
-
-            offset++;
-        }
-
-        int parseNumberFast() {
-            long encodedVal = UNSAFE.getLong(offset);
-
-            int neg = 1 - Integer.bitCount((int) (encodedVal & 0x10));
-            encodedVal >>>= 8 * neg;
-
-            var len = applyPattern(encodedVal, DOT_MASK);
-            len = Long.numberOfTrailingZeros(len) / 8;
-
-            encodedVal ^= broadcast((byte) 0x30);
-
-            int intPart = (int) (encodedVal & ((1 << (8 * len)) - 1));
-            intPart <<= 8 * (2 - len);
-            intPart *= (100 * 256 + 10);
-            intPart = (intPart & 0x3FF80) >>> 8;
-
-            int frac = (int) ((encodedVal >>> (8 * (len + 1))) & 0xFF);
-
-            offset += neg + len + 3; // 1 for . + 1 for fractional part + 1 for new line char
-            int sign = 1 - 2 * neg;
-            int val = intPart + frac;
-            return sign * val;
-        }
-
-        int parseNumberSlow() {
-            int neg = 1 - Integer.bitCount(UNSAFE.getByte(offset) & 0x10);
-            offset += neg;
-
-            int val = UNSAFE.getByte(offset++) - '0';
-            byte b;
-            while ((b = UNSAFE.getByte(offset++)) != '.') {
-                val = val * 10 + (b - '0');
-            }
-            b = UNSAFE.getByte(offset);
-            val = val * 10 + (b - '0');
-            offset += 2;
-            val *= 1 - 2 * neg;
-            return val;
-        }
-
-        int parseNumber() {
-            if (end - offset >= 8) {
-                return parseNumberFast();
-            }
-            else {
-                return parseNumberSlow();
-            }
-        }
-
-        public TreeMap<String, ResultRow> run() {
-            var resultStore = new ResultStore();
-            var station = new ByteString(ms);
-
-            while (offset < end) {
-                parseName(station);
-                long val = parseNumber();
-                var a = resultStore.get(station);
-                a.min = Math.min(a.min, val);
-                a.max = Math.max(a.max, val);
-                a.sum += val;
-                a.count++;
-            }
-            return resultStore.toMap();
-        }
-    }
-
     public static void main(String[] args) throws Exception {
         Field f = Unsafe.class.getDeclaredField("theUnsafe");
         f.setAccessible(true);
@@ -200,12 +87,97 @@ public static void main(String[] args) throws Exception {
         var result = IntStream.range(0, numThreads)
                 .parallel()
                 .mapToObj(i -> {
-                    long start = i == 0 ? 0 : bounds[i - 1] + 1;
-                    long end = bounds[i];
-                    Worker worker = new Worker(channel, start, end);
-                    var res = worker.run();
-                    worker.ms.unload();
-                    return res;
+                    try {
+                        long segmentStart = i == 0 ? 0 : bounds[i - 1] + 1;
+                        long segmentEnd = bounds[i];
+                        var segment = channel.map(FileChannel.MapMode.READ_ONLY, segmentStart, segmentEnd - segmentStart, Arena.ofConfined());
+
+                        var resultStore = new ResultStore();
+                        var station = new ByteString(segment);
+                        long offset = segment.address();
+                        long end = offset + segment.byteSize();
+                        while (offset < end) {
+                            // parsing station name
+                            long start = offset;
+                            long next = UNSAFE.getLong(offset);
+                            long pattern = applyPattern(next, SEMICOLON_MASK);
+                            int bytes;
+                            if (pattern != 0) {
+                                bytes = Long.numberOfTrailingZeros(pattern) / 8;
+                                offset += bytes;
+                                long h = Long.reverseBytes(next) >>> (8 * (8 - bytes));
+                                station.hash = (int) (h ^ (h >>> 32));
+                            }
+                            else {
+                                long h = next;
+                                station.hash = (int) (h ^ (h >>> 32));
+                                while (pattern == 0) {
+                                    offset += 8;
+                                    next = UNSAFE.getLong(offset);
+                                    pattern = applyPattern(next, SEMICOLON_MASK);
+                                }
+                                bytes = Long.numberOfTrailingZeros(pattern) / 8;
+                                offset += bytes;
+                            }
+
+                            int len = (int) (offset - start);
+                            station.offset = start;
+                            station.len = len;
+                            station.tail = next & ((1L << (8 * bytes)) - 1);
+
+                            offset++;
+
+                            // parsing temperature
+                            // TODO next may contain temperature as well, maybe try using it if we know the full number is there
+                            // 8 - bytes >= 5 -> bytes <= 3
+                            long val;
+                            if (end - offset >= 8) {
+                                long encodedVal = UNSAFE.getLong(offset);
+
+                                int neg = 1 - Integer.bitCount((int) (encodedVal & 0x10));
+                                encodedVal >>>= 8 * neg;
+
+                                long numLen = applyPattern(encodedVal, DOT_MASK);
+                                numLen = Long.numberOfTrailingZeros(numLen) / 8;
+
+                                encodedVal ^= broadcast((byte) 0x30);
+
+                                int intPart = (int) (encodedVal & ((1 << (8 * numLen)) - 1));
+                                intPart <<= 8 * (2 - numLen);
+                                intPart *= (100 * 256 + 10);
+                                intPart = (intPart & 0x3FF80) >>> 8;
+
+                                int frac = (int) ((encodedVal >>> (8 * (numLen + 1))) & 0xFF);
+
+                                offset += neg + numLen + 3; // 1 for . + 1 for fractional part + 1 for new line char
+                                int sign = 1 - 2 * neg;
+                                val = sign * (intPart + frac);
+                            }
+                            else {
+                                int neg = 1 - Integer.bitCount(UNSAFE.getByte(offset) & 0x10);
+                                offset += neg;
+
+                                val = UNSAFE.getByte(offset++) - '0';
+                                byte b;
+                                while ((b = UNSAFE.getByte(offset++)) != '.') {
+                                    val = val * 10 + (b - '0');
+                                }
+                                b = UNSAFE.getByte(offset);
+                                val = val * 10 + (b - '0');
+                                offset += 2;
+                                val *= 1 - (2L * neg);
+                            }
+
+                            resultStore.update(station, (int) val);
+                        }
+
+                        segment.unload();
+
+                        return resultStore.toMap();
+                    }
+                    catch (Exception e) {
+                        throw new RuntimeException(e);
+                    }
                 }).reduce((m1, m2) -> {
                     m2.forEach((k, v) -> m1.merge(k, v, ResultRow::merge));
                     return m1;
@@ -275,10 +247,17 @@ public String toString() {
     }
 
     private static final class ResultRow {
-        long min = 1000;
-        long sum = 0;
-        long max = -1000;
-        int count = 0;
+        long min;
+        long sum;
+        long max;
+        int count;
+
+        public ResultRow(long[] values) { // layout: {min, max, sum, count}
+            min = values[0];
+            max = values[1];
+            sum = values[2];
+            count = (int) values[3];
+        }
 
         public String toString() {
             return round(min / 10.0) + "/" + round(sum / 10.0 / count) + "/" + round(max / 10.0);
@@ -300,9 +279,9 @@ public ResultRow merge(ResultRow other) {
     static class ResultStore {
         private static final int SIZE = 16384;
         private final ByteString[] keys = new ByteString[SIZE];
-        private final ResultRow[] values = new ResultRow[SIZE];
+        private final long[][] values = new long[SIZE][]; // long: a per-segment sum can exceed the int range
 
-        ResultRow get(ByteString s) {
+        void update(ByteString s, int value) {
             int h = s.hashCode();
             int idx = (SIZE - 1) & h;
 
@@ -311,18 +290,20 @@ ResultRow get(ByteString s) {
                 i++;
                 idx = (idx + i * i) % SIZE;
             }
-            ResultRow result;
             if (keys[idx] == null) {
                 keys[idx] = s.copy();
-                result = new ResultRow();
-                values[idx] = result;
+                values[idx] = new long[4];
+                values[idx][0] = value;
+                values[idx][1] = value;
+                values[idx][2] = value;
+                values[idx][3] = 1;
             }
             else {
-                result = values[idx];
-                // TODO see it it makes any difference
-                // keys[idx].offset = s.offset;
+                values[idx][0] = Math.min(values[idx][0], value);
+                values[idx][1] = Math.max(values[idx][1], value);
+                values[idx][2] += value;
+                values[idx][3] += 1;
             }
-            return result;
         }
 
         TreeMap<String, ResultRow> toMap() {
@@ -330,7 +311,7 @@ TreeMap<String, ResultRow> toMap() {
             var result = new TreeMap<String, ResultRow>();
             for (int i = 0; i < SIZE; i++) {
                 if (keys[i] != null) {
-                    result.put(keys[i].asString(buf), values[i]);
+                    result.put(keys[i].asString(buf), new ResultRow(values[i]));
                 }
             }
             return result;

From 292edc629f8092112ec75f2950e80e289a6dd745 Mon Sep 17 00:00:00 2001
From: karthikeyan97 <skarthikeyan046@gmail.com>
Date: Wed, 24 Jan 2024 00:51:52 +0530
Subject: [PATCH 117/268] fine tuning performance further (#526)

* final commit

changing using mappedbytebuffer

changes before using unsafe address

using unsafe

* using graalvm,correct unsafe mem implementation

---------

Co-authored-by: Karthikeyans <karthikeyan.sn@zohocorp.com>
---
 calculate_average_karthikeyan97.sh            |   8 +-
 .../CalculateAverage_karthikeyan97.java       | 101 ++++++++----------
 2 files changed, 50 insertions(+), 59 deletions(-)

diff --git a/calculate_average_karthikeyan97.sh b/calculate_average_karthikeyan97.sh
index bbad1c4d0..cca36e97a 100755
--- a/calculate_average_karthikeyan97.sh
+++ b/calculate_average_karthikeyan97.sh
@@ -15,15 +15,15 @@
 #  limitations under the License.
 #
 
-JAVA_OPTS="-Xms20480m -Xmx40960m "
+JAVA_OPTS="-Xms10240m -Xmx40960m "
 
 if [ -f target/CalculateAverage_karthikeyan97_image ]; then
     #echo "Picking up existing native image 'target/CalculateAverage_karthikeyan97_image', delete the file to select JVM mode." 1>&2
-    target/CalculateAverage_karthikeyan97_image -Xms20480m -Xmx32768m
+    target/CalculateAverage_karthikeyan97_image -Xms10240m -Xmx40960m
 else
-    JAVA_OPTS="--enable-preview"
     #echo "Chosing to run the app in JVM mode as no native image was found, use prepare_karthikeyan97.sh to generate." 1>&2
-    java -Xms20480m -Xmx32768m --enable-preview --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_karthikeyan97
+    java -Xms10240m -Xmx40960m  --enable-preview --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_karthikeyan97
+
 fi
 
 
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_karthikeyan97.java b/src/main/java/dev/morling/onebrc/CalculateAverage_karthikeyan97.java
index 7014b120b..de151c1a5 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_karthikeyan97.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_karthikeyan97.java
@@ -125,7 +125,7 @@ public static void main(String[] args) throws Exception {
         final long mappedAddress = fileChannel.map(FileChannel.MapMode.READ_ONLY, 0, raf.length(), Arena.global()).address();
         long length = raf.length();
         final long endAddress = mappedAddress + length - 1;
-        int cores = length > 1000 ? Runtime.getRuntime().availableProcessors() * 2 : 1;
+        int cores = length > 1000 ? Runtime.getRuntime().availableProcessors() : 1;
         long boundary[][] = new long[cores][2];
         long segments = length / (cores);
         long before = -1;
@@ -145,25 +145,21 @@ public static void main(String[] args) throws Exception {
         boundary[cores - 1][0] = before + 1;
         boundary[cores - 1][1] = length - 1;
 
-        Field f = Unsafe.class.getDeclaredField("theUnsafe");
-        f.setAccessible(true);
-        Unsafe unsafe = (Unsafe) f.get(null);
-
-        int l3Size = (13 * 1024 * 1024);// unsafe.l3Size();
+        int l3Size = (12 * 1024 * 1024);// unsafe.l3Size();
 
         System.out.println(new TreeMap((Arrays.stream(boundary).parallel().map(i -> {
-            FileInputStream fileInputStream = null;
             try {
                 int seglen = (int) (i[1] - i[0] + 1);
-                HashMap<modifiedbytearray, MeasurementAggregator> resultmap = new HashMap<>(1000);
+                HashMap<modifiedbytearray, MeasurementAggregator> resultmap = new HashMap<>(4000);
                 long segstart = mappedAddress + i[0];
                 int bytesRemaining = seglen;
                 long num = 0;
-                int sign = 1;
                 boolean isNumber = false;
                 byte bi;
+                int sign = 1;
                 modifiedbytearray stationName = null;
                 int hascode = 5381;
+                // System.out.println("start:" + System.nanoTime() / 1000000);
                 while (bytesRemaining > 0) {
                     int bytesptr = 0;
                     // int bytesread = buffer.remaining() > l3Size ? l3Size : buffer.remaining();
@@ -178,64 +174,59 @@ public static void main(String[] args) throws Exception {
                     while (bytesptr < actualReadSize) {
                         bi = readArr[bytesptr++];// UNSAFE.getByte(segstart + bytesReading++);
                         if (!isNumber) {
-                            if (bi >= 192) {
-                                hascode = (hascode << 5) + hascode ^ bi;
-                            }
-                            else if (bi == 59) {
-                                isNumber = true;
-                                stationName = new modifiedbytearray(readArr, bbstart, bytesptr - 2, hascode & 0xFFFFFFFF);
-                                bbstart = 0;
-                                hascode = 5381;
-                                if (bytesptr >= readSize) {
-                                    break;
-                                }
-                            }
-                            else {
+                            while (bi != 59) {
                                 hascode = (hascode << 5) + hascode ^ bi;
+                                bi = readArr[bytesptr++];
                             }
+                            isNumber = true;
+                            stationName = new modifiedbytearray(readArr, bbstart, bytesptr - 2, hascode & 0xFFFFFFFF);
+                            bbstart = 0;
+                            hascode = 5381;
                         }
                         else {
-                            switch (bi) {
-                                case 0x2E:
-                                    break;
-                                case 0x2D:
+                            while (bi != 10) {
+                                if (bi == 0x2D) {
                                     sign = -1;
-                                    break;
-                                case 10:
-                                    hascode = 5381;
-                                    isNumber = false;
-                                    bbstart = bytesptr;
-                                    MeasurementAggregator agg = resultmap.get(stationName);
-                                    num *= sign;
-                                    if (agg == null) {
-                                        agg = new MeasurementAggregator();
-                                        agg.min = num;
-                                        agg.max = num;
-                                        agg.sum = (long) (num);
-                                        agg.count = 1;
-                                        resultmap.put(stationName, agg);
-                                    }
-                                    else {
-                                        if (agg.min >= num) {
-                                            agg.min = num;
-                                        }
-                                        if (agg.max <= num) {
-                                            agg.max = num;
-                                        }
-                                        agg.sum += (long) (num);
-                                        agg.count++;
-                                    }
-                                    num = 0;
-                                    sign = 1;
-                                    break;
-                                default:
+                                }
+                                else if (bi != 0x2E) {
                                     num = num * 10 + (bi - 0x30);
+                                }
+                                bi = readArr[bytesptr++];
+                            }
+                            hascode = 5381;
+                            isNumber = false;
+                            bbstart = bytesptr;
+                            num *= sign;
+                            MeasurementAggregator agg = resultmap.get(stationName);
+                            if (agg == null) {
+                                agg = new MeasurementAggregator();
+                                agg.min = num;
+                                agg.max = num;
+                                agg.sum = (long) (num);
+                                agg.count = 1;
+                                resultmap.put(stationName, agg);
+                            }
+                            else {
+                                if (agg.min >= num) {
+                                    agg.min = num;
+                                }
+                                if (agg.max <= num) {
+                                    agg.max = num;
+                                }
+                                agg.sum += (long) (num);
+                                agg.count++;
+                            }
+                            num = 0;
+                            sign = 1;
+                            if (bytesptr >= readSize) {
+                                break;
                             }
                         }
                     }
                     bytesRemaining -= bytesptr;
                     segstart += bytesptr;
                 }
+                // System.out.println("end:" + System.nanoTime() / 1000000);
                 /*
                  * while (bytesReading < (i[1] - i[0] + 1) && buffer.position() < buffer.limit()) {
                  * buffer.clear();

From 337642d1ec0a31c0eb4d410308942d9b583c7916 Mon Sep 17 00:00:00 2001
From: Mathias Bjerke <mathbje@gmail.com>
Date: Tue, 23 Jan 2024 20:28:58 +0100
Subject: [PATCH 118/268] 1brc contribution from mattiz (first attempt) (#567)

* Contribution from mattiz

* Formatted code
---
 calculate_average_mattiz.sh                   |  19 +
 .../onebrc/CalculateAverage_mattiz.java       | 324 ++++++++++++++++++
 2 files changed, 343 insertions(+)
 create mode 100755 calculate_average_mattiz.sh
 create mode 100644 src/main/java/dev/morling/onebrc/CalculateAverage_mattiz.java

diff --git a/calculate_average_mattiz.sh b/calculate_average_mattiz.sh
new file mode 100755
index 000000000..2432b7f4a
--- /dev/null
+++ b/calculate_average_mattiz.sh
@@ -0,0 +1,19 @@
+#!/bin/sh
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+JAVA_OPTS=""
+java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_mattiz
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_mattiz.java b/src/main/java/dev/morling/onebrc/CalculateAverage_mattiz.java
new file mode 100644
index 000000000..52c31ba39
--- /dev/null
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_mattiz.java
@@ -0,0 +1,324 @@
+/*
+ *  Copyright 2023 The original authors
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package dev.morling.onebrc;
+
+import java.io.*;
+import java.nio.ByteBuffer;
+import java.nio.MappedByteBuffer;
+import java.nio.channels.FileChannel;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.nio.file.StandardOpenOption;
+import java.util.*;
+import static java.nio.channels.FileChannel.MapMode.READ_ONLY;
+
+public class CalculateAverage_mattiz {
+    private static final int TWO_BYTE_TO_INT = 480 + 48; // 48 is the ASCII code for '0'
+    private static final int THREE_BYTE_TO_INT = 4800 + 480 + 48;
+    private static final String FILE = "./measurements.txt";
+    public static final int PARTS = 8;
+
+    public static void main(String[] args) throws Exception {
+        var result = new CalculateAverage_mattiz().calculate(FILE, PARTS);
+        System.out.println(result);
+    }
+
+    StationList calculate(String file, int numParts) throws Exception {
+        var buffers = createBuffers(Paths.get(file), numParts);
+
+        return buffers
+                .parallelStream()
+                .map(this::aggregate)
+                .reduce(StationList::merge)
+                .orElseThrow();
+    }
+
+    record BufferAndSize(ByteBuffer buffer, long size) {
+    }
+
+    List<ByteBuffer> createBuffers(Path file, int numParts) throws IOException {
+        FileChannel fileChannel = FileChannel.open(file, StandardOpenOption.READ);
+
+        var fileSize = fileChannel.size();
+
+        if (fileSize < (1024 * 1024)) { // Only one core for small files
+            numParts = 1;
+        }
+
+        var chunkSize = fileSize / numParts;
+        var buffers = new ArrayList<ByteBuffer>();
+        long filePointer = 0;
+
+        for (int i = 0; i < numParts; i++) {
+            if (i != numParts - 1) { // not last element
+                var adjustedChunkSize = getBuffer(fileChannel, filePointer, chunkSize, true);
+                buffers.add(adjustedChunkSize.buffer());
+                filePointer += adjustedChunkSize.size();
+            }
+            else {
+                var adjustedChunkSize = getBuffer(fileChannel, filePointer, fileSize - filePointer, false);
+                buffers.add(adjustedChunkSize.buffer());
+            }
+        }
+
+        return buffers;
+    }
+
+    BufferAndSize getBuffer(FileChannel fileChannel, long start, long size, boolean adjust) throws IOException {
+        MappedByteBuffer buffer = fileChannel.map(READ_ONLY, start, size);
+
+        var actualSize = ((int) size);
+
+        if (adjust) {
+            while (buffer.get(actualSize - 1) != '\n') {
+                actualSize--;
+            }
+        }
+
+        buffer.limit(actualSize);
+
+        return new BufferAndSize(buffer, actualSize);
+    }
+
+    private StationList aggregate(ByteBuffer buffer) {
+        var measurements = new StationList();
+
+        while (buffer.hasRemaining()) {
+            int startPos = buffer.position();
+
+            byte b;
+            int hash = 0;
+            while ((b = buffer.get()) != ';') {
+                hash = ((hash << 5) - hash) + b;
+            }
+
+            if (hash < 0) {
+                hash = -hash;
+            }
+
+            int length = buffer.position() - startPos - 1;
+            byte[] station = new byte[length];
+            buffer.get(startPos, station);
+
+            int value = readValue(buffer);
+
+            measurements.update(station, length, hash, value);
+        }
+
+        return measurements;
+    }
+
+    /*
+     * Read decimal number from ascii characters (copied from arjenw)
+     *
+     * Example:
+     * If you have the decimal number 1.4,
+     * then byte 1 contain 49 (ascii code for '1')
+     * and byte 3 contain 52 (ascii code for '4')
+     * Subtract 480 + 48 (48 is the ASCII code for '0')
+     * to move number from ascii number to int
+     *
+     * 49 * 10 + 52 - 528 = 14
+     */
+    private static int readValue(ByteBuffer buffer) {
+        int value;
+        byte b1 = buffer.get();
+        byte b2 = buffer.get();
+        byte b3 = buffer.get();
+        byte b4 = buffer.get();
+
+        if (b2 == '.') {// value is n.n
+            value = (b1 * 10 + b3 - TWO_BYTE_TO_INT);
+        }
+        else {
+            if (b4 == '.') { // value is -nn.n
+                value = -(b2 * 100 + b3 * 10 + buffer.get() - THREE_BYTE_TO_INT);
+            }
+            else if (b1 == '-') { // value is -n.n
+                value = -(b2 * 10 + b4 - TWO_BYTE_TO_INT);
+            }
+            else { // value is nn.n
+                value = (b1 * 100 + b2 * 10 + b4 - THREE_BYTE_TO_INT);
+            }
+            buffer.get(); // new line
+        }
+        return value;
+    }
+}
+
+class CustomMap {
+    private static final int SIZE = 1024 * 64; // power of two: (hash & (SIZE - 1)) picks the home slot
+    private final Station[] stationList = new Station[SIZE];
+
+    public void addOrUpdate(byte[] stationName, int length, int hash, int value) {
+        int slot = hash & (SIZE - 1);
+        var station = stationList[slot];
+        // Linear probing: skip an occupied slot unless BOTH its hash and its name bytes match.
+        while (station != null
+                && (station.getHash() != hash
+                        || !Arrays.equals(
+                                station.getName(), 0, station.getName().length,
+                                stationName, 0, length))) {
+
+            slot = (slot + 1) & (SIZE - 1);
+            station = stationList[slot];
+        }
+
+        if (station == null) {
+            stationList[slot] = new Station(stationName, hash);
+        }
+
+        stationList[slot].add(value);
+    }
+
+    public Station get(byte[] stationName) {
+        return stationList[findSlot(stationName)];
+    }
+
+    public void put(byte[] stationName, Station newStation) {
+        stationList[findSlot(stationName)] = newStation;
+    }
+
+    private int findSlot(byte[] stationName) {
+        int hash = getHash(stationName);
+        int slot = hash & (SIZE - 1);
+        var station = stationList[slot];
+        // Same probe sequence as addOrUpdate: stop at the exact match or at the first empty slot.
+        while (station != null
+                && (station.getHash() != hash
+                        || !Arrays.equals(station.getName(), stationName))) {
+
+            slot = (slot + 1) & (SIZE - 1);
+            station = stationList[slot];
+        }
+
+        return slot;
+    }
+
+    private int getHash(byte[] key) {
+        int hash = 0;
+        // Must yield the same value as the hash handed to addOrUpdate (31-multiplier, negated if negative).
+        for (byte b : key) {
+            hash = hash * 31 + b;
+        }
+
+        if (hash < 0) {
+            hash = -hash;
+        }
+
+        return hash;
+    }
+
+    public Set<Map.Entry<byte[], Station>> entrySet() {
+        var entries = new HashMap<byte[], Station>();
+        // byte[] keys hash by identity, so this view is meant for iteration, not lookup by content.
+        for (var s : stationList) {
+            if (s != null) {
+                entries.put(s.getName(), s);
+            }
+        }
+
+        return entries.entrySet();
+    }
+
+    public Map<String, Station> sorted() {
+        var sorted = new TreeMap<String, Station>();
+
+        for (var s : stationList) {
+            if (s != null) {
+                sorted.put(new String(s.getName(), StandardCharsets.UTF_8), s);
+            }
+        }
+
+        return sorted;
+    }
+}
+
+class StationList {
+    private final CustomMap stations = new CustomMap();
+
+    public void update(byte[] stationName, int length, int hash, int value) {
+        stations.addOrUpdate(stationName, length, hash, value);
+    }
+
+    public StationList merge(StationList other) {
+        for (var aggregator : other.stations.entrySet()) {
+            var agg = stations.get(aggregator.getKey());
+
+            if (agg == null) {
+                stations.put(aggregator.getKey(), aggregator.getValue());
+            }
+            else {
+                agg.merge(aggregator.getValue());
+            }
+        }
+
+        return this;
+    }
+
+    @Override
+    public String toString() {
+        return stations.sorted().toString();
+    }
+}
+
+class Station {
+    private final byte[] name;
+    private final int hash;
+    private int min = Integer.MAX_VALUE; // temperatures are kept as integers in tenths of a degree
+    private int max = Integer.MIN_VALUE;
+    private long sum; // 64-bit: an int running total can wrap around on large inputs
+    private int count;
+
+    public Station(byte[] name, int hash) {
+        this.name = name;
+        this.hash = hash;
+    }
+
+    public void add(int max, int min, int sum, int count) { // fold in pre-aggregated values
+        this.max = Math.max(this.max, max);
+        this.min = Math.min(this.min, min);
+        this.sum += sum;
+        this.count += count;
+    }
+
+    public void add(int value) { // record a single measurement
+        this.max = Math.max(this.max, value);
+        this.min = Math.min(this.min, value);
+        this.sum += value;
+        this.count++;
+    }
+
+    public void merge(Station other) { // combine with the partial result of another chunk
+        this.max = Math.max(this.max, other.max);
+        this.min = Math.min(this.min, other.min);
+        this.sum += other.sum;
+        this.count += other.count;
+    }
+
+    public String toString() { // renders "min/mean/max", scaling tenths back to degrees
+        return (min / 10.0) + "/" + (Math.round(((double) sum) / count)) / 10.0 + "/" + (max / 10.0);
+    }
+
+    public byte[] getName() {
+        return name;
+    }
+
+    public int getHash() {
+        return hash;
+    }
+}
\ No newline at end of file

From a9a05599cd2e3150bc66b6c6171ac8d106ee3e17 Mon Sep 17 00:00:00 2001
From: 3j5a <105244096+3j5a@users.noreply.github.com>
Date: Tue, 23 Jan 2024 21:31:45 +0200
Subject: [PATCH 119/268] CalculateAverage_3j5a off-the-shelf Java components +
 ArraysSupport (#566)

* off-the-shelf Java components, curious about official runtime results. thnx

my laptop results are around 12 seconds, e.g:
87.66user 1.32system 0:12.11elapsed 734%CPU (0avgtext+0avgdata 13980924maxresident)k

Architecture:            x86_64
  CPU op-mode(s):        32-bit, 64-bit
  Address sizes:         39 bits physical, 48 bits virtual
  Byte Order:            Little Endian
CPU(s):                  8
  On-line CPU(s) list:   0-7
Vendor ID:               GenuineIntel
  Model name:            Intel(R) Core(TM) i5-8400H CPU @ 2.50GHz

* off-the-shelf Java components... curious about official runtime results. thnx

laptop results are around 11 seconds, e.g:
./calculate_average_3j5a.sh  81.46s user 1.36s system 758% cpu 10.917 total

Architecture:            x86_64
  CPU op-mode(s):        32-bit, 64-bit
  Address sizes:         39 bits physical, 48 bits virtual
  Byte Order:            Little Endian
CPU(s):                  8
  On-line CPU(s) list:   0-7
Vendor ID:               GenuineIntel
  Model name:            Intel(R) Core(TM) i5-8400H CPU @ 2.50GHz

* off-the-shelf Java components + ArraysSupport..

laptop results are around 10.2 seconds, e.g:
./calculate_average_3j5a.sh  75.02s user 1.31s system 750% cpu 10.175 total

Architecture:            x86_64
  CPU op-mode(s):        32-bit, 64-bit
  Address sizes:         39 bits physical, 48 bits virtual
  Byte Order:            Little Endian
CPU(s):                  8
  On-line CPU(s) list:   0-7
Vendor ID:               GenuineIntel
  Model name:            Intel(R) Core(TM) i5-8400H CPU @ 2.50GHz

* method handle...

* full buffer read attempt

* MH

* MH cleanup
---
 calculate_average_3j5a.sh                     |  19 ++
 prepare_3j5a.sh                               |  20 ++
 .../morling/onebrc/CalculateAverage_3j5a.java | 277 ++++++++++++++++++
 3 files changed, 316 insertions(+)
 create mode 100755 calculate_average_3j5a.sh
 create mode 100755 prepare_3j5a.sh
 create mode 100644 src/main/java/dev/morling/onebrc/CalculateAverage_3j5a.java

diff --git a/calculate_average_3j5a.sh b/calculate_average_3j5a.sh
new file mode 100755
index 000000000..b4a427732
--- /dev/null
+++ b/calculate_average_3j5a.sh
@@ -0,0 +1,19 @@
+#!/bin/sh
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+JAVA_OPTS="--add-opens=java.base/jdk.internal.util=ALL-UNNAMED"
+java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_3j5a
diff --git a/prepare_3j5a.sh b/prepare_3j5a.sh
new file mode 100755
index 000000000..06b81c4dd
--- /dev/null
+++ b/prepare_3j5a.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+# Uncomment below to use sdk
+source "$HOME/.sdkman/bin/sdkman-init.sh"
+sdk use java 21.0.1-graal 1>&2
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_3j5a.java b/src/main/java/dev/morling/onebrc/CalculateAverage_3j5a.java
new file mode 100644
index 000000000..178cfacee
--- /dev/null
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_3j5a.java
@@ -0,0 +1,277 @@
+/*
+ *  Copyright 2023 The original authors
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package dev.morling.onebrc;
+
+import java.io.IOException;
+import java.io.RandomAccessFile;
+import java.lang.invoke.MethodHandle;
+import java.nio.MappedByteBuffer;
+import java.nio.channels.FileChannel;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+
+import static java.lang.Class.forName;
+import static java.lang.System.out;
+import static java.lang.invoke.MethodHandles.lookup;
+import static java.util.Comparator.comparing;
+
+public class CalculateAverage_3j5a {
+
+    private static final String FILE = "./measurements.txt";
+
+    public static void main(String[] args) throws IOException {
+        try (RandomAccessFile measurementsFile = new RandomAccessFile(FILE, "r")) {
+            var slices = slice(measurementsFile);
+            var measurementsChannel = measurementsFile.getChannel();
+            slices.stream().parallel().map(slice -> {
+                MappedByteBuffer measurementsSlice = map(slice, measurementsChannel);
+                var measurementBuffer = new byte[rules.maxMeasurementLength];
+                var measurements = HashMap.<Station, StationMeasurementStatistics> newHashMap(rules.uniqueStationsCount);
+                while (measurementsSlice.hasRemaining()) {
+                    var a = nextStationMeasurement(measurementBuffer, measurementsSlice);
+                    var stats = measurements.get(a.station);
+                    if (stats == null) {
+                        a.station.detachFromMeasurementBuffer();
+                        stats = new StationMeasurementStatistics(a);
+                        measurements.put(a.station, stats);
+                    }
+                    else {
+                        stats.add(a);
+                    }
+                }
+                return measurements;
+            }).reduce((aslice, bslice) -> {
+                aslice.forEach((astation, astats) -> {
+                    var bstats = bslice.putIfAbsent(astation, astats);
+                    if (bstats != null) {
+                        bstats.merge(astats);
+                    }
+                });
+                return bslice;
+            }).ifPresent(measurements -> {
+                var results = new StringBuilder(measurements.size() * (rules.maxStationNameLength + rules.maxStationStatisticsOutputLength));
+                measurements.values().stream()
+                        .sorted(comparing(StationMeasurementStatistics::getName))
+                        .forEach(stationStats -> results.append(stationStats).append(", "));
+                out.println("{" + results.substring(0, results.length() - 2) + "}");
+            });
+        }
+    }
+
+    record Rules(int minMeasurementLength, int maxStationNameLength,
+                 int maxMeasurementLength, int maxStationStatisticsOutputLength,
+                 int uniqueStationsCount) {
+        Rules() {
+            this(5, 100, 106, 18, 10_000);
+        }
+    }
+
+    private static final Rules rules = new Rules();
+
+    record MeasurementsSlice(long start, long length) {
+    }
+
+    static class Station {
+
+        private byte[] name; // initially the shared read buffer; a private copy after detachFromMeasurementBuffer()
+        final int length;
+        private int hash; // cached by hashCode(); 0 means "not computed yet"
+
+        private static final MethodHandle vectorizedHashCode;
+        private static final int T_BYTE = 8; // element-type code passed to vectorizedHashCode (presumably "byte" - TODO confirm)
+
+        static {
+            try {
+                var arraysSupport = forName("jdk.internal.util.ArraysSupport");
+                Class<?>[] vectorizedHashCodeSignature = { Object.class, int.class, int.class, int.class, int.class };
+                var vectorizedHashCodeMethod = arraysSupport.getDeclaredMethod("vectorizedHashCode", vectorizedHashCodeSignature);
+                vectorizedHashCode = lookup().unreflect(vectorizedHashCodeMethod);
+            }
+            catch (NoSuchMethodException | IllegalAccessException | ClassNotFoundException e) {
+                throw new RuntimeException(e);
+            }
+        }
+
+        Station(byte[] name, int length) {
+            this.name = name;
+            this.length = length;
+        }
+
+        public void detachFromMeasurementBuffer() {
+            // Keep a private copy of the name so the shared read buffer can be reused for the next record.
+            this.name = Arrays.copyOf(name, length);
+        }
+
+        @Override
+        public boolean equals(Object that) {
+            // Each side is compared over its OWN length: a prefix is not a match and no range leaves its array.
+            return that instanceof Station other && Arrays.mismatch(name, 0, length, other.name, 0, other.length) < 0;
+        }
+
+        @Override
+        public int hashCode() {
+            if (hash == 0) {
+                try {
+                    hash = (int) vectorizedHashCode.invokeExact((Object) name, 0, length, 1, T_BYTE);
+                }
+                catch (Throwable e) {
+                    throw new RuntimeException(e);
+                }
+            }
+            return hash;
+        }
+
+    }
+
+    record StationMeasurement(Station station, int temperature) {
+    }
+
+    private static class StationMeasurementStatistics {
+
+        private final byte[] bname;
+        private String name;
+        private int min;
+        private int max;
+        private long sum;
+        private int count = 1;
+
+        StationMeasurementStatistics(StationMeasurement stationMeasurement) {
+            this.bname = stationMeasurement.station.name;
+            this.min = stationMeasurement.temperature;
+            this.max = stationMeasurement.temperature;
+            this.sum = stationMeasurement.temperature;
+        }
+
+        public String getName() {
+            if (name == null) {
+                name = new String(bname, StandardCharsets.UTF_8);
+            }
+            return name;
+        }
+
+        void add(StationMeasurement measurement) {
+            var temperature = measurement.temperature;
+            update(1, temperature, temperature, temperature);
+        }
+
+        void merge(StationMeasurementStatistics other) {
+            update(other.count, other.min, other.max, other.sum);
+        }
+
+        private void update(int count, int min, int max, long sum) {
+            this.count += count;
+            if (this.min > min) {
+                this.min = min;
+            }
+            if (this.max < max) {
+                this.max = max;
+            }
+            this.sum += sum;
+        }
+
+        @Override
+        public String toString() {
+            var name = getName();
+            var min = this.min / 10f;
+            var mean = Math.round(this.sum / (float) this.count) / 10f;
+            var max = this.max / 10f;
+            return new StringBuilder(name.length() + rules.maxStationStatisticsOutputLength)
+                    .append(name).append("=").append(min).append("/").append(mean).append("/").append(max)
+                    .toString();
+        }
+    }
+
+    private static StationMeasurement nextStationMeasurement(byte[] measurement, MappedByteBuffer memoryMappedSlice) {
+        // Copies one "name;temperature" record into the buffer, then parses the temperature (in tenths) right to left.
+        byte b;
+        int i = rules.minMeasurementLength;
+        memoryMappedSlice.get(measurement, 0, i); // assumes every record holds at least this many bytes before its line feed
+        while ((b = memoryMappedSlice.get()) != '\n') {
+            measurement[i++] = b;
+        }
+        var zeroOffset = '0';
+        int temperature = measurement[--i] - zeroOffset; // the single fractional digit
+        i--; // skipping dot
+        var base = 10;
+        while ((b = measurement[--i]) != ';') {
+            if (b == '-') {
+                temperature = -temperature;
+            }
+            else {
+                temperature = base * (b - zeroOffset) + temperature;
+                base *= 10; // each digit further left is worth ten times more (was squared: 10, 100, 10000)
+            }
+        }
+        return new StationMeasurement(new Station(measurement, i), temperature); // i is now the index of ';', i.e. the name length
+    }
+
+    private static MappedByteBuffer map(MeasurementsSlice slice, FileChannel measurements) {
+        try {
+            return measurements.map(FileChannel.MapMode.READ_ONLY, slice.start, slice.length);
+        }
+        catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    private static List<MeasurementsSlice> slice(RandomAccessFile measurements) throws IOException {
+        int chunks = Runtime.getRuntime().availableProcessors();
+        List<MeasurementsSlice> measurementSlices;
+        while ((measurementSlices = slice(measurements, chunks)) == null) {
+            chunks++;
+        }
+        return measurementSlices;
+    }
+
+    private static List<MeasurementsSlice> slice(RandomAccessFile measurements, int chunks) throws IOException {
+        long measurementsFileLength = measurements.length();
+        long chunkLength = 0;
+        long remainder;
+        if (chunks < measurementsFileLength) {
+            chunks--;
+            do { // add chunks until a nominal chunk plus the division remainder fits an int-sized mapping
+                chunkLength = measurementsFileLength / ++chunks;
+                remainder = measurementsFileLength % chunkLength;
+            } while (chunkLength + remainder > Integer.MAX_VALUE);
+        }
+        if (chunkLength <= rules.maxMeasurementLength) {
+            return List.of(new MeasurementsSlice(0, measurementsFileLength)); // too small to split safely
+        }
+        var measurementSlices = new ArrayList<MeasurementsSlice>(chunks);
+        var sliceStart = 0L;
+        for (int i = 0; i < chunks - 1; i++) {
+            var sliceLength = chunkLength;
+            if (sliceStart + sliceLength >= measurementsFileLength) {
+                break; // earlier slices grew past this boundary; the tail slice below takes what is left
+            }
+            measurements.seek(sliceStart + sliceLength);
+            do { // extend the slice through the next line feed so it ends on a record boundary
+                sliceLength++;
+            } while (measurements.readByte() != '\n');
+            if (sliceLength > Integer.MAX_VALUE) {
+                return null; // caller retries with a larger chunk count
+            }
+            measurementSlices.add(new MeasurementsSlice(sliceStart, sliceLength));
+            sliceStart += sliceLength;
+        }
+        measurementSlices.add(new MeasurementsSlice(sliceStart, measurementsFileLength - sliceStart)); // tail; valid even when the loop added nothing
+        return measurementSlices;
+    }
+
+}

From 7ced63f46007eb0c68da359e38b6c465f2446e2b Mon Sep 17 00:00:00 2001
From: Roy van Rijn <roy.van.rijn@gmail.com>
Date: Tue, 23 Jan 2024 11:37:09 -0800
Subject: [PATCH 120/268] Rewrote to always read 16 bytes, this has less
 instructions on perf. (#562)

It doesn't make a lot of sense since quite some code can be written shorter, but this is what gives the best numbers.
---
 .../onebrc/CalculateAverage_royvanrijn.java   | 297 +++++++-----------
 1 file changed, 116 insertions(+), 181 deletions(-)

diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_royvanrijn.java b/src/main/java/dev/morling/onebrc/CalculateAverage_royvanrijn.java
index 68004565c..1cd70e42c 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_royvanrijn.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_royvanrijn.java
@@ -62,7 +62,10 @@
  * Unrolling scan-loop:              1200 ms (seems to help, perhaps even more on target machine)
  * Adding more readable reader:      1300 ms (scores got worse on target machine anyway)
  *
- * I've ditched my M2 for an older x86-64 MacBook, this allows me to run `perf` and I'm trying to get lower numbers by trail and error.
+ * Using old x86 MacBook and perf:   3500 ms (different scoring)
+ * Decided to rewrite loop for 16 b: 3050 ms
+ *
+ * I have some instructions that could be removed, but faster with...
  *
  * Big thanks to Francesco Nigro, Thomas Wuerthinger, Quan Anh Mai and many others for ideas.
  *
@@ -80,7 +83,7 @@ public class CalculateAverage_royvanrijn {
 
     /**
      * Flyweight entry in a byte[], max 128 bytes.
-     *
+     * <p>
      * long: sum
      * int:  min
      * int:  max
@@ -122,10 +125,12 @@ private static void spawnWorker() throws IOException {
     }
 
     public static void main(String[] args) throws Exception {
+
         if (args.length == 0 || !("--worker".equals(args[0]))) {
             spawnWorker();
             return;
         }
+
         // Calculate input segments.
         final FileChannel fileChannel = FileChannel.open(Path.of(FILE), StandardOpenOption.READ);
         final long fileSize = fileChannel.size();
@@ -184,13 +189,15 @@ public static void main(String[] args) throws Exception {
         System.out.close(); // close the stream to stop
     }
 
-    private static byte[] fillEntry(final byte[] entry, final long fromAddress, final int length, final int temp) {
+    private static byte[] fillEntry(final byte[] entry, final long fromAddress, final int entryLength, final int temp, final long readBuffer1, final long readBuffer2) {
         UNSAFE.putLong(entry, ENTRY_SUM, temp);
         UNSAFE.putInt(entry, ENTRY_MIN, temp);
         UNSAFE.putInt(entry, ENTRY_MAX, temp);
         UNSAFE.putInt(entry, ENTRY_COUNT, 1);
-        UNSAFE.putByte(entry, ENTRY_LENGTH, (byte) length);
-        UNSAFE.copyMemory(null, fromAddress, entry, ENTRY_NAME, length);
+        UNSAFE.putByte(entry, ENTRY_LENGTH, (byte) entryLength);
+        UNSAFE.copyMemory(null, fromAddress, entry, ENTRY_NAME, entryLength - 16);
+        UNSAFE.putLong(entry, ENTRY_NAME + entryLength - 16, readBuffer1);
+        UNSAFE.putLong(entry, ENTRY_NAME + entryLength - 8, readBuffer2);
         return entry;
     }
 
@@ -219,16 +226,16 @@ public static byte[] mergeEntry(final byte[] entry, final byte[] merge) {
         int count = UNSAFE.getInt(merge, ENTRY_COUNT);
 
         sum += UNSAFE.getLong(entry, ENTRY_SUM);
-        int entryMin = UNSAFE.getInt(entry, ENTRY_MIN);
-        int entryMax = UNSAFE.getInt(entry, ENTRY_MAX);
         count += UNSAFE.getInt(entry, ENTRY_COUNT);
 
+        int entryMin = UNSAFE.getInt(entry, ENTRY_MIN);
+        int entryMax = UNSAFE.getInt(entry, ENTRY_MAX);
         entryMin = Math.min(entryMin, mergeMin);
         entryMax = Math.max(entryMax, mergeMax);
-
-        UNSAFE.putLong(entry, ENTRY_SUM, sum);
         UNSAFE.putInt(entry, ENTRY_MIN, entryMin);
         UNSAFE.putInt(entry, ENTRY_MAX, entryMax);
+
+        UNSAFE.putLong(entry, ENTRY_SUM, sum);
         UNSAFE.putInt(entry, ENTRY_COUNT, count);
         return entry;
     }
@@ -241,16 +248,16 @@ private static String entryToName(final byte[] entry) {
         UNSAFE.copyMemory(entry, ENTRY_NAME, name, Unsafe.ARRAY_BYTE_BASE_OFFSET, length);
 
         // Create a new String with the existing byte[]:
-        return new String(name, StandardCharsets.UTF_8);
+        return new String(name, StandardCharsets.UTF_8).trim();
     }
 
     private static String entryValuesToString(final byte[] entry) {
-        return round(UNSAFE.getInt(entry, ENTRY_MIN))
+        return (round(UNSAFE.getInt(entry, ENTRY_MIN))
                 + "/" +
                 round((1.0 * UNSAFE.getLong(entry, ENTRY_SUM)) /
                         UNSAFE.getInt(entry, ENTRY_COUNT))
                 + "/" +
-                round(UNSAFE.getInt(entry, ENTRY_MAX));
+                round(UNSAFE.getInt(entry, ENTRY_MAX)));
     }
 
     // Print a piece of memory:
@@ -280,13 +287,12 @@ private static double round(final double value) {
     private static final class Reader {
 
         private long ptr;
-        private long delimiterMask;
-        private long lastRead;
-        private long lastReadMinOne;
+        private long readBuffer1;
+        private long readBuffer2;
 
         private long hash;
         private long entryStart;
-        private long entryDelimiter;
+        private int entryLength; // in bytes rounded to nearest 16
 
         private final long endAddress;
 
@@ -309,6 +315,7 @@ private static final class Reader {
         private void processStart() {
             hash = 0;
             entryStart = ptr;
+            entryLength = 0;
         }
 
         private boolean hasNext() {
@@ -317,64 +324,77 @@ private boolean hasNext() {
 
         private static final long DELIMITER_MASK = 0x3B3B3B3B3B3B3B3BL;
 
-        private boolean readFirst() {
-            lastRead = UNSAFE.getLong(ptr);
+        private boolean readNext() {
 
-            final long match = lastRead ^ DELIMITER_MASK;
-            delimiterMask = (match - 0x0101010101010101L) & (~match & 0x8080808080808080L);
+            readBuffer1 = UNSAFE.getLong(ptr);
+            readBuffer2 = UNSAFE.getLong(ptr + 8);
 
-            return delimiterMask == 0;
-        }
+            entryLength += 16;
 
-        private boolean readNext() {
-            lastReadMinOne = lastRead;
-            return readFirst();
-        }
+            // Find delimiter and create mask for long1
+            long comparisonResult1 = (readBuffer1 ^ DELIMITER_MASK);
+            long highBitMask1 = (comparisonResult1 - 0x0101010101010101L) & (~comparisonResult1 & 0x8080808080808080L);
 
-        private void processName() {
-            hash ^= lastRead;
-            ptr += 8;
-        }
+            boolean noContent1 = highBitMask1 == 0;
+            long mask1 = noContent1 ? 0 : ~((highBitMask1 >>> 7) - 1);
+            int position1 = noContent1 ? -1 : Long.numberOfTrailingZeros(highBitMask1) >> 3;
 
-        private int processEndAndGetTemperature() {
-            processFinalBytes();
+            readBuffer1 &= ~mask1;
+            hash ^= readBuffer1;
 
-            finalizeHash();
-            finalizeDelimiter();
+            if (position1 != -1) {
+                hash ^= hash >> 32;
+                readBuffer2 = 0;
+                ptr += position1 + 1;
+                return false;
+            }
 
-            return readTemperature();
+            // Repeat for long2
+            long comparisonResult2 = (readBuffer2 ^ DELIMITER_MASK);
+            long highBitMask2 = (comparisonResult2 - 0x0101010101010101L) & (~comparisonResult2 & 0x8080808080808080L);
+            boolean noContent2 = highBitMask2 == 0;
+            long mask2 = noContent2 ? -1 : ((highBitMask2 >>> 7) - 1);
+            int position2 = noContent2 ? -1 : Long.numberOfTrailingZeros(highBitMask2) >> 3;
+
+            mask2 = ~mask2; // also not necessary, but faster with?
+            // Apply masks
+            readBuffer2 &= ~mask2;
+            hash ^= readBuffer2;
+
+            int delimiter = position2 == -1 ? -1 : position2 + 8; // not nnecessary, but faster?
+
+            hash ^= hash >> 32;
+
+            if (delimiter == -1) {
+                ptr += 16;
+                return true;
+            }
+            ptr += delimiter + 1;
+            return false;
         }
 
-        private void processFinalBytes() {
-            // Shift and read the last bytes:
-            lastRead &= ((delimiterMask >>> 7) - 1);
+        private int processEndAndGetTemperature() {
+            finalizeHash();
+            return readTemperature();
         }
 
         private void finalizeHash() {
-            // Finalize hash:
-            hash ^= lastRead;
-            hash ^= hash >> 32;
             hash ^= hash >> 17; // extra entropy
         }
 
-        private void finalizeDelimiter() {
-            // Found delimiter:
-            entryDelimiter = ptr + (Long.numberOfTrailingZeros(delimiterMask) >> 3);
-        }
-
         private static final long DOT_BITS = 0x10101000;
         private static final long MAGIC_MULTIPLIER = (100 * 0x1000000 + 10 * 0x10000 + 1);
 
         // Awesome idea of merykitty:
         private int readTemperature() {
             // This is the number part: X.X, -X.X, XX.x or -XX.X
-            long numberBytes = UNSAFE.getLong(entryDelimiter + 1);
+            long numberBytes = UNSAFE.getLong(ptr);
             long invNumberBytes = ~numberBytes;
 
             int dotPosition = Long.numberOfTrailingZeros(invNumberBytes & DOT_BITS);
 
             // Update the pointer here, bit awkward, but we have all the data
-            ptr = entryDelimiter + (dotPosition >> 3) + 4;
+            ptr += (dotPosition >> 3) + 3;
 
             int min28 = (28 - dotPosition);
             // Calculates the sign
@@ -388,57 +408,32 @@ private int readTemperature() {
             return (int) ((absValue + signed) ^ signed); // non-patented method of doing the same trick
         }
 
-        private boolean matchesEntryFull(final byte[] entry) {
-            int longs = (int) (entryDelimiter - entryStart) >> 3;
+        private boolean matches(final byte[] entry) {
             int step = 0;
-            for (int i = 0; i < longs - 2; i++) {
-                if (UNSAFE.getLong(entryStart + step) != UNSAFE.getLong(entry, ENTRY_NAME + step)) {
+            for (; step < entryLength - 16;) {
+                if (compare(null, entryStart + step, entry, ENTRY_NAME + step)) {
+                    return false;
+                }
+                step += 8;
+                if (compare(null, entryStart + step, entry, ENTRY_NAME + step)) {
                     return false;
                 }
                 step += 8;
             }
-            if (lastReadMinOne != UNSAFE.getLong(entry, (ENTRY_NAME_8) + step)) {
-                return false;
-            }
-            if (lastRead != UNSAFE.getLong(entry, (ENTRY_NAME_16) + step)) {
-                return false;
-            }
-            return true;
-
-        }
-
-        private boolean matchesEntryMedium(final byte[] entry) {
-            if (UNSAFE.getLong(entryStart) != UNSAFE.getLong(entry, ENTRY_NAME)) {
-                return false;
-            }
-            if (lastReadMinOne != UNSAFE.getLong(entry, ENTRY_NAME_8)) {
-                return false;
-            }
-            if (lastRead != UNSAFE.getLong(entry, ENTRY_NAME_16)) {
-                return false;
-            }
-            return true;
-        }
-
-        private boolean matchesEntryShort(final byte[] entry) {
-            if (lastReadMinOne != UNSAFE.getLong(entry, ENTRY_NAME)) {
+            if (compare(readBuffer1, entry, ENTRY_NAME + step)) {
                 return false;
             }
-            if (lastRead != UNSAFE.getLong(entry, ENTRY_NAME_8)) {
+            step += 8;
+            if (compare(readBuffer2, entry, ENTRY_NAME + step)) {
                 return false;
             }
             return true;
         }
 
-        private boolean matchesEnding(final byte[] entry) {
-            return lastRead == UNSAFE.getLong(entry, ENTRY_NAME);
-        }
-
-        private int length() {
-            return (int) (entryDelimiter - entryStart);
-
+        private boolean matches16(final byte[] entry) {
+            return !compare(readBuffer1, entry, ENTRY_NAME) &&
+                    !compare(readBuffer2, entry, ENTRY_NAME + 8);
         }
-
     }
 
     private static byte[][] processMemoryArea(final long startAddress, final long endAddress, boolean isFileStart) {
@@ -456,8 +451,8 @@ private static byte[][] processMemoryArea(final long startAddress, final long en
 
             reader.processStart();
 
-            if (!reader.readFirst()) {
-                // Found delimiter in first 8 bytes:
+            if (!reader.readNext()) {
+                // First 16 bytes:
 
                 int temperature = reader.processEndAndGetTemperature();
 
@@ -466,13 +461,12 @@ private static byte[][] processMemoryArea(final long startAddress, final long en
                 while (true) {
                     entry = table[index];
                     if (entry == null) {
-                        int length = reader.length();
                         byte[] entryBytes = (entryCount < PREMADE_ENTRIES) ? preConstructedEntries[entryCount++]
-                                : new byte[ENTRY_BASESIZE_WHITESPACE + length];
-                        table[index] = fillEntry(entryBytes, reader.entryStart, length, temperature);
+                                : new byte[ENTRY_BASESIZE_WHITESPACE + 16]; // with enough room
+                        table[index] = fillEntry(entryBytes, reader.entryStart, 16, temperature, reader.readBuffer1, reader.readBuffer2);
                         break;
                     }
-                    else if (reader.matchesEnding(entry)) {
+                    else if (reader.matches16(entry)) {
                         updateEntry(entry, temperature);
                         break;
                     }
@@ -481,104 +475,45 @@ else if (reader.matchesEnding(entry)) {
                         index = (index + 1) & TABLE_MASK;
                     }
                 }
+                continue;
             }
-            else {
-                reader.processName();
-
-                if (!reader.readNext()) {
-                    // Found delimiter in 8-16 bytes:
-
-                    int temperature = reader.processEndAndGetTemperature();
-
-                    // Find or insert the entry:
-                    int index = (int) (reader.hash & TABLE_MASK);
-                    while (true) {
-                        entry = table[index];
-                        if (entry == null) {
-                            int length = reader.length();
-                            byte[] entryBytes = (entryCount < PREMADE_ENTRIES) ? preConstructedEntries[entryCount++]
-                                    : new byte[ENTRY_BASESIZE_WHITESPACE + length];
-                            table[index] = fillEntry(entryBytes, reader.entryStart, length, temperature);
-                            break;
-                        }
-                        else if (reader.matchesEntryShort(entry)) {
-                            updateEntry(entry, temperature);
-                            break;
-                        }
-                        else {
-                            // Move to the next index
-                            index = (index + 1) & TABLE_MASK;
-                        }
-                    }
+            while (reader.readNext())
+                ;
+
+            int temperature = reader.processEndAndGetTemperature();
+
+            // Find or insert the entry:
+            int index = (int) (reader.hash & TABLE_MASK);
+            while (true) {
+                entry = table[index];
+                if (entry == null) {
+                    int length = reader.entryLength;
+                    byte[] entryBytes = (length < PREMADE_MAX_SIZE && entryCount < PREMADE_ENTRIES) ? preConstructedEntries[entryCount++]
+                            : new byte[ENTRY_BASESIZE_WHITESPACE + length]; // with enough room
+                    table[index] = fillEntry(entryBytes, reader.entryStart, length, temperature, reader.readBuffer1, reader.readBuffer2);
+                    break;
+                }
+                else if (reader.matches(entry)) {
+                    updateEntry(entry, temperature);
+                    break;
                 }
                 else {
-                    reader.processName();
-
-                    if (!reader.readNext()) {
-                        // Found delimiter in 16-24 bytes:
-
-                        int temperature = reader.processEndAndGetTemperature();
-
-                        // Find or insert the entry:
-                        int index = (int) (reader.hash & TABLE_MASK);
-                        while (true) {
-                            entry = table[index];
-                            if (entry == null) {
-                                int length = reader.length();
-                                byte[] entryBytes = (entryCount < PREMADE_ENTRIES) ? preConstructedEntries[entryCount++]
-                                        : new byte[ENTRY_BASESIZE_WHITESPACE + length];
-                                table[index] = fillEntry(entryBytes, reader.entryStart, length, temperature);
-                                break;
-                            }
-                            else if (reader.matchesEntryMedium(entry)) {
-                                updateEntry(entry, temperature);
-                                break;
-                            }
-                            else {
-                                // Move to the next index
-                                index = (index + 1) & TABLE_MASK;
-                            }
-                        }
-
-                    }
-                    else {
-                        // Need more than 24 bytes:
-
-                        reader.processName();
-                        while (reader.readNext()) {
-                            reader.processName();
-                        }
-
-                        int temperature = reader.processEndAndGetTemperature();
-
-                        // Find or insert the entry:
-                        int index = (int) (reader.hash & TABLE_MASK);
-                        while (true) {
-                            entry = table[index];
-                            if (entry == null) {
-                                int length = reader.length();
-                                byte[] entryBytes = (length < PREMADE_MAX_SIZE && entryCount < PREMADE_ENTRIES) ? preConstructedEntries[entryCount++]
-                                        : new byte[ENTRY_BASESIZE_WHITESPACE + length]; // with enough room
-                                table[index] = fillEntry(entryBytes, reader.entryStart, length, temperature);
-                                break;
-                            }
-                            else if (reader.matchesEntryFull(entry)) {
-                                updateEntry(entry, temperature);
-                                break;
-                            }
-                            else {
-                                // Move to the next index
-                                index = (index + 1) & TABLE_MASK;
-                            }
-                        }
-                    }
+                    // Move to the next index
+                    index = (index + 1) & TABLE_MASK;
                 }
             }
-
         }
         return table;
     }
 
+    private static boolean compare(final Object object1, final long address1, final Object object2, final long address2) {
+        return UNSAFE.getLong(object1, address1) != UNSAFE.getLong(object2, address2);
+    }
+
+    private static boolean compare(final long value1, final Object object2, final long address2) {
+        return value1 != UNSAFE.getLong(object2, address2);
+    }
+
     /*
      * `___` ___ ___ _ ___` ` ___ ` _ ` _ ` _` ___
      * / ` \| _ \ __| \| \ \ / /_\ | | | | | | __|

From 464ba6209bb229f45e43693801ec5814d8815760 Mon Sep 17 00:00:00 2001
From: Laake Scates-Gervasi <justplainlaake@users.noreply.github.com>
Date: Tue, 23 Jan 2024 11:44:57 -0800
Subject: [PATCH 121/268] Laake Scates-Gervasi first submission
 (justplainlaake) [2.5s execution, locally similar time to top 5] (#431)

* Init Push

* organize imports

* Add OpenMap

* Best outcome yet

* Create prepare script and calculate script for native image, also add comments on calculation

* Remove extra hashing, and need for the set array

* Commit formatting changes from build

* Remove unneeded device information

* Make shell scripts executable, add hash collision double check for equality

* Add hash collision double check for equality

* Skip multithreading for small files to improve small file performance
---
 calculate_average_justplainlaake.sh           |  23 +
 prepare_justplainlaake.sh                     |  30 ++
 .../CalculateAverage_justplainlaake.java      | 459 ++++++++++++++++++
 3 files changed, 512 insertions(+)
 create mode 100755 calculate_average_justplainlaake.sh
 create mode 100755 prepare_justplainlaake.sh
 create mode 100644 src/main/java/dev/morling/onebrc/CalculateAverage_justplainlaake.java

diff --git a/calculate_average_justplainlaake.sh b/calculate_average_justplainlaake.sh
new file mode 100755
index 000000000..2c0341f54
--- /dev/null
+++ b/calculate_average_justplainlaake.sh
@@ -0,0 +1,23 @@
+#!/bin/sh
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+if [ -f target/CalculateAverage_justplainlaake_image ]; then #if there is a native image, then lets run it. Else fallback to standard java execution
+    target/CalculateAverage_justplainlaake_image
+else
+    java -XX:+UseG1GC --enable-preview -XX:+UnlockExperimentalVMOptions -XX:+TrustFinalNonStaticFields -dsa -XX:+UseNUMA --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_justplainlaake
+fi
+
diff --git a/prepare_justplainlaake.sh b/prepare_justplainlaake.sh
new file mode 100755
index 000000000..bc7c6dce9
--- /dev/null
+++ b/prepare_justplainlaake.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+source "$HOME/.sdkman/bin/sdkman-init.sh"
+sdk use java 21.0.1-graal 1>&2
+
+if [ ! -f target/CalculateAverage_justplainlaake_image ]; then
+    #disable assertions
+    #optimize code for best performance
+    #native march gives best performance for the machine the image is built on
+    #strict image heap allows all classes to be used at build time
+    #native image info prints the trace of the build
+    #enable preview allows for preview features of current release
+    #epsilon garbage collector is a gc that doesn't gc... haha
+    native-image -dsa -O3 -march=native --strict-image-heap --native-image-info --enable-preview --gc=epsilon -cp target/average-1.0.0-SNAPSHOT.jar -o target/CalculateAverage_justplainlaake_image dev.morling.onebrc.CalculateAverage_justplainlaake
+fi
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_justplainlaake.java b/src/main/java/dev/morling/onebrc/CalculateAverage_justplainlaake.java
new file mode 100644
index 000000000..505ec3f53
--- /dev/null
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_justplainlaake.java
@@ -0,0 +1,459 @@
+/*
+ *  Copyright 2023 The original authors
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package dev.morling.onebrc;
+
+import java.io.IOException;
+import java.lang.foreign.Arena;
+import java.lang.reflect.Field;
+import java.nio.channels.FileChannel;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Path;
+import java.nio.file.StandardOpenOption;
+import java.text.DecimalFormat;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+
+import sun.misc.Unsafe;
+
+/*
+    Possibilities to improve:
+        * Reduce Standard Memory Reads and/or Swaps for threading 
+            - For the read file; using Unsafe or MemorySegment to map the file to an existing register instead of keeping the bytes local
+            - For normal variables; Most of the time reading a value performs a load from memory and registers it for faster lookups, but with multithreading causes each thread to re read and register each get [volatile] keyword
+        * Add multithreading to process multiple segments at once (When you have 1,000,000,000 cars driving might as well open as many lanes as possible)
+        * Improve Mapping of entries (More O(1) lookups the better, i.e. hashed key maps, preferably open maps to skip needing linked lists or trees, also simplifies since we don't need to delete anything)
+        * Remove use of java streams (They can be much slower than expected, good for developer readability but not for performance 90% of the time)
+        * Reduce amount of bytecode instructions (Usually just a micro-optimization, but since we are reading 1,000,000,000 lines, then this is really helpful in the processing code)
+        * Never use division in processing code, division is 2x+ slower than multiplication (Easy fix is multiplying by decimal 2/2 vs 2*0.5)
+
+    My System:
+        Device:
+            Processor(16)	11th Gen Intel(R) Core(TM) i7-11700K @ 3.60GHz   3.60 GHz
+            Installed RAM	32.0 GB (31.8 GB usable)
+            System type	64-bit operating system, x64-based processor
+            Pen and touch	No pen or touch input is available for this display
+        Windows Specification:
+            Edition	Windows 11 Home
+            Version	23H2
+            OS build	22635.3061
+            Experience	Windows Feature Experience Pack 1000.22684.1000.0
+
+
+    Runs (Only IDE open, just after complete shutdown, measured using System.nanoTime around main method):
+        - Baseline
+            * 144,403.3814ms
+        - merykittyunsafe (#1 on LB)
+            * 2,757.8295ms
+        - royvanrijn (#2 on LB)
+            * 1,643.9123ms ??? Assuming this is because of my system specs compared to specs on testing system
+        //Obviously there were more runs than this, but these were the significant jumps
+        - Me run 1 (Initial attempt;multithreading, file mapped to global Unsafe, long hash of name, read byte by byte, store in hashmap and merge from threads)
+            * 5,423.4432ms
+        - Me run 2 (Read longs instead of bytes to determine name hash)
+            * 3,937.3234ms
+        - Me run 3 (Swap to using a rolling long hash with murmur3 hashing function, change hashmap to be an openmap with unboxed long as the key)
+            * 2,951.6891ms
+        - Me run 4 (Change entire line reading to be long based with bit operations to determine number)
+            * 2,684.9823ms
+        - Me run 5 (Use main thread as one of the processing threads)
+            * 2,307.3038ms
+        - Me run 6 (Remove use of math.min and math.max in favor of ternary operator (Reduces getStatic operation))
+            * 2,265.3521ms
+ */
+
+public class CalculateAverage_justplainlaake {
+
+    // Constants
+    private static final String FILE = "./measurements.txt";
+    private static final byte SEPERATOR_BYTE = ';';
+    private static final byte NEW_LINE_BYTE = '\n';
+    private static final DecimalFormat STATION_FORMAT = new DecimalFormat("#,##0.0");
+
+    private static final long[] OFFSET_CLEARS = {
+            0x0000000000000000L, // 8 Offset (Clear whole thing)
+            0x00000000000000FFL,
+            0x000000000000FFFFL,
+            0x0000000000FFFFFFL,
+            0x00000000FFFFFFFFL,
+            0x000000FFFFFFFFFFL,
+            0x0000FFFFFFFFFFFFL,
+            0x00FFFFFFFFFFFFFFL,
+            0xFFFFFFFFFFFFFFFFL,// 0 Offset (Clear nothing)
+    };
+
+    private static final Unsafe UNSAFE;
+    static {
+        Unsafe _unsafe = null;
+        try {
+            Field unsafe = Unsafe.class.getDeclaredField("theUnsafe");
+            unsafe.setAccessible(true);
+            _unsafe = (Unsafe) unsafe.get(Unsafe.class);
+        }
+        catch (NoSuchFieldException | SecurityException | IllegalArgumentException | IllegalAccessException e) {
+            e.printStackTrace();
+            System.exit(1);
+        }
+        UNSAFE = _unsafe;// Just to get around "The blank final field UNSAFE may not have been initialized"
+    }
+
+    public static void main(String[] args) throws IOException {
+        int processors = Runtime.getRuntime().availableProcessors();
+
+        ExecutorService e = null;
+
+        List<Future<OpenMap>> futures = new ArrayList<>();
+        OpenMap mainMap = null;
+        try (FileChannel channel = FileChannel.open(Path.of(FILE), StandardOpenOption.READ)) {
+            long fileSize = channel.size();
+
+            if (fileSize < 10_000) {// File is smaller than 10,000 bytes, we will lose performance trying to multithread so just set processors to 1 which will skip the futures and only use main thread
+                processors = 1;
+            }
+            else {
+                e = Executors.newFixedThreadPool(processors);// Create a ThreadPool based executor using the count of processors available
+            }
+
+            long chunkSize = fileSize / processors;// Determine approximate size of each chunk based on amount of processors available
+
+            long startAddress = channel.map(FileChannel.MapMode.READ_ONLY, 0, fileSize, Arena.global())// Map the file channel into memory using the global arena (accessible by all threads)
+                    .address();// And get the starting address of mapped section
+
+            long endAddress = startAddress + fileSize;
+            long currentAddress = startAddress + chunkSize;
+            long chunkStart = startAddress;
+
+            for (int i = 0; i < processors; i++) {// We need to chunk the file for each processor/thread
+
+                while (currentAddress < endAddress) {// While loop to locate the next new line character from the chunk we are in
+                    long match = UNSAFE.getLong(currentAddress);// Read the next 8 bytes as a long from the memory address
+                    short offset = getMaskOffset(match, NEW_LINE_BYTE);// find the byte in the long which equals 10 aka '\n', if it is not found this returns -1
+                    if (offset != -1) {// We found the offset, so add it to the current adress and break the while loop
+                        currentAddress += offset;
+                        break;
+                    }
+                    currentAddress += 8;// No offset was found so advance 8 bytes, aka 1 long
+                }
+
+                long finalChunkStart = chunkStart, finalChunkEnd = Math.min(endAddress, currentAddress - 1);// Create final fields to pass to the thread call below,
+                // Also Math.min doesn't matter here since its called x times where x = count of processors
+
+                if (i == processors - 1) {// if on last processor use main thread to optimize threading, doing on last processor means the others are already processing while this runs
+                    mainMap = process(finalChunkStart, finalChunkEnd);
+                }
+                else {
+                    futures.add(e.submit(() -> process(finalChunkStart, finalChunkEnd)));
+                }
+                chunkStart = currentAddress + 1;// Advance the start of the next chunk to be the end of this chunk + 1 to move past the new line character
+                currentAddress = Math.min(currentAddress + chunkSize, endAddress);// Advance the next chunks end to be the end of the mapped file or the end of the approximated chunk
+            }
+        }
+
+        OpenMap merged = mainMap;// Set the main map created with the process called on main thread to make it effectively final
+
+        if (processors > 1) {// If there is only one processor then we only used the main thread so no point in merging the futures
+            // The merging of processing takes ~10ms
+            for (Future<OpenMap> f : futures) {
+                try {
+
+                    OpenMap processed = f.get();// Waits until the process task is done but then returns the callable value from the process method
+
+                    // Simple way to merge both lists, tried doing it more inline inside the map and ended up taking a 10ms longer
+                    processed.forEach((i, s) -> {
+                        merged.merge(i, s);
+                    });
+                }
+                catch (InterruptedException | ExecutionException e1) {
+                    e1.printStackTrace();
+                }
+            }
+            // Mark threadpool to be shutdown, call it here to let the threadpool finish out while the rest of the processing occurs
+            e.shutdown();
+        }
+
+        // Ordering and printing takes 50ms
+        Station[] nameOrdered = merged.toArray();// Turn the merged map into an array to quickly sort it
+
+        Arrays.sort(nameOrdered, (n1, n2) -> n1.name.compareTo(n2.name));// Sort based on name, this might be optimizable based on the longs of the name, but would likely only gain some ms??
+
+        // Print results to the sys out
+        System.out.print("{");
+        for (int i = 0; i < nameOrdered.length; i++) {
+            if (i != 0) {
+                System.out.print(", ");
+            }
+            System.out.print(nameOrdered[i]);
+        }
+        System.out.print("}\n");// Need newline character to meet specs
+    }
+
+    // Core processing functionality: parses every "name;temperature\n" row between fromAddress and toAddress and aggregates the readings per station
+    private static OpenMap process(long fromAddress, long toAddress) {
+
+        OpenMap stationsLookup = new OpenMap();// Create a new map for this specific chunk, this is also the returned value for the callable
+
+        long blockStart = fromAddress;// Address of the first byte of the row being parsed, i.e. where the station name begins
+        long currentAddress = fromAddress;// Read cursor inside the chunk
+
+        while (currentAddress < toAddress) {// Just keep looping until we exhaust the chunk
+
+            long read = 0l;
+            short offset = -1;
+            // The hash is a long hash based on the murmur3 algorithm. Look at the getMurmurHash3 method to find the link
+            long hash = 1;
+
+            while ((offset = getMaskOffset(read = UNSAFE.getLong(currentAddress), SEPERATOR_BYTE)) == -1) {// Read and compute the hash until we locate the separator byte 59 or ';'
+                currentAddress += 8;// forward scan: no separator in these 8 bytes, so all of them belong to the name
+                hash = (997 * hash) ^ getMurmurHash3(991 * read);
+            }
+
+            // Compute the final hash using the last read long, but only its effective bytes (anything before the byte 59 or ';').
+            // The statically defined OFFSET_CLEARS masks are indexed by the separator offset found above (presumably they zero the bytes from that offset onwards - confirm against the table definition)
+            hash = (997 * hash) ^ getMurmurHash3(991 * (read & OFFSET_CLEARS[offset]));
+
+            // Advance the current address/pointer to be 1 character past the end of the name Example: BillyJoel;29 would make the current address start at the '2' character
+            currentAddress += offset + 1;
+
+            Station station = stationsLookup.getOrCreate(hash, currentAddress, blockStart);// Looks the station up by hash and name bytes, creating it on first sight
+
+            /*
+             * Possible combinations (x = number) -99.9 -> 99.9; ex: 54.4, -31.7, -4.5, 1.9
+             * x.x
+             * xx.x
+             * -x.x
+             * -xx.x
+             */
+
+            // Encoding is UTF8 however, since numbers in UTF8 are all single byte characters we can do some byte math to determine the number; 0=48 and 9=57, so character - 48 = number
+            // And since - and . are also single byte characters we can make some assumptions, leading us with the primary one that no matter what the number will be 3 to 5 bytes (see above combinations)
+            // Unfortunately since an integer is only 4 bytes we must read the long; Something to test would be to see if we could read an integer and then read an extra byte if it is the 5 character edge case
+            read = UNSAFE.getLong(currentAddress);
+
+            offset = 0;// Reset the offset so the variable can be reused, from here on it is a bit shift into the long
+
+            byte sign = (byte) ((read >> offset) ^ 45);// Check the first byte of the new long to see if it is 45 aka '-', if it is this byte will be 0
+
+            // The logic below treats the lowest byte of the long as the first character of the temperature and shifts right by 8 bits to reach each following character
+            int num = sign == 0 ? (((byte) (read >> (offset += 8))) - 48) : (((byte) read) - 48);// Start the number reading, if it is a negative advance 8 bits in the long (8 bits = 1 byte)
+            currentAddress += 4;// There will always be at least 3 characters to read (x.x) plus the newline character (4 total)
+            if ((byte) ((read >> (offset + 8)) ^ 46) != 0) {// The character after the first digit is not '.' (46), so this is one of the cases (XY.X | -XY.X) where Y is a second integer digit
+                num *= 10;
+                num += ((byte) (read >> (offset += 8))) - 48;
+                currentAddress++;// One more character was consumed for the second integer digit
+            }
+            num *= 10;
+            num += ((byte) (read >> (offset + 16))) - 48;// Read the decimal character (no matter what it is 16 bits past the offset here, since 8 bits is the last number and 8 bits is the decimal)
+            if (sign == 0) {
+                num *= -1;
+                currentAddress++;// One more character was consumed for the negative sign
+            }
+
+            // Assign the values, don't use Math.min or any special bit manipulation. Faster to just use ternary
+            station.min = station.min < num ? station.min : num;
+            station.max = station.max > num ? station.max : num;
+            station.count++;
+            station.sum += num;
+            // And now set the next block to start at the current address, which is the first byte after the newline
+            blockStart = currentAddress;
+        }
+        return stationsLookup;
+    }
+
+    // Avalanche hashing function for longs: https://github.com/aappleby/smhasher/blob/master/README.md
+    public final static long getMurmurHash3(long x) {
+        x ^= x >>> 33;
+        x *= 0xff51afd7ed558ccdL;
+        x ^= x >>> 33;
+        x *= 0xc4ceb9fe1a85ec53L;
+        x ^= x >>> 33;
+        return x;
+    }
+
+    // Scans the 8 bytes of a long from the least significant byte upwards and returns the index (0-7) of the first byte equal to the test byte, or -1 when none matches.
+    // On average this is fast but certain cases could make it slow (checking 500,000,000,000 longs that don't have the test byte at all...)
+    private static short getMaskOffset(long value, byte test) {
+        long remaining = value;
+        for (short index = 0; index < 8; index++, remaining >>= 8) {
+            if ((int) (remaining & 0xFFL) == test) {// The candidate is compared as an unsigned 0-255 value, so only non-negative test bytes can match
+                return index;
+            }
+        }
+        return -1;
+    }
+
+    private static class Station {
+        private final long nameStart, nameEnd;// Address of the first name byte and address of the separator that terminates the name
+        private final int nameLength;// Name length in bytes plus one for the separator
+        private int min = Integer.MAX_VALUE, max = Integer.MIN_VALUE, count;// Extremes and number of readings, temperatures are kept in tenths of a degree
+        private long sum;// Running total of all readings, also in tenths
+        private String name;// Stays null until fillName() has been called
+
+        Station(long nameStart, long nameEnd) {
+            this.nameStart = nameStart;
+            this.nameEnd = nameEnd;
+            this.nameLength = 1 + (int) (nameEnd - nameStart);// The extra byte accounts for the separator
+        }
+
+        protected void fillName() {
+            final byte[] raw = new byte[(int) (this.nameEnd - this.nameStart)];
+            UNSAFE.copyMemory(null, nameStart, raw, Unsafe.ARRAY_BYTE_BASE_OFFSET, raw.length);// A null source object makes nameStart an absolute address, i.e. a position in the file mapped earlier
+            this.name = new String(raw, StandardCharsets.UTF_8);
+        }
+
+        @Override
+        public String toString() {// Renders name=min/mean/max, scaling the stored tenths back to degrees before formatting
+            final String low = STATION_FORMAT.format(Math.round(min) * 0.1);
+            final String mean = STATION_FORMAT.format(Math.round(((double) sum) / count) * 0.1);
+            return name + "=" + low + "/" + mean + "/" + STATION_FORMAT.format(Math.round(max) * 0.1);
+        }
+    }
+
+    public static class OpenMap {
+        public static final float LOAD_FACTOR = 0.75f;
+        public static final int EXPECTED_INITIAL_SIZE = 100_000;
+
+        protected transient long[] keys;// Use unboxed long values as a key, faster than a doing new HashMap<Long,X>() as with generics it will box/unbox every action (can be costly in large quantities)
+        protected transient Station[] values;
+        protected transient int capacity;
+        protected transient int maxFill;
+        protected transient int mask;
+        protected int size;
+
+        public OpenMap() {
+            // capacity = (int) getNextPowerOfTwo((long) Math.ceil(EXPECTED_INITIAL_SIZE / LOAD_FACTOR));// need to base the capacity on the next power of two for the mask to work properly
+            // initial size of 100k gives 262,144 Capacity, since we know this and its way oversized for a max of 10k keys theres no need to recalculate
+            capacity = 262_144;
+            mask = capacity - 1;
+            maxFill = (int) Math.ceil(capacity * 0.75f);// Only allow 75% of capacity before resizing
+            keys = new long[capacity];
+            values = new Station[capacity];
+        }
+
+        public void merge(long key, Station toMerge) {
+            // Simple compute function, if exists pass existing, if it doesn't pass null
+            int pos = (int) key & mask;// Key has already been hashed as we read, but cap it by mask
+            while (values[pos] != null) {
+                if (keys[pos] == key) {
+                    final Station oldValue = values[pos];
+
+                    // If names are different size but key was same, then continue to next step as hash collided
+                    // Compare memory values to see if the name is same as well, prevents hash collision
+                    if (oldValue.nameLength == toMerge.nameLength && compareMemory(toMerge.nameStart, oldValue.nameStart, oldValue.nameLength)) {
+                        // Memory was the same, making these the same station
+                        oldValue.count += toMerge.count;
+                        oldValue.sum += toMerge.sum;
+                        oldValue.min = oldValue.min < toMerge.min ? oldValue.min : toMerge.min;
+                        oldValue.max = oldValue.max > toMerge.max ? oldValue.max : toMerge.max;
+                        return;
+                    }
+                }
+                pos = (pos + 1) & mask;
+            }
+            keys[pos] = key;
+            values[pos] = toMerge;
+            size++;
+        }
+
+        public Station getOrCreate(final long key, long currentAddress, long blockStart) {
+            int pos = (int) key & mask;// Key has already been hashed as we read, but cap it by mask
+            while (values[pos] != null) {// While position is set
+                if (keys[pos] == key) {// Check if key is correct
+
+                    // If names are different size but key was same, then continue to next step as hash collided
+                    // Compare memory values to see if the name is same as well, prevents hash collision
+                    if (values[pos].nameLength == currentAddress - blockStart && compareMemory(blockStart, values[pos].nameStart, values[pos].nameLength)) {
+                        return values[pos];
+                    }
+                }
+                pos = (pos + 1) & mask;// Since this is an open map we keep checking next masked key for an open spot (Faster than tree or linked list on a specific node)
+            }
+            keys[pos] = key;
+            size++;
+            return values[pos] = new Station(blockStart, currentAddress - 1);// Since current address contains the splitter (we will subtract by 1 here, better to do here since this is only called when it doesn't exist less math = performance)
+        }
+
+        // Simple iterator for each set value
+        public void forEach(OpenConsumer consumer) {
+            for (int i = 0; i < this.capacity; i++) {
+                if (values[i] != null) {
+                    consumer.accept(keys[i], values[i]);
+                }
+            }
+        }
+
+        public Station[] toArray() {
+            Station[] array = new Station[size];
+            int setter = 0;
+            for (int i = 0; i < capacity; i++) {
+                if (values[i] != null) {
+                    array[setter++] = values[i];
+                    values[i].fillName();
+                }
+            }
+            return array;
+        }
+
+        // Bit function to get the next power of two on some number, used to determine best capacity based on initial size
+        public long getNextPowerOfTwo(long length) {
+            if (length-- == 0)
+                return 1;
+            length |= length >> 1;
+            length |= length >> 2;
+            length |= length >> 4;
+            length |= length >> 8;
+            length |= length >> 16;
+            return (length | length >> 32) + 1;
+        }
+
+        private boolean compareMemory(long start1, long start2, int length) {
+            while (length > 0) {
+                if (length >= 8) {
+                    if (UNSAFE.getLong(start1) != UNSAFE.getLong(start2)) {
+                        return false;
+                    }
+                }
+                else {
+                    if ((UNSAFE.getLong(start1) & OFFSET_CLEARS[length]) != (UNSAFE.getLong(start2) & OFFSET_CLEARS[length])) {
+                        System.out.println("Found collision: " + start1 + ": " + start2);
+                        System.out.println("Found collision: " + UNSAFE.getLong(start1) + ": " + UNSAFE.getLong(start2));
+                        System.out.println("Length: " + length);
+                        return false;
+                    }
+                }
+                length -= 8;
+                start1 += 8;
+                start2 += 8;
+            }
+            return true;
+        }
+
+        @FunctionalInterface
+        public static interface OpenConsumer {
+            void accept(long key, Station value);
+        }
+
+        @FunctionalInterface
+        public static interface OpenFunction {
+            Station action(long key, Station value);
+        }
+
+    }
+
+}

From 4daeb94b048e074c2b80aac1074b68eb92285ea8 Mon Sep 17 00:00:00 2001
From: Alexander Yastrebov <yastrebov.alex@gmail.com>
Date: Tue, 23 Jan 2024 20:49:28 +0100
Subject: [PATCH 122/268] Go implementation by AlexanderYastrebov (#298)

* Go implementation by AlexanderYastrebov

This is a proof-of-concept to demonstrate non-java submission.
It requires Docker with BuildKit plugin to build and export binary.

Updates
* #67
* #253

* Use collision-free id lookup

* Use number of buckets greater than max number of keys
---
 .gitignore                                  |   1 +
 calculate_average_AlexanderYastrebov.sh     |  20 ++
 github_users.txt                            |   1 +
 prepare_AlexanderYastrebov.sh               |  18 ++
 src/main/go/AlexanderYastrebov/Dockerfile   |  22 ++
 src/main/go/AlexanderYastrebov/README.md    |  58 ++++
 src/main/go/AlexanderYastrebov/calc.go      | 283 ++++++++++++++++++++
 src/main/go/AlexanderYastrebov/calc_test.go |  86 ++++++
 src/main/go/AlexanderYastrebov/go.mod       |   3 +
 9 files changed, 492 insertions(+)
 create mode 100755 calculate_average_AlexanderYastrebov.sh
 create mode 100755 prepare_AlexanderYastrebov.sh
 create mode 100644 src/main/go/AlexanderYastrebov/Dockerfile
 create mode 100644 src/main/go/AlexanderYastrebov/README.md
 create mode 100644 src/main/go/AlexanderYastrebov/calc.go
 create mode 100644 src/main/go/AlexanderYastrebov/calc_test.go
 create mode 100644 src/main/go/AlexanderYastrebov/go.mod

diff --git a/.gitignore b/.gitignore
index 6a1ef02bc..fd1dcbd0b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -46,3 +46,4 @@ out/
 /measurements*.txt
 /*.out
 out_expected.txt
+/*-timing.json
diff --git a/calculate_average_AlexanderYastrebov.sh b/calculate_average_AlexanderYastrebov.sh
new file mode 100755
index 000000000..ea951bd4e
--- /dev/null
+++ b/calculate_average_AlexanderYastrebov.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+INPUT=${1:-"measurements.txt"} # measurements file: first argument, defaults to measurements.txt
+
+target/AlexanderYastrebov/1brc "$INPUT" # binary exported by prepare_AlexanderYastrebov.sh
diff --git a/github_users.txt b/github_users.txt
index 1e640d8e0..497909c78 100644
--- a/github_users.txt
+++ b/github_users.txt
@@ -53,3 +53,4 @@ vemana;Subrahmanyam
 jincongho;Jin Cong Ho
 yonatang;Yonatan Graber
 adriacabeza;Adrià Cabeza
+AlexanderYastrebov;Alexander Yastrebov
diff --git a/prepare_AlexanderYastrebov.sh b/prepare_AlexanderYastrebov.sh
new file mode 100755
index 000000000..3521ecb03
--- /dev/null
+++ b/prepare_AlexanderYastrebov.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+DOCKER_BUILDKIT=1 docker build -o target/AlexanderYastrebov src/main/go/AlexanderYastrebov # build the Go sources inside Docker and export the result into target/AlexanderYastrebov
diff --git a/src/main/go/AlexanderYastrebov/Dockerfile b/src/main/go/AlexanderYastrebov/Dockerfile
new file mode 100644
index 000000000..a3b28067f
--- /dev/null
+++ b/src/main/go/AlexanderYastrebov/Dockerfile
@@ -0,0 +1,22 @@
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+FROM golang AS build-stage
+COPY . src/
+RUN cd src && go build .
+
+FROM scratch AS export-stage
+COPY --from=build-stage /go/src/1brc /
diff --git a/src/main/go/AlexanderYastrebov/README.md b/src/main/go/AlexanderYastrebov/README.md
new file mode 100644
index 000000000..dc0119252
--- /dev/null
+++ b/src/main/go/AlexanderYastrebov/README.md
@@ -0,0 +1,58 @@
+# 1brc in go
+
+It uses Docker with the BuildKit plugin to build and [export the binary](https://docs.docker.com/engine/reference/commandline/build/#output),
+see [prepare_AlexanderYastrebov.sh](../../../../prepare_AlexanderYastrebov.sh)
+and [calculate_average_AlexanderYastrebov.sh](../../../../calculate_average_AlexanderYastrebov.sh).
+
+Demo:
+```sh
+$ ./test.sh AlexanderYastrebov
+[+] Building 0.2s (9/9) FINISHED
+ => [internal] load .dockerignore                                                                                                             0.0s
+ => => transferring context: 2B                                                                                                               0.0s
+ => [internal] load build definition from Dockerfile                                                                                          0.0s
+ => => transferring dockerfile: 172B                                                                                                          0.0s
+ => [internal] load metadata for docker.io/library/golang:latest                                                                              0.0s
+ => [internal] load build context                                                                                                             0.0s
+ => => transferring context: 145B                                                                                                             0.0s
+ => [build-stage 1/3] FROM docker.io/library/golang                                                                                           0.0s
+ => CACHED [build-stage 2/3] COPY . src/                                                                                                      0.0s
+ => CACHED [build-stage 3/3] RUN cd src && go build .                                                                                         0.0s
+ => CACHED [export-stage 1/1] COPY --from=build-stage /go/src/1brc /                                                                          0.0s
+ => exporting to client directory                                                                                                             0.1s
+ => => copying files 2.03MB                                                                                                                   0.0s
+Validating calculate_average_AlexanderYastrebov.sh -- src/test/resources/samples/measurements-10000-unique-keys.txt
+Validating calculate_average_AlexanderYastrebov.sh -- src/test/resources/samples/measurements-10.txt
+Validating calculate_average_AlexanderYastrebov.sh -- src/test/resources/samples/measurements-1.txt
+Validating calculate_average_AlexanderYastrebov.sh -- src/test/resources/samples/measurements-20.txt
+Validating calculate_average_AlexanderYastrebov.sh -- src/test/resources/samples/measurements-2.txt
+Validating calculate_average_AlexanderYastrebov.sh -- src/test/resources/samples/measurements-3.txt
+Validating calculate_average_AlexanderYastrebov.sh -- src/test/resources/samples/measurements-boundaries.txt
+Validating calculate_average_AlexanderYastrebov.sh -- src/test/resources/samples/measurements-complex-utf8.txt
+Validating calculate_average_AlexanderYastrebov.sh -- src/test/resources/samples/measurements-dot.txt
+Validating calculate_average_AlexanderYastrebov.sh -- src/test/resources/samples/measurements-shortest.txt
+Validating calculate_average_AlexanderYastrebov.sh -- src/test/resources/samples/measurements-short.txt
+
+# Run once to setup the benchmark
+# ./create_measurements.sh 1000000000
+# mv measurements.txt measurements_1B.txt
+# ln -s measurements_1B.txt measurements.txt
+# ./calculate_average_baseline.sh > out_expected.txt
+
+$ wc -l measurements_1B.txt
+1000000000 measurements_1B.txt
+
+$ ./evaluate2.sh AlexanderYastrebov royvanrijn
+...                                                                                                                                         0.0s
+Benchmark 1: ./calculate_average_AlexanderYastrebov.sh 2>&1
+  Time (mean ± σ):     16.786 s ±  0.545 s    [User: 56.030 s, System: 10.068 s]
+  Range (min … max):   15.918 s … 17.309 s    5 runs
+...
+Benchmark 1: ./calculate_average_royvanrijn.sh 2>&1
+  Time (mean ± σ):     16.731 s ±  0.190 s    [User: 56.485 s, System: 10.279 s]
+  Range (min … max):   16.490 s … 16.951 s    5 runs
+
+Summary
+  AlexanderYastrebov: trimmed mean 16.901712789513336, raw times 16.69836470718,17.30911065018,16.83413600418,15.91787706218,17.17263765718
+  royvanrijn: trimmed mean 16.738037123633333, raw times 16.4900939703,16.9513459953,16.5794539913,16.8297746273,16.8048827523
+```
diff --git a/src/main/go/AlexanderYastrebov/calc.go b/src/main/go/AlexanderYastrebov/calc.go
new file mode 100644
index 000000000..149d38db7
--- /dev/null
+++ b/src/main/go/AlexanderYastrebov/calc.go
@@ -0,0 +1,283 @@
+package main
+
+import (
+	"bytes"
+	"fmt"
+	"log"
+	"math"
+	"os"
+	"runtime"
+	"sort"
+	"sync"
+	"syscall"
+)
+
+type measurement struct {
+	min, max, sum, count int64 // min, max and sum hold temperatures multiplied by 10 (tenths of a degree); count is the number of aggregated rows
+}
+
+func main() {
+	if len(os.Args) != 2 {
+		log.Fatalf("Missing measurements filename")
+	}
+
+	stats := processFile(os.Args[1])
+
+	names := make([]string, 0, len(stats))
+	for name := range stats {
+		names = append(names, name)
+	}
+	sort.Strings(names) // stations are printed in alphabetical order
+
+	fmt.Print("{")
+	sep := ""
+	for _, name := range names {
+		fmt.Print(sep) // empty before the first station, ", " before every other one
+		sep = ", "
+		m := stats[name]
+		fmt.Printf("%s=%.1f/%.1f/%.1f", name, round(float64(m.min)/10.0), round(float64(m.sum)/10.0/float64(m.count)), round(float64(m.max)/10.0))
+	}
+	fmt.Println("}")
+}
+
+func processFile(filename string) map[string]*measurement {
+	file, err := os.Open(filename)
+	if err != nil {
+		log.Fatalf("Open: %v", err)
+	}
+	defer file.Close()
+
+	info, err := file.Stat()
+	if err != nil {
+		log.Fatalf("Stat: %v", err)
+	}
+
+	// reject empty files and sizes that do not survive the conversion to int
+	size := info.Size()
+	if size <= 0 || int64(int(size)) != size {
+		log.Fatalf("Invalid file size: %d", size)
+	}
+	mapped, err := syscall.Mmap(int(file.Fd()), 0, int(size), syscall.PROT_READ, syscall.MAP_SHARED)
+	if err != nil {
+		log.Fatalf("Mmap: %v", err)
+	}
+
+	// release the read-only mapping once the whole file has been processed
+	defer func() {
+		if unmapErr := syscall.Munmap(mapped); unmapErr != nil {
+			log.Fatalf("Munmap: %v", unmapErr)
+		}
+	}()
+	return process(mapped)
+}
+
+func process(data []byte) map[string]*measurement {
+	total := len(data)
+	nChunks := runtime.NumCPU()
+	chunkSize := total / nChunks
+	if chunkSize == 0 {
+		chunkSize = total // input shorter than the CPU count: use a single chunk
+	}
+
+	// boundaries[k] is the exclusive end offset of chunk k; every chunk ends right after a '\n' or at the end of data
+	boundaries := make([]int, 0, nChunks)
+	for pos := 0; pos < total; {
+		pos += chunkSize
+		if pos >= total {
+			boundaries = append(boundaries, total)
+			break
+		}
+
+		nl := bytes.IndexByte(data[pos:], '\n')
+		if nl < 0 {
+			boundaries = append(boundaries, total)
+			break
+		}
+		// step past the newline so that the next chunk starts on a fresh row
+		pos += nl + 1
+		boundaries = append(boundaries, pos)
+	}
+
+	results := make([]map[string]*measurement, len(boundaries))
+
+	var wg sync.WaitGroup
+	wg.Add(len(boundaries))
+	begin := 0
+	for idx, end := range boundaries {
+		go func(part []byte, slot int) {
+			results[slot] = processChunk(part)
+			wg.Done()
+		}(data[begin:end], idx)
+		begin = end
+	}
+	wg.Wait()
+
+	merged := make(map[string]*measurement)
+	for _, partial := range results {
+		for id, rm := range partial {
+			m := merged[id]
+			if m == nil {
+				merged[id] = rm
+				continue
+			}
+			m.min = min(m.min, rm.min)
+			m.max = max(m.max, rm.max)
+			m.sum += rm.sum
+			m.count += rm.count
+		}
+	}
+	return merged
+}
+
+func processChunk(data []byte) map[string]*measurement {
+	// Aggregates the rows of one chunk per station id, using a fixed size linear probe lookup table
+	const (
+		// use power of 2 for fast modulo calculation,
+		// should be larger than max number of keys which is 10_000
+		entriesSize = 1 << 14
+
+		// use FNV-1a hash
+		fnv1aOffset64 = 14695981039346656037
+		fnv1aPrime64  = 1099511628211
+	)
+
+	type entry struct { // one slot of the lookup table; vlen == 0 marks a slot that is still free
+		m     measurement
+		hash  uint64
+		vlen  int
+		value [128]byte // use power of 2 > 100 for alignment
+	}
+	entries := make([]entry, entriesSize)
+	entriesCount := 0 // number of claimed slots, used to size the result map
+
+	// getMeasurement returns the aggregate for the given id, claiming a free slot the first time the id is seen; keep short and inlinable
+	getMeasurement := func(hash uint64, value []byte) *measurement {
+		i := hash & uint64(entriesSize-1) // hash modulo table size
+		entry := &entries[i]
+
+		// probe linearly until a free slot or the slot of this id is found; bytes.Equal could be commented to speedup assuming no hash collisions
+		for entry.vlen > 0 && !(entry.hash == hash && bytes.Equal(entry.value[:entry.vlen], value)) {
+			i = (i + 1) & uint64(entriesSize-1)
+			entry = &entries[i]
+		}
+
+		if entry.vlen == 0 {
+			entry.hash = hash
+			entry.vlen = copy(entry.value[:], value) // copy reports how many id bytes were stored
+			entriesCount++
+		}
+		return &entry.m
+	}
+
+	// assume valid input: every row looks like <id>;<temperature>\n
+	for len(data) > 0 {
+
+		idHash := uint64(fnv1aOffset64)
+		semiPos := 0
+		for i, b := range data {
+			if b == ';' {
+				semiPos = i
+				break
+			}
+
+			// calculate FNV-1a hash
+			idHash ^= uint64(b)
+			idHash *= fnv1aPrime64
+		}
+
+		idData := data[:semiPos]
+
+		data = data[semiPos+1:] // data now starts at the first character of the temperature
+
+		var temp int64
+		// parseNumber, inlined: yields the temperature multiplied by 10 and advances data past the newline
+		{
+			negative := data[0] == '-'
+			if negative {
+				data = data[1:]
+			}
+
+			_ = data[3] // presumably a bounds-check hint for the index expressions below
+			if data[1] == '.' {
+				// 1.2\n
+				temp = int64(data[0])*10 + int64(data[2]) - '0'*(10+1)
+				data = data[4:]
+				// the branch below handles two integer digits: 12.3\n
+			} else {
+				_ = data[4]
+				temp = int64(data[0])*100 + int64(data[1])*10 + int64(data[3]) - '0'*(100+10+1)
+				data = data[5:]
+			}
+
+			if negative {
+				temp = -temp
+			}
+		}
+
+		m := getMeasurement(idHash, idData)
+		if m.count == 0 { // first row seen for this id
+			m.min = temp
+			m.max = temp
+			m.sum = temp
+			m.count = 1
+		} else {
+			m.min = min(m.min, temp)
+			m.max = max(m.max, temp)
+			m.sum += temp
+			m.count++
+		}
+	}
+
+	result := make(map[string]*measurement, entriesCount)
+	for i := range entries {
+		entry := &entries[i]
+		if entry.m.count > 0 { // only slots that received at least one row
+			result[string(entry.value[:entry.vlen])] = &entry.m
+		}
+	}
+	return result
+}
+
+func round(x float64) float64 { // rounds x to one decimal place, resolving ties the way roundJava does
+	return roundJava(x*10.0) / 10.0
+}
+
+// roundJava returns the integer closest to x; a value exactly half way between
+// two integers is rounded towards positive infinity, see java's Math.round
+func roundJava(x float64) float64 {
+	t := math.Trunc(x)
+	negativeTie := x < 0.0 && t-x == 0.5
+	if !negativeTie && math.Abs(x-t) >= 0.5 {
+		// at least half way to the next integer: step away from zero
+		t += math.Copysign(1, x)
+	}
+
+	if t == 0 { // check -0
+		return 0.0
+	}
+	return t
+}
+
+// parseNumber converts a decimal number of the form "^-?[0-9]{1,2}[.][0-9]" into
+// its value multiplied by 10, e.g. -12.3, -3.4, 5.6, 78.9 become -123, -34, 56, 789.
+func parseNumber(data []byte) int64 {
+	const zero = int64('0')
+
+	sign := int64(1)
+	if data[0] == '-' {
+		sign = -1
+		data = data[1:]
+	}
+
+	var tenths int64
+	switch len(data) {
+	case 3:
+		// 1.2 -> one integer digit, dot, one fractional digit
+		tenths = int64(data[0])*10 + int64(data[2]) - zero*(10+1)
+	case 4:
+		// 12.3 -> two integer digits, dot, one fractional digit
+		tenths = int64(data[0])*100 + int64(data[1])*10 + int64(data[3]) - zero*(100+10+1)
+	}
+	// any other length leaves tenths at zero
+	return sign * tenths
+}
diff --git a/src/main/go/AlexanderYastrebov/calc_test.go b/src/main/go/AlexanderYastrebov/calc_test.go
new file mode 100644
index 000000000..db7e27a2c
--- /dev/null
+++ b/src/main/go/AlexanderYastrebov/calc_test.go
@@ -0,0 +1,86 @@
+package main
+
+import (
+	"fmt"
+	"os"
+	"testing"
+)
+
+func TestRoundJava(t *testing.T) {
+	for _, c := range []struct {
+		in   float64
+		want string
+	}{
+		{-1.5, "-1.0"},
+		{-1.0, "-1.0"},
+		{-0.7, "-1.0"},
+		{-0.5, "0.0"},
+		{-0.3, "0.0"},
+		{0.0, "0.0"},
+		{0.3, "0.0"},
+		{0.5, "1.0"},
+		{0.7, "1.0"},
+		{1.0, "1.0"},
+		{1.5, "2.0"},
+	} {
+		if got := roundJava(c.in); c.want != fmt.Sprintf("%.1f", got) {
+			t.Errorf("Wrong rounding of %v, expected: %s, got: %.1f", c.in, c.want, got)
+		}
+	}
+}
+
+func TestParseNumber(t *testing.T) {
+	// Each input is paired with the decimal rendering of the expected value*10.
+	cases := []struct{ in, want string }{
+		{"-99.9", "-999"},
+		{"-12.3", "-123"},
+		{"-1.5", "-15"},
+		{"-1.0", "-10"},
+		{"0.0", "0"},
+		{"0.3", "3"},
+		{"12.3", "123"},
+		{"99.9", "999"},
+	}
+	for _, c := range cases {
+		got := parseNumber([]byte(c.in))
+		if fmt.Sprintf("%d", got) != c.want {
+			t.Errorf("Wrong parsing of %v, expected: %s, got: %d", c.in, c.want, got)
+		}
+	}
+}
+
+var parseNumberSink int64
+
+func BenchmarkParseNumber(b *testing.B) {
+	// One positive single-digit reading and one negative double-digit reading.
+	short, long := []byte("1.2"), []byte("-12.3")
+
+	for n := 0; n < b.N; n++ {
+		parseNumberSink = parseNumber(short) + parseNumber(long)
+	}
+}
+
+func BenchmarkProcess(b *testing.B) {
+	// Fixture: ./create_measurements.sh 1000000 && mv measurements.txt measurements-1e6.txt
+	// (the generator printed "Created file with 1,000,000 measurements in 514 ms")
+	const filename = "../../../../measurements-1e6.txt"
+
+	input, err := os.ReadFile(filename)
+	if err != nil {
+		b.Fatal(err)
+	}
+
+	// One pass before the timer reset counts the rows reported as rows/op.
+	var rows int64
+	for _, m := range process(input) {
+		rows += m.count
+	}
+
+	b.ReportAllocs()
+	b.ResetTimer()
+	b.ReportMetric(float64(rows), "rows/op")
+
+	for n := 0; n < b.N; n++ {
+		process(input)
+	}
+}
diff --git a/src/main/go/AlexanderYastrebov/go.mod b/src/main/go/AlexanderYastrebov/go.mod
new file mode 100644
index 000000000..08f5bd193
--- /dev/null
+++ b/src/main/go/AlexanderYastrebov/go.mod
@@ -0,0 +1,3 @@
+module github.com/AlexanderYastrebov/1brc
+
+go 1.21.5

From 055916e9978aa7fbf6ab70058fcd991016fdb153 Mon Sep 17 00:00:00 2001
From: Gunnar Morling <gunnar.morling@googlemail.com>
Date: Wed, 24 Jan 2024 10:56:03 +0100
Subject: [PATCH 123/268] Leaderboard update

---
 README.md | 30 +++++++++++++++++++-----------
 1 file changed, 19 insertions(+), 11 deletions(-)

diff --git a/README.md b/README.md
index c432fc7dd..f393b69c7 100644
--- a/README.md
+++ b/README.md
@@ -41,14 +41,19 @@ These are the results from running all entries into the challenge on eight cores
 
 | # | Result (m:s.ms) | Implementation     | JDK | Submitter     | Notes     |
 |---|-----------------|--------------------|-----|---------------|-----------|
-| 1 | 00:02.195 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_thomaswue.java)| 21.0.1-graal | [Thomas Wuerthinger](https://github.com/thomaswue), [Quan Anh Mai](https://github.com/merykitty), [Alfonso² Peterssen](https://github.com/mukel) | GraalVM native binary, uses Unsafe |
-| 2 | 00:02.248 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java)| 21.0.1-graal | [Artsiom Korzun](https://github.com/artsiomkorzun) | GraalVM native binary, uses Unsafe |
-| 3* | 00:02.305 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java)| 21.0.2-graal | [Van Phu DO](https://github.com/abeobk) | GraalVM native binary, uses Unsafe |
-| 3* | 00:02.313 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_royvanrijn.java)| 21.0.1-graal | [Roy van Rijn](https://github.com/royvanrijn) | GraalVM native binary, uses Unsafe |
+| 1 | 00:02.019 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java)| 21.0.2-graal | [Artsiom Korzun](https://github.com/artsiomkorzun) | GraalVM native binary, uses Unsafe |
+| 2* | 00:02.195 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_thomaswue.java)| 21.0.1-graal | [Thomas Wuerthinger](https://github.com/thomaswue), [Quan Anh Mai](https://github.com/merykitty), [Alfonso² Peterssen]
+(https://github.com/mukel) | GraalVM native binary, uses Unsafe |
+| 2* | 00:02.196 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_royvanrijn.java)| 21.0.2-graal | [Roy van Rijn](https://github.com/royvanrijn) | GraalVM native binary, uses Unsafe |
+| 3 | 00:02.305 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java)| 21.0.2-graal | [Van Phu DO](https://github.com/abeobk) | GraalVM native binary, uses Unsafe |
+|   | 00:02.374 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jerrinot.java)| 21.0.1-graal | [Jaromir Hamala](https://github.com/jerrinot) | uses Unsafe |
 |   | 00:02.575 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_merykittyunsafe.java)| 21.0.1-open | [Quan Anh Mai](https://github.com/merykitty) | uses Unsafe |
-|   | 00:02.909 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jerrinot.java)| 21.0.1-graal | [Jaromir Hamala](https://github.com/jerrinot) | uses Unsafe |
+|   | 00:02.984 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_yourwass.java)| 21.0.1-open | [yourwass](https://github.com/yourwass) | uses Unsafe |
 |   | 00:03.258 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_merykitty.java)| 21.0.1-open | [Quan Anh Mai](https://github.com/merykitty) |  |
 |   | 00:03.376 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_mtopolnik.java)| 21.0.1-graal | [Marko Topolnik](https://github.com/mtopolnik) | uses Unsafe |
+|   | 00:03.510 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_roman_r_m.java)| 21.0.1-graal | [Roman Musin](https://github.com/roman-r-m) | GraalVM native binary |
+|   | 00:03.518 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_JamalMulla.java)| 21.0.1-graal | [Jamal Mulla](https://github.com/JamalMulla) | GraalVM native binary, uses Unsafe |
+|   | 00:03.594 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_yavuztas.java)| 21.0.2-graal | [Yavuz Tas](https://github.com/yavuztas) | GraalVM native binary, uses Unsafe |
 |   | 00:03.714 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_hundredwatt.java)| 21.0.1-graal | [Jason Nochlin](https://github.com/hundredwatt) |  |
 |   | 00:03.718 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_zerninv.java)| 21.0.1-graal | [zerninv](https://github.com/zerninv) | uses Unsafe |
 |   | 00:03.854 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ebarlas.java)| 21.0.1-graal | [Elliot Barlas](https://github.com/ebarlas) | uses Unsafe |
@@ -57,11 +62,9 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:03.966 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jincongho.java)| 21.0.1-open | [Jin Cong Ho](https://github.com/jincongho) | uses Unsafe |
 |   | 00:04.066 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_JesseVanRooy.java)| 21.0.1-open | [JesseVanRooy](https://github.com/JesseVanRooy) | uses Unsafe |
 |   | 00:04.154 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_iziamos.java)| 21.0.1-open | [John Ziamos](https://github.com/iziamos) | uses Unsafe |
-|   | 00:04.551 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_roman_r_m.java)| 21.0.1-graal | [Roman Musin](https://github.com/roman-r-m) | uses Unsafe |
 |   | 00:04.741 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_cliffclick.java)| 21.0.1-open | [Cliff Click](https://github.com/cliffclick) | uses Unsafe |
-|   | 00:04.823 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_JamalMulla.java)| 21.0.1-graal | [Jamal Mulla](https://github.com/JamalMulla) | uses Unsafe |
+|   | 00:04.800 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_parkertimmins.java)| 21.0.1-open | [Parker Timmins](https://github.com/parkertimmins) |  |
 |   | 00:04.920 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_vemana.java)| 21.0.1-graal | [Subrahmanyam](https://github.com/vemana) |  |
-|   | 00:04.959 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_yavuztas.java)| 21.0.1-graal | [Yavuz Tas](https://github.com/yavuztas) | uses Unsafe |
 |   | 00:05.142 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_arjenw.java)| 21.0.1-open | [Arjen Wisse](https://github.com/arjenw) |  |
 |   | 00:05.235 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_unbounded.java)| 21.0.1-open | [unbounded](https://github.com/unbounded) |  |
 |   | 00:05.336 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_plevart.java)| 21.0.1-tem | [Peter Levart](https://github.com/plevart) |  |
@@ -73,16 +76,18 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:06.257 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_flippingbits.java)| 21.0.1-graal | [Stefan Sprenger](https://github.com/flippingbits) | uses Unsafe |
 |   | 00:06.415 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_armandino.java)| 21.0.1-open | [Arman Sharif](https://github.com/armandino) | uses Unsafe |
 |   | 00:06.576 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_as-com.java)| 21.0.1-open | [Andrew Sun](https://github.com/as-com) | uses Unsafe |
+|   | 00:06.635 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_justplainlaake.java)| 21.0.1-graal | [Laake Scates-Gervasi](https://github.com/justplainlaake) | GraalVM native binary, uses Unsafe |
 |   | 00:06.654 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jbachorik.java)| 21.0.1-graal | [Jaroslav Bachorik](https://github.com/jbachorik) |  |
 |   | 00:06.670 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ianopolousfast.java)| 21.0.1-open | [Dr Ian Preston](https://github.com/ianopolousfast) |  |
 |   | 00:06.715 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_algirdasrascius.java)| 21.0.1-open | [Algirdas Raščius](https://github.com/algirdasrascius) |  |
 |   | 00:07.240 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_giovannicuccu.java)| java | [giovannicuccu](https://github.com/giovannicuccu) |  |
+|   | 00:07.563 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_3j5a.java)| 21.0.1-graal | [3j5a](https://github.com/3j5a) |  |
 |   | 00:07.680 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_C5H12O5.java)| 21.0.1-graal | [Xylitol](https://github.com/C5H12O5) | uses Unsafe |
 |   | 00:07.730 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jotschi.java)| 21.0.1-open | [Johannes Schüth](https://github.com/jotschi) |  |
 |   | 00:07.925 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ricardopieper.java)| 21.0.1-graal | [Ricardo Pieper](https://github.com/ricardopieper) |  |
-|   | 00:07.913 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_parkertimmins.java)| 21.0.1-open | [parkertimmins](https://github.com/parkertimmins) |  |
 |   | 00:08.167 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ddimtirov.java)| 21.0.1-tem | [Dimitar Dimitrov](https://github.com/ddimtirov) |  |
 |   | 00:08.214 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_deemkeen.java)| 21.0.1-open | [deemkeen](https://github.com/deemkeen) |  |
+|   | 00:08.255 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_mattiz.java)| 21.0.1-open | [Mathias Bjerke](https://github.com/mattiz) |  |
 |   | 00:08.398 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_artpar.java)| 21.0.1-open | [Parth Mudgal](https://github.com/artpar) | uses Unsafe |
 |   | 00:08.489 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_gnabyl.java)| 21.0.1-graal | [Bang NGUYEN](https://github.com/gnabyl) |  |
 |   | 00:08.517 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ags313.java)| 21.0.1-graal | [ags](https://github.com/ags313) | uses Unsafe |
@@ -101,14 +106,14 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:11.577 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_netrunnereve.java)| 21.0.1-open | [Eve](https://github.com/netrunnereve) |  |
 |   | 00:10.473 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_raipc.java)| 21.0.1-open | [Anton Rybochkin](https://github.com/raipc) |  |
 |   | 00:11.119 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_lawrey.java)| 21.0.1-open | [lawrey](https://github.com/lawrey) |  |
+|   | 00:11.156 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_YannMoisan.java)| java | [Yann Moisan](https://github.com/YannMoisan) |  |
 |   | 00:11.167 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_palmr.java)| 21.0.1-open | [Nick Palmer](https://github.com/palmr) |  |
+|   | 00:11.352 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_karthikeyan97.java)| 21.0.1-open | [karthikeyan97](https://github.com/karthikeyan97) | uses Unsafe |
 |   | 00:11.405 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_imrafaelmerino.java)| 21.0.1-graal | [Rafael Merino García](https://github.com/imrafaelmerino) |  |
 |   | 00:11.433 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jatingala.java)| 21.0.1-graal | [Jatin Gala](https://github.com/jatingala) |  |
 |   | 00:11.805 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_coolmineman.java)| 21.0.1-graal | [Cool_Mineman](https://github.com/coolmineman) |  |
-|   | 00:11.878 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_karthikeyan97.java)| 21.0.1-open | [karthikeyan97](https://github.com/karthikeyan97) | uses Unsafe |
 |   | 00:11.934 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_arjenvaneerde.java)| 21.0.1-open | [arjenvaneerde](https://github.com/arjenvaneerde) |  |
 |   | 00:12.051 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_dmitry-midokura.java)| 21.0.1-open | [Dmitry Bufistov](https://github.com/dmitry-midokura) |  |
-|   | 00:12.102 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_YannMoisan.java)| java | [Yann Moisan](https://github.com/YannMoisan) |  |
 |   | 00:12.220 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_richardstartin.java)| 21.0.1-open | [Richard Startin](https://github.com/richardstartin) |  |
 |   | 00:12.495 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_SamuelYvon.java)| 21.0.1-graal | [Samuel Yvon](https://github.com/SamuelYvon) | GraalVM native binary |
 |   | 00:12.568 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_MeanderingProgrammer.java)| 21.0.1-graal | [Vlad](https://github.com/MeanderingProgrammer) |  |
@@ -121,6 +126,8 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:14.772 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_kevinmcmurtrie.java)| 21.0.1-open | [Kevin McMurtrie](https://github.com/kevinmcmurtrie) |  |
 |   | 00:14.867 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_berry120.java)| 21.0.1-open | [Michael Berry](https://github.com/berry120) |  |
 |   | 00:15.662 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_semotpan.java)| 21.0.1-open | [Serghei Motpan](https://github.com/semotpan) |  |
+|   | 00:16.063 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_makohn.java)| 21.0.1-open | [Marek Kohn](https://github.com/makohn) |  |
+|   | 00:16.953 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_gauravdeshmukh.java)| 21.0.1-open | [Gaurav Anantrao Deshmukh](https://github.com/gauravdeshmukh) |  |
 |   | 00:17.179 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jgrateron.java)| 21.0.1-open | [Jairo Graterón](https://github.com/jgrateron) |  |
 |   | 00:17.490 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_kgeri.java)| 21.0.1-open | [Gergely Kiss](https://github.com/kgeri) |  |
 |   | 00:17.255 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_tkosachev.java)| 21.0.1-open | [tkosachev](https://github.com/tkosachev) |  |
@@ -154,6 +161,7 @@ These are the results from running all entries into the challenge on eight cores
 |   | 01:06.790 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_khmarbaise.java)| 21.0.1-open | [Karl Heinz Marbaise](https://github.com/khmarbaise) |  |
 |   | 01:06.944 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_santanu.java)| 21.0.1-open | [santanu](https://github.com/santanu) |  |
 |   | 01:07.014 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_pedestrianlove.java)| 21.0.1-open | [pedestrianlove](https://github.com/pedestrianlove) |  |
+|   | 01:07.101 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jeevjyot.java)| 21.0.1-open | [Jeevjyot Singh Chhabda](https://github.com/jeevjyot) |  |
 |   | 01:08.811 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_alesj.java)| 21.0.1-open | [Aleš Justin](https://github.com/alesj) |  |
 |   | 01:08.908 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_itaske.java)| 21.0.1-open | [itaske](https://github.com/itaske) |  |
 |   | 01:09.595 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_agoncal.java)| 21.0.1-tem | [Antonio Goncalves](https://github.com/agoncal) |  |

From 7a9b88f32891acd502710b9bbdfa72a260d088f4 Mon Sep 17 00:00:00 2001
From: Gunnar Morling <gunnar.morling@googlemail.com>
Date: Wed, 24 Jan 2024 10:57:28 +0100
Subject: [PATCH 124/268] Leaderboard

---
 README.md | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/README.md b/README.md
index f393b69c7..3818d6516 100644
--- a/README.md
+++ b/README.md
@@ -42,8 +42,7 @@ These are the results from running all entries into the challenge on eight cores
 | # | Result (m:s.ms) | Implementation     | JDK | Submitter     | Notes     |
 |---|-----------------|--------------------|-----|---------------|-----------|
 | 1 | 00:02.019 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java)| 21.0.2-graal | [Artsiom Korzun](https://github.com/artsiomkorzun) | GraalVM native binary, uses Unsafe |
-| 2* | 00:02.195 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_thomaswue.java)| 21.0.1-graal | [Thomas Wuerthinger](https://github.com/thomaswue), [Quan Anh Mai](https://github.com/merykitty), [Alfonso² Peterssen]
-(https://github.com/mukel) | GraalVM native binary, uses Unsafe |
+| 2* | 00:02.195 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_thomaswue.java)| 21.0.1-graal | [Thomas Wuerthinger](https://github.com/thomaswue), [Quan Anh Mai](https://github.com/merykitty), [Alfonso² Peterssen](https://github.com/mukel) | GraalVM native binary, uses Unsafe |
 | 2* | 00:02.196 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_royvanrijn.java)| 21.0.2-graal | [Roy van Rijn](https://github.com/royvanrijn) | GraalVM native binary, uses Unsafe |
 | 3 | 00:02.305 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java)| 21.0.2-graal | [Van Phu DO](https://github.com/abeobk) | GraalVM native binary, uses Unsafe |
 |   | 00:02.374 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jerrinot.java)| 21.0.1-graal | [Jaromir Hamala](https://github.com/jerrinot) | uses Unsafe |

From 0b762ee0b0f7b8ffede0b090cfc8f74c3b5ae31a Mon Sep 17 00:00:00 2001
From: Gunnar Morling <gunnar.morling@googlemail.com>
Date: Wed, 24 Jan 2024 12:18:01 +0100
Subject: [PATCH 125/268] Fixing Thomas' JDK version

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 3818d6516..e2a937378 100644
--- a/README.md
+++ b/README.md
@@ -42,7 +42,7 @@ These are the results from running all entries into the challenge on eight cores
 | # | Result (m:s.ms) | Implementation     | JDK | Submitter     | Notes     |
 |---|-----------------|--------------------|-----|---------------|-----------|
 | 1 | 00:02.019 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java)| 21.0.2-graal | [Artsiom Korzun](https://github.com/artsiomkorzun) | GraalVM native binary, uses Unsafe |
-| 2* | 00:02.195 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_thomaswue.java)| 21.0.1-graal | [Thomas Wuerthinger](https://github.com/thomaswue), [Quan Anh Mai](https://github.com/merykitty), [Alfonso² Peterssen](https://github.com/mukel) | GraalVM native binary, uses Unsafe |
+| 2* | 00:02.195 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_thomaswue.java)| 21.0.2-graal | [Thomas Wuerthinger](https://github.com/thomaswue), [Quan Anh Mai](https://github.com/merykitty), [Alfonso² Peterssen](https://github.com/mukel) | GraalVM native binary, uses Unsafe |
 | 2* | 00:02.196 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_royvanrijn.java)| 21.0.2-graal | [Roy van Rijn](https://github.com/royvanrijn) | GraalVM native binary, uses Unsafe |
 | 3 | 00:02.305 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java)| 21.0.2-graal | [Van Phu DO](https://github.com/abeobk) | GraalVM native binary, uses Unsafe |
 |   | 00:02.374 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jerrinot.java)| 21.0.1-graal | [Jaromir Hamala](https://github.com/jerrinot) | uses Unsafe |

From 561717a986550ab7898005b9fa20f8e7dcb699a5 Mon Sep 17 00:00:00 2001
From: Gunnar Morling <gunnar.morling@googlemail.com>
Date: Wed, 24 Jan 2024 20:37:10 +0100
Subject: [PATCH 126/268] Update README.md

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index e2a937378..4cfd4f45c 100644
--- a/README.md
+++ b/README.md
@@ -413,6 +413,7 @@ A list of external resources such as blog posts and videos, discussing 1BRC and
 * [One Billion Row Challenge in Racket](https://defn.io/2024/01/10/one-billion-row-challenge-in-racket/), by Bogdan Popa (blog post)
 * [The One Billion Row Challenge - .NET Edition](https://dev.to/mergeconflict/392-the-one-billion-row-challenge-net-edition), by Frank A. Krueger (podcast)
 * [One Billion Row Challenge](https://curiouscoding.nl/posts/1brc/), by Ragnar Groot Koerkamp (blog post)
+* [ClickHouse and The One Billion Row Challenge](https://clickhouse.com/blog/clickhouse-one-billion-row-challenge), by Dale McDiarmid (blog post)
 
 ## Sponsorship
 

From 80cd738b4b9c1ad2d3fa5b4ae3d0f6e625e946e4 Mon Sep 17 00:00:00 2001
From: Vemana <vemana.github@gmail.com>
Date: Fri, 26 Jan 2024 03:10:14 +0530
Subject: [PATCH 127/268] C style code. Should be ~4secs or lower based on
 local testing. (#559)

1. Use Unsafe
2. Fit hashtable in L2 cache.
3. If we can find a good hash function, it can fit in L1 cache even.
4. Improve temperature parsing by using a lookup table
---
 calculate_average_vemanaNonIdiomatic.sh       |   31 +
 prepare_vemanaNonIdiomatic.sh                 |   20 +
 .../CalculateAverage_vemanaNonIdiomatic.java  | 1654 +++++++++++++++++
 3 files changed, 1705 insertions(+)
 create mode 100755 calculate_average_vemanaNonIdiomatic.sh
 create mode 100755 prepare_vemanaNonIdiomatic.sh
 create mode 100644 src/main/java/dev/morling/onebrc/CalculateAverage_vemanaNonIdiomatic.java

diff --git a/calculate_average_vemanaNonIdiomatic.sh b/calculate_average_vemanaNonIdiomatic.sh
new file mode 100755
index 000000000..99974ee03
--- /dev/null
+++ b/calculate_average_vemanaNonIdiomatic.sh
@@ -0,0 +1,31 @@
+#!/bin/bash
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+# Basics: preview features plus access to the jdk.internal.ref and java.nio internals
+JAVA_OPTS=""
+JAVA_OPTS="$JAVA_OPTS --enable-preview"
+JAVA_OPTS="$JAVA_OPTS --add-exports java.base/jdk.internal.ref=ALL-UNNAMED"
+JAVA_OPTS="$JAVA_OPTS --add-opens java.base/java.nio=ALL-UNNAMED"
+
+# JIT parameters
+JAVA_OPTS="$JAVA_OPTS -XX:+AlwaysCompileLoopMethods"
+
+# GC parameters
+JAVA_OPTS="$JAVA_OPTS -XX:+UseParallelGC"
+
+# Run the solution from the project jar; $JAVA_OPTS is left unquoted, so the shell splits it into separate flags
+java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_vemanaNonIdiomatic "$@"
diff --git a/prepare_vemanaNonIdiomatic.sh b/prepare_vemanaNonIdiomatic.sh
new file mode 100755
index 000000000..58dbc240f
--- /dev/null
+++ b/prepare_vemanaNonIdiomatic.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+# Load SDKMAN and select the 21.0.1-graal JDK; sdk's stdout is redirected to stderr
+source "$HOME/.sdkman/bin/sdkman-init.sh"
+sdk use java 21.0.1-graal 1>&2
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_vemanaNonIdiomatic.java b/src/main/java/dev/morling/onebrc/CalculateAverage_vemanaNonIdiomatic.java
new file mode 100644
index 000000000..10b9c1e89
--- /dev/null
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_vemanaNonIdiomatic.java
@@ -0,0 +1,1654 @@
+/*
+ *  Copyright 2023 The original authors
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package dev.morling.onebrc;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.RandomAccessFile;
+import java.lang.invoke.MethodHandles;
+import java.lang.reflect.Field;
+import java.lang.reflect.Method;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.nio.MappedByteBuffer;
+import java.nio.channels.FileChannel.MapMode;
+import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.Optional;
+import java.util.TreeMap;
+import java.util.concurrent.Callable;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.concurrent.atomic.AtomicLong;
+import java.util.stream.Collectors;
+import sun.misc.Unsafe;
+
+/**
+ * Unlike its sister submission {@code CalculateAverage_vemana}, this submission employs non
+ * idiomatic methods such as SWAR and Unsafe.
+ *
+ * <p>For details on how this solution works, check the documentation on the sister submission.
+ */
+public class CalculateAverage_vemanaNonIdiomatic {
+
+  /**
+   * Entry point: parses optional {@code key=value} tuning arguments, runs the computation and
+   * prints the aggregated statistics to standard output. Progress banners go to standard error.
+   *
+   * <p>Recognized keys: {@code chunkSizeBits}, {@code commonChunkFraction},
+   * {@code commonChunkSizeBits}, {@code hashtableSizeBits}, {@code inputFile},
+   * {@code munmapFraction}, {@code fakeAdvance} and {@code nThreads}.
+   *
+   * @param args zero or more {@code key=value} overrides of the defaults declared below
+   * @throws IllegalArgumentException if an argument is not of the form {@code key=value}, its key
+   *     is unknown, or a numeric value cannot be parsed
+   * @throws Exception if running the computation fails
+   */
+  public static void main(String[] args) throws Exception {
+    String className = MethodHandles.lookup().lookupClass().getSimpleName();
+    System.err.println(
+        STR."""
+        ------------------------------------------------
+        Running \{className}
+        -------------------------------------------------
+        """);
+    Tracing.recordAppStart();
+    Runtime.getRuntime()
+        .addShutdownHook(
+            new Thread(
+                () -> {
+                  Tracing.recordEvent("In Shutdown hook");
+                }));
+
+    // First process in large chunks without coordination among threads
+    // Use chunkSizeBits for the large-chunk size
+    int chunkSizeBits = 20;
+
+    // For the last commonChunkFraction fraction of total work, use smaller chunk sizes
+    double commonChunkFraction = 0.03;
+
+    // Use commonChunkSizeBits for the small-chunk size
+    int commonChunkSizeBits = 18;
+
+    // Size of the hashtable (attempt to fit in L2 of 512KB of eval machine).
+    // Locale.ROOT keeps the comparison stable: with a locale-sensitive lower-casing (e.g. Turkish)
+    // the 'I' in the class name would not map to 'i' and the check would silently fail.
+    int hashtableSizeBits =
+        className.toLowerCase(java.util.Locale.ROOT).contains("nonidiomatic") ? 13 : 16;
+
+    // Reserve some number of bytes at the end to give us freedom in reading LONGs past ranges
+    int minReservedBytesAtFileTail = 9;
+
+    // Number of threads; a negative value lets the Runner pick the number of available processors
+    int nThreads = -1;
+
+    String inputFile = "measurements.txt";
+
+    // Parallelize unmap. Thread #n (n=1,2,..N) unmaps its bytebuffer when
+    // munmapFraction * n work remains.
+    double munmapFraction = 0.03;
+
+    boolean fakeAdvance = false;
+
+    for (String arg : args) {
+      // Split on the first '='. The value offset comes from the separator position rather than
+      // from the trimmed key, so whitespace around the key cannot shift it.
+      int separator = arg.indexOf('=');
+      if (separator < 0) {
+        throw new IllegalArgumentException("Expected key=value but got: " + arg);
+      }
+      String key = arg.substring(0, separator).trim();
+      String value = arg.substring(separator + 1).trim();
+      switch (key) {
+        case "chunkSizeBits" -> chunkSizeBits = Integer.parseInt(value);
+        case "commonChunkFraction" -> commonChunkFraction = Double.parseDouble(value);
+        case "commonChunkSizeBits" -> commonChunkSizeBits = Integer.parseInt(value);
+        case "hashtableSizeBits" -> hashtableSizeBits = Integer.parseInt(value);
+        case "inputFile" -> inputFile = value;
+        case "munmapFraction" -> munmapFraction = Double.parseDouble(value);
+        case "fakeAdvance" -> fakeAdvance = Boolean.parseBoolean(value);
+        case "nThreads" -> nThreads = Integer.parseInt(value);
+        default -> throw new IllegalArgumentException("Unknown argument: " + arg);
+      }
+    }
+
+    System.out.println(
+        new Runner(
+                Path.of(inputFile),
+                nThreads,
+                chunkSizeBits,
+                commonChunkFraction,
+                commonChunkSizeBits,
+                hashtableSizeBits,
+                minReservedBytesAtFileTail,
+                munmapFraction,
+                fakeAdvance)
+            .getSummaryStatistics());
+
+    Tracing.recordEvent("Final result printed");
+  }
+
+  public record AggregateResult(Map<String, Stat> tempStats) {
+
+    @Override
+    public String toString() {
+      return this.tempStats().entrySet().stream()
+          .sorted(Map.Entry.comparingByKey())
+          .map(entry -> "%s=%s".formatted(entry.getKey(), entry.getValue()))
+          .collect(Collectors.joining(", ", "{", "}"));
+    }
+  }
+
+    /**
+     * A reusable view of one chunk of the input file (mutable to avoid allocation), exposed both
+     * as offsets into a memory-mapped buffer and as raw virtual-memory addresses.
+     *
+     * <p>Callers position the view with {@link #setRange(long, long)} and then read the public
+     * fields; {@link #close(String)} releases every buffer this instance has mapped.
+     */
+    public static class ByteRange {
+
+        // Upper bound on the size of each mapped window (see setRange).
+        private static final int BUF_SIZE = 1 << 28;
+
+        private final long fileSize;
+        private final long maxEndPos; // Treat as if the file ends here
+        private final RandomAccessFile raf;
+        private final int shardIdx;
+        // Buffers that have been replaced or retired but not yet unmapped; drained by close(String)
+        private final List<MappedByteBuffer> unclosedBuffers = new ArrayList<>();
+        // ***************** What this is doing and why *****************
+        // Reading from ByteBuffer appears faster than from MemorySegment, but a ByteBuffer can be
+        // at most Integer.MAX_VALUE long; creating one ByteBuffer per chunk kills the native memory
+        // quota and the JVM crashes without further parameters.
+        //
+        // So, in this solution, create a sliding window of ByteBuffers:
+        // - Create a large ByteBuffer (up to BUF_SIZE bytes) that spans the chunk
+        // - If the next chunk falls outside the ByteBuffer, create another ByteBuffer that spans
+        // that chunk. Because chunks are allocated serially, a single large ByteBuffer spans
+        // many successive chunks.
+        // - In fact, for serial chunk allocation (which is friendly to page faulting anyway),
+        // the number of created ByteBuffers doesn't exceed [size of shard / window size], which is
+        // less than 100/thread and is comfortably below what the JVM can handle (65K) without
+        // further param tuning
+        // NOTE(review): the "less than 100/thread" estimate was stated for a (1<<30) window, while
+        // BUF_SIZE here is 1 << 28 - re-check the figure.
+        // - This enables a (relatively) allocation-free chunking implementation. Our chunking impl
+        // uses fine-grained chunking for the last, say, X% of work to avoid being hostage to
+        // stragglers
+
+        ///////////// The PUBLIC API
+
+        public MappedByteBuffer byteBuffer;
+        public long endAddress; // the virtual memory address corresponding to 'endInBuf'
+        public int endInBuf; // where the chunk ends inside the buffer
+        public long startAddress; // the virtual memory address corresponding to 'startInBuf'
+        public int startInBuf; // where the chunk starts inside the buffer
+
+        ///////////// Private State
+
+        long bufferBaseAddr; // buffer's base virtual memory address
+        long extentEnd; // byteBuffer's ending coordinate
+        long extentStart; // byteBuffer's begin coordinate
+
+        /**
+         * Creates an unpositioned range; call {@link #setRange(long, long)} before reading it.
+         *
+         * @param raf the input file; its length is read once here
+         * @param maxEndPos position treated as the end of the file when clamping range ends
+         * @param shardIdx shard index, passed to the tracing calls in {@link #close(String)} and
+         *     shown in {@link #toString()}
+         * @throws RuntimeException wrapping the {@link IOException} if the file length cannot be read
+         */
+        public ByteRange(RandomAccessFile raf, long maxEndPos, int shardIdx) {
+            this.raf = raf;
+            this.maxEndPos = maxEndPos;
+            this.shardIdx = shardIdx;
+            try {
+                this.fileSize = raf.length();
+            }
+            catch (IOException e) {
+                throw new RuntimeException(e);
+            }
+            bufferCleanSlate();
+        }
+
+        /**
+         * Retires the current buffer and releases every buffer mapped so far by this instance,
+         * recording the work under {@code closerId} via {@code Tracing}.
+         *
+         * @param closerId label passed to the tracing calls
+         */
+        public void close(String closerId) {
+            Tracing.recordWorkStart(closerId, shardIdx);
+            bufferCleanSlate();
+            for (MappedByteBuffer buf : unclosedBuffers) {
+                close(buf);
+            }
+            unclosedBuffers.clear();
+            Tracing.recordWorkEnd(closerId, shardIdx);
+        }
+
+        /**
+         * Positions this view on the given file range, remapping the underlying buffer if needed.
+         *
+         * <p>A non-zero {@code rangeStart} is advanced to the byte after the first {@code '\n'} at
+         * or after it. A {@code rangeEnd} below {@code maxEndPos} is extended by one byte;
+         * otherwise it is clamped to {@code maxEndPos}.
+         *
+         * @param rangeStart requested start position in the file
+         * @param rangeEnd requested end position in the file
+         */
+        public void setRange(long rangeStart, long rangeEnd) {
+            // Remap when the range starts before the current window or ends within 1024 bytes of
+            // (or beyond) the window's end.
+            if (rangeEnd + 1024 > extentEnd || rangeStart < extentStart) {
+                setByteBufferExtent(rangeStart, Math.min(rangeStart + BUF_SIZE, fileSize));
+            }
+
+            if (rangeStart > 0) {
+                rangeStart = 1 + nextNewLine(rangeStart);
+            }
+            else {
+                rangeStart = 0;
+            }
+
+            if (rangeEnd < maxEndPos) {
+                // rangeEnd = 1 + nextNewLine(rangeEnd); // not needed
+                rangeEnd = 1 + rangeEnd;
+            }
+            else {
+                rangeEnd = maxEndPos;
+            }
+
+            // Translate file positions into buffer offsets and absolute addresses.
+            startInBuf = (int) (rangeStart - extentStart);
+            endInBuf = (int) (rangeEnd - extentStart);
+            startAddress = bufferBaseAddr + startInBuf;
+            endAddress = bufferBaseAddr + endInBuf;
+        }
+
+    // Multi-line dump of the shard index, window extent, buffer offsets and addresses.
+    @Override
+    public String toString() {
+      return STR."""
+        ByteRange {
+          shard                 = \{shardIdx}
+          extentStart           = \{extentStart}
+          extentEnd             = \{extentEnd}
+          startInBuf            = \{startInBuf}
+          endInBuf              = \{endInBuf}
+          startAddress          = \{startAddress}
+          endAddress            = \{endAddress}
+        }
+        """;
+    }
+
+        /**
+         * Moves the current buffer (if any) onto the to-be-closed list and resets the extent and
+         * address fields to -1. The buffer offsets {@code startInBuf}/{@code endInBuf} are left
+         * untouched.
+         */
+        private void bufferCleanSlate() {
+            if (byteBuffer != null) {
+                unclosedBuffers.add(byteBuffer);
+                byteBuffer = null;
+            }
+            extentEnd = extentStart = bufferBaseAddr = startAddress = endAddress = -1;
+        }
+
+        /**
+         * Releases a mapped buffer by reflectively calling its {@code cleaner()} method and then
+         * {@code clean()} on the returned object.
+         */
+        private void close(MappedByteBuffer buffer) {
+            Method cleanerMethod = Reflection.findMethodNamed(buffer, "cleaner");
+            cleanerMethod.setAccessible(true);
+            Object cleaner = Reflection.invoke(buffer, cleanerMethod);
+
+            Method cleanMethod = Reflection.findMethodNamed(cleaner, "clean");
+            cleanMethod.setAccessible(true);
+            Reflection.invoke(cleaner, cleanMethod);
+        }
+
+        /** Returns the value of the buffer's {@code address()} method, obtained reflectively. */
+        private long getBaseAddr(MappedByteBuffer buffer) {
+            Method addressMethod = Reflection.findMethodNamed(buffer, "address");
+            addressMethod.setAccessible(true);
+            return (long) Reflection.invoke(buffer, addressMethod);
+        }
+
+        /**
+         * Returns the file position of the first {@code '\n'} at or after {@code pos}, scanning
+         * the current buffer. There is no explicit bound on the scan: it relies on a newline
+         * being present before the end of the mapped window.
+         */
+        private long nextNewLine(long pos) {
+            int nextPos = (int) (pos - extentStart);
+            while (byteBuffer.get(nextPos) != '\n') {
+                nextPos++;
+            }
+            return nextPos + extentStart;
+        }
+
+        /**
+         * Maps file bytes {@code [start, end)} read-only in native byte order as the current
+         * buffer, after retiring the previous one.
+         *
+         * <p>Extent is different from Range. Range is what needs to be processed. Extent is what
+         * the byte buffer can read without failing.
+         */
+        private void setByteBufferExtent(long start, long end) {
+            bufferCleanSlate();
+            try {
+                byteBuffer = raf.getChannel().map(MapMode.READ_ONLY, start, end - start);
+                byteBuffer.order(ByteOrder.nativeOrder());
+            }
+            catch (IOException e) {
+                throw new RuntimeException(e);
+            }
+            extentStart = start;
+            extentEnd = end;
+            bufferBaseAddr = getBaseAddr(byteBuffer);
+        }
+    }
+
+    /** Minimal precondition helpers; not instantiable. */
+    public static final class Checks {
+
+        private Checks() {
+        }
+
+        /**
+         * Validates a caller-supplied precondition.
+         *
+         * @param condition the precondition that must hold
+         * @throws IllegalArgumentException (without a message) if {@code condition} is false
+         */
+        public static void checkArg(boolean condition) {
+            if (condition) {
+                return;
+            }
+            throw new IllegalArgumentException();
+        }
+    }
+
+    /*
+     * ENTRY SHAPE
+     * Ensure alignment boundaries. 4 bytes on 4 byte, 2 bytes on 2 byte etc.
+     * 32 bytes per entry.
+     * 96 KB L1 cache. 2048 entries should fully fit
+     * -------------------
+     * str: 14 bytes [Defined by constant STR_FIELD_LEN]
+     * hash: 2 bytes
+     * cityNameOffset: 3 bytes // Index in city names array if len > STR_FIELD_LEN bytes
+     * len: 1 byte // Length of string, in bytes
+     * sum: 4 bytes
+     * count: 4 bytes
+     * max: 2 bytes
+     * min: 2 bytes
+     */
+    /**
+     * Movable cursor over one fixed-size entry stored in off-heap memory owned by an
+     * {@link EntryMeta}. Point it at an entry with {@link #setIndex(int)} or
+     * {@link #setBaseAddress(long)}, then use the accessors, which read and write through
+     * {@code Unsafely} at fixed offsets from the base address.
+     *
+     * <p>An entry whose length byte is 0 is treated as empty (see {@link #isPresent()}).
+     */
+    static class EntryData {
+
+        /** log2 of the entry size in bytes (32-byte entries). */
+        public static final int ENTRY_SIZE_BITS = 5;
+
+        /////////// OFFSETS ///////////////
+        private static final int OFFSET_STR = 0;
+        private static final int STR_FIELD_LEN = 14;
+        private static final int OFFSET_HASH = OFFSET_STR + STR_FIELD_LEN;
+        private static final int OFFSET_CITY_NAME_EXTRA = OFFSET_HASH + 2;
+        private static final int OFFSET_LEN = OFFSET_CITY_NAME_EXTRA + 3;
+        private static final int OFFSET_SUM = OFFSET_LEN + 1;
+        private static final int OFFSET_COUNT = OFFSET_SUM + 4;
+        private static final int OFFSET_MAX = OFFSET_COUNT + 4;
+        private static final int OFFSET_MIN = OFFSET_MAX + 2;
+
+        /** Number of name bytes stored inline in the entry; longer names spill to the arena. */
+        public static int strFieldLen() {
+            return STR_FIELD_LEN;
+        }
+
+        private final EntryMeta entryMeta;
+
+        private long baseAddress;
+
+        /**
+         * @param entryMeta the table this cursor addresses; supplies entry and overflow addresses
+         */
+        public EntryData(EntryMeta entryMeta) {
+            this.entryMeta = entryMeta;
+        }
+
+        /** Address of the entry this cursor currently points at. */
+        public long baseAddress() {
+            return baseAddress;
+        }
+
+        /**
+         * Reassembles the name from the inline bytes plus, for names longer than
+         * {@link #strFieldLen()}, the remainder stored in the city-names arena.
+         *
+         * @return the name decoded as UTF-8
+         */
+        public String cityNameString() {
+            int len = len();
+            byte[] zeBytes = new byte[len];
+
+            int inlineLen = Math.min(len, strFieldLen());
+            for (int i = 0; i < inlineLen; i++) {
+                zeBytes[i] = Unsafely.readByte(baseAddress + i);
+            }
+
+            if (len > strFieldLen()) {
+                int rem = len - strFieldLen();
+                long ptr = entryMeta.cityNamesAddress(cityNamesOffset());
+                for (int i = 0; i < rem; i++) {
+                    zeBytes[strFieldLen() + i] = Unsafely.readByte(ptr + i);
+                }
+            }
+
+            // Decode explicitly as UTF-8 rather than with the JVM's default charset, which can be
+            // overridden from the command line.
+            return new String(zeBytes, java.nio.charset.StandardCharsets.UTF_8);
+        }
+
+        /**
+         * Offset of the overflow bytes inside the city-names arena (low 24 bits of the int read at
+         * the extra-offset field; the top 8 bits of that int are masked off).
+         */
+        public int cityNamesOffset() {
+            return Unsafely.readInt(baseAddress + OFFSET_CITY_NAME_EXTRA) & 0xFFFFFF;
+        }
+
+        /** Number of data points folded into this entry. */
+        public int count() {
+            return Unsafely.readInt(baseAddress + OFFSET_COUNT);
+        }
+
+        /** The 16-bit hash stored with the entry. */
+        public short hash16() {
+            return Unsafely.readShort(baseAddress + OFFSET_HASH);
+        }
+
+        /** Index of the current entry within its table, derived from the base address. */
+        public int index() {
+            return (int) ((baseAddress() - entryMeta.baseAddress(0)) >> ENTRY_SIZE_BITS);
+        }
+
+        /**
+         * Initializes the current (empty) entry with a first data point.
+         *
+         * @param srcAddr address of the name bytes to copy
+         * @param len length of the name in bytes
+         * @param hash16 16-bit hash to store with the entry
+         * @param temperature first value; becomes sum, min and max, with count set to 1
+         */
+        public void init(long srcAddr, int len, short hash16, short temperature) {
+            // Copy the string: the first strFieldLen() bytes inline, any remainder to the arena
+            Unsafely.copyMemory(srcAddr, strAddress(), Math.min(len, EntryData.strFieldLen()));
+            if (len > EntryData.strFieldLen()) {
+                int remaining = len - EntryData.strFieldLen();
+                int cityNamesOffset = entryMeta.getAndIncrementCityNames(remaining);
+                Unsafely.copyMemory(
+                        srcAddr + EntryData.strFieldLen(),
+                        entryMeta.cityNamesAddress(cityNamesOffset),
+                        remaining);
+                setCityNameOffset(cityNamesOffset, len);
+            }
+            else {
+                setLen((byte) len);
+            }
+
+            // and then update the others
+            setHash16(hash16);
+            setSum(temperature);
+            setCount(1);
+            setMax(temperature);
+            setMin(temperature);
+        }
+
+        /** Whether the current slot holds an entry, i.e. its stored length is positive. */
+        public boolean isPresent() {
+            return len() > 0;
+        }
+
+        /** Stored name length in bytes, read as a signed byte. */
+        public int len() {
+            return Unsafely.readByte(baseAddress + OFFSET_LEN);
+        }
+
+        public short max() {
+            return Unsafely.readShort(baseAddress + OFFSET_MAX);
+        }
+
+        public short min() {
+            return Unsafely.readShort(baseAddress + OFFSET_MIN);
+        }
+
+        public void setBaseAddress(long baseAddress) {
+            this.baseAddress = baseAddress;
+        }
+
+        /**
+         * Stores the arena offset and the name length with a single 4-byte write.
+         *
+         * @param cityNamesOffset offset of the overflow bytes in the city-names arena
+         * @param len full name length in bytes
+         */
+        public void setCityNameOffset(int cityNamesOffset, int len) {
+            // The 24 here is a bit shift, not an offset: the low 3 bytes carry the arena offset
+            // and the top byte carries the length, so one int write fills both fields.
+            // NOTE(review): the length byte lands at OFFSET_LEN only if the int is stored
+            // little-endian - confirm how Unsafely.setInt lays out bytes on the target platform.
+            Unsafely.setInt(baseAddress + OFFSET_CITY_NAME_EXTRA, cityNamesOffset | (len << 24));
+        }
+
+        public void setCount(int value) {
+            Unsafely.setInt(baseAddress + OFFSET_COUNT, value);
+        }
+
+        public void setHash16(short value) {
+            Unsafely.setShort(baseAddress + OFFSET_HASH, value);
+        }
+
+        /** Points this cursor at the {@code index}-th entry of the table. */
+        public void setIndex(int index) {
+            setBaseAddress(entryMeta.baseAddress(index));
+        }
+
+        public void setLen(byte value) {
+            Unsafely.setByte(baseAddress + OFFSET_LEN, value);
+        }
+
+        public void setMax(short value) {
+            Unsafely.setShort(baseAddress + OFFSET_MAX, value);
+        }
+
+        public void setMin(short value) {
+            Unsafely.setShort(baseAddress + OFFSET_MIN, value);
+        }
+
+        public void setSum(int value) {
+            Unsafely.setInt(baseAddress + OFFSET_SUM, value);
+        }
+
+        /** Snapshot of the current entry's aggregates as a {@code Stat}. */
+        public Stat stat() {
+            return new Stat(min(), max(), sum(), count());
+        }
+
+        /** Address of the inline name bytes of the current entry. */
+        public long strAddress() {
+            return baseAddress + OFFSET_STR;
+        }
+
+        public int sum() {
+            return Unsafely.readInt(baseAddress + OFFSET_SUM);
+        }
+
+    /** Multi-line dump of the current entry's aggregates, for debugging. */
+    @Override
+    public String toString() {
+      return STR."""
+        min = \{min()}
+        max = \{max()}
+        count = \{count()}
+        sum = \{sum()}
+        """;
+    }
+
+        /** Folds one more data point into the current entry's min, max, count and sum. */
+        public void update(short temperature) {
+            setMin((short) Math.min(min(), temperature));
+            setMax((short) Math.max(max(), temperature));
+            setCount(count() + 1);
+            setSum(sum() + temperature);
+        }
+
+        /**
+         * Updates the current entry with {@code temperature} if it holds the given name.
+         *
+         * @param entryMeta table used to resolve the overflow bytes of long names
+         * @param srcAddr address of the candidate name bytes
+         * @param len candidate name length in bytes
+         * @param hash16 candidate 16-bit hash
+         * @param temperature value to fold in on a match
+         * @return {@code true} if the entry matched and was updated, {@code false} otherwise
+         */
+        public boolean updateOnMatch(
+                                     EntryMeta entryMeta, long srcAddr, int len, short hash16, short temperature) {
+
+            // Quick paths: cheap rejections before comparing bytes
+            if (len() != len) {
+                return false;
+            }
+            if (hash16() != hash16) {
+                return false;
+            }
+
+            // Actual string comparison
+            if (len <= STR_FIELD_LEN) {
+                if (!Unsafely.matches(srcAddr, strAddress(), len)) {
+                    return false;
+                }
+            }
+            else {
+                // Long name: compare the inline prefix, then the remainder kept in the arena
+                if (!Unsafely.matches(srcAddr, strAddress(), STR_FIELD_LEN)) {
+                    return false;
+                }
+                if (!Unsafely.matches(
+                        srcAddr + STR_FIELD_LEN,
+                        entryMeta.cityNamesAddress(cityNamesOffset()),
+                        len - STR_FIELD_LEN)) {
+                    return false;
+                }
+            }
+            update(temperature);
+            return true;
+        }
+    }
+
+    /**
+     * Metadata for the collection of entries: owns the off-heap entry table and the arena that
+     * holds the bytes of names too long to fit inline in an entry.
+     */
+    static class EntryMeta {
+
+        /** Size in bytes of the arena for name bytes that overflow the inline field. */
+        private static final int CITY_NAMES_CAPACITY_BYTES = 1 << 17;
+
+        /** Widens a short to an int, treating its bits as unsigned (result in 0..65535). */
+        static int toIntFromUnsignedShort(short x) {
+            return Short.toUnsignedInt(x);
+        }
+
+        private final long baseAddress;
+        private final long cityNamesBaseAddress; // For city names that overflow Entry.STR_FIELD_LEN
+        private final int hashMask;
+        private final int n_entries;
+        private final int n_entriesBits;
+        private long cityNamesEndAddress; // [cityNamesBaseAddress, cityNamesEndAddress)
+
+        /**
+         * Allocates a zeroed table of {@code 1 << n_entriesBits} entries.
+         *
+         * @param n_entriesBits log2 of the number of entries
+         * @param oldEntryMeta table being replaced, or {@code null}. When non-null its city-names
+         *     arena and current fill level are reused, so arena offsets stored in copied entries
+         *     stay valid; otherwise a fresh arena is allocated.
+         */
+        EntryMeta(int n_entriesBits, EntryMeta oldEntryMeta) {
+            this.n_entries = 1 << n_entriesBits;
+            this.hashMask = (1 << n_entriesBits) - 1;
+            this.n_entriesBits = n_entriesBits;
+            this.baseAddress = Unsafely.allocateZeroedCacheLineAligned(this.n_entries << EntryData.ENTRY_SIZE_BITS);
+            if (oldEntryMeta == null) {
+                this.cityNamesBaseAddress = Unsafely.allocateZeroedCacheLineAligned(CITY_NAMES_CAPACITY_BYTES);
+                this.cityNamesEndAddress = cityNamesBaseAddress;
+            }
+            else {
+                this.cityNamesBaseAddress = oldEntryMeta.cityNamesBaseAddress;
+                this.cityNamesEndAddress = oldEntryMeta.cityNamesEndAddress;
+            }
+        }
+
+        /** Address inside the city-names arena for the given offset. */
+        public long cityNamesAddress(int extraLenOffset) {
+            return cityNamesBaseAddress + extraLenOffset;
+        }
+
+        /** Index of an entry with the given 16-bit hash, interpreted as unsigned. */
+        public int indexFromHash16(short hash16) {
+            return indexFromHash32(toIntFromUnsignedShort(hash16));
+        }
+
+        public int nEntriesBits() {
+            return n_entriesBits;
+        }
+
+        // Base Address of nth entry
+        long baseAddress(int n) {
+            return baseAddress + ((long) n << EntryData.ENTRY_SIZE_BITS);
+        }
+
+        // Size of each entry
+        int entrySizeInBytes() {
+            return 1 << EntryData.ENTRY_SIZE_BITS;
+        }
+
+        /**
+         * Reserves {@code len} bytes (rounded up to a multiple of 8) in the city-names arena.
+         *
+         * @param len number of bytes needed
+         * @return offset of the reserved region relative to the start of the arena
+         * @throws IllegalStateException if the arena does not have enough room left; without this
+         *     check the caller would copy bytes past the end of the allocation
+         */
+        int getAndIncrementCityNames(int len) {
+            long ret = cityNamesEndAddress;
+            long newEnd = ret + (((len + 7) >> 3) << 3); // use aligned 8 bytes
+            if (newEnd - cityNamesBaseAddress > CITY_NAMES_CAPACITY_BYTES) {
+                throw new IllegalStateException(
+                        "City names arena exhausted: capacity " + CITY_NAMES_CAPACITY_BYTES
+                                + " bytes, requested end offset " + (newEnd - cityNamesBaseAddress));
+            }
+            cityNamesEndAddress = newEnd;
+            return (int) (ret - cityNamesBaseAddress);
+        }
+
+        // Index of an entry with given hash32
+        int indexFromHash32(int hash32) {
+            return hash32 & hashMask;
+        }
+
+        // Number of entries
+        int nEntries() {
+            return n_entries;
+        }
+
+        // Next slot in probe order, wrapping around at the end of the table
+        int nextIndex(int index) {
+            return (index + 1) & hashMask;
+        }
+    }
+
+    static class Hashtable {
+
+        // State
+        int n_filledEntries;
+        // A single Entry to avoid local allocation
+        private EntryData entry;
+        private EntryMeta entryMeta;
+        // Invariants
+        // hash16 = (short) hash32
+        // index = hash16 & hashMask
+        private int hashHits = 0, hashMisses = 0;
+
+        Hashtable(int slotsBits) {
+            entryMeta = new EntryMeta(slotsBits, null);
+            this.entry = new EntryData(entryMeta);
+        }
+
+        public void addDataPoint(long srcAddr, int len, int hash32, short temperature) {
+            // hashHits++;
+            for (int index = entryMeta.indexFromHash32(hash32);; index = entryMeta.nextIndex(index)) {
+                entry.setIndex(index);
+
+                if (!entry.isPresent()) {
+                    entry.init(srcAddr, len, (short) hash32, temperature);
+                    onNewEntry();
+                    return;
+                }
+
+                if (entry.updateOnMatch(entryMeta, srcAddr, len, (short) hash32, temperature)) {
+                    return;
+                }
+                // hashMisses++;
+            }
+        }
+
+    public AggregateResult result() {
+      Map<String, Stat> map = new LinkedHashMap<>(5_000);
+      for (int i = 0; i < entryMeta.nEntries(); i++) {
+        entry.setIndex(i);
+        if (entry.isPresent()) {
+          map.put(entry.cityNameString(), entry.stat());
+        }
+      }
+      System.err.println(
+          STR."""
+        HashHits = \{hashHits}
+        HashMisses = \{hashMisses} (\{hashMisses * 100.0 / hashHits})
+        """);
+      return new AggregateResult(map);
+    }
+
+        private EntryData getNewEntry(EntryData oldEntry, EntryMeta newEntryMeta) {
+            EntryData newEntry = new EntryData(newEntryMeta);
+            for (int index = newEntryMeta.indexFromHash16(oldEntry.hash16());; index = newEntryMeta.nextIndex(index)) {
+                newEntry.setIndex(index);
+                if (!newEntry.isPresent()) {
+                    return newEntry;
+                }
+            }
+        }
+
+        private void onNewEntry() {
+            if (++n_filledEntries == 450) {
+                reHash(16);
+            }
+        }
+
+        private void reHash(int new_N_EntriesBits) {
+            EntryMeta oldEntryMeta = this.entryMeta;
+            EntryData oldEntry = new EntryData(oldEntryMeta);
+            Checks.checkArg(new_N_EntriesBits <= 16);
+            Checks.checkArg(new_N_EntriesBits > oldEntryMeta.nEntriesBits());
+            EntryMeta newEntryMeta = new EntryMeta(new_N_EntriesBits, oldEntryMeta);
+            for (int i = 0; i < oldEntryMeta.nEntries(); i++) {
+                oldEntry.setIndex(i);
+                if (oldEntry.isPresent()) {
+                    Unsafely.copyMemory(
+                            oldEntry.baseAddress(),
+                            getNewEntry(oldEntry, newEntryMeta).baseAddress(),
+                            oldEntryMeta.entrySizeInBytes());
+                }
+            }
+            this.entryMeta = newEntryMeta;
+            this.entry = new EntryData(this.entryMeta);
+        }
+    }
+
+    /**
+     * Source of work ranges for the shard-processing threads, addressed by shard index.
+     *
+     * <p>NOTE(review): the contracts below are inferred from the signatures and from how
+     * {@code Runner} uses this interface; confirm them against the implementation.
+     */
+    public interface LazyShardQueue {
+
+        /**
+         * Releases whatever the queue holds for the given shard. {@code Runner} calls this once
+         * per shard, after that shard's task has completed.
+         *
+         * @param closerId label identifying the caller (presumably used for tracing - confirm)
+         * @param shardIdx index of the shard to close
+         */
+        void close(String closerId, int shardIdx);
+
+        /**
+         * Returns the work, if any, at the tail end of the file for the given index.
+         *
+         * @param idx presumably a shard index - TODO confirm
+         * @return the range to process, or empty if there is none for {@code idx}
+         */
+        Optional<ByteRange> fileTailEndWork(int idx);
+
+        /**
+         * Returns the next range for the given shard.
+         *
+         * <p>NOTE(review): how exhaustion is signalled is not visible from this interface.
+         *
+         * @param shardIdx index of the requesting shard
+         */
+        ByteRange take(int shardIdx);
+    }
+
+    /**
+     * Thin reflection helpers that turn reflective failures into {@link RuntimeException}s so
+     * that call sites need no checked-exception handling. Not instantiable.
+     */
+    static final class Reflection {
+
+        private Reflection() {
+        }
+
+        /**
+         * Looks up a public method on the runtime class of {@code object}.
+         *
+         * @param object instance whose class is searched
+         * @param name method name
+         * @param paramTypes the method's parameter types, in declaration order
+         * @return the matching method
+         * @throws RuntimeException wrapping the {@link NoSuchMethodException} if no such method exists
+         */
+        static Method findMethodNamed(Object object, String name, Class<?>... paramTypes) {
+            try {
+                return object.getClass().getMethod(name, paramTypes);
+            }
+            catch (NoSuchMethodException e) {
+                throw new RuntimeException(e);
+            }
+        }
+
+        /**
+         * Invokes {@code method} on {@code receiver}.
+         *
+         * @param receiver object to invoke the method on
+         * @param method the method to call
+         * @param params arguments for the call
+         * @return the method's result (boxed if primitive, {@code null} for void)
+         * @throws RuntimeException wrapping any exception raised by the reflective call
+         */
+        static Object invoke(Object receiver, Method method, Object... params) {
+            try {
+                return method.invoke(receiver, params);
+            }
+            catch (Exception e) {
+                // Deliberately broad: every failure of the reflective call is reported uniformly.
+                throw new RuntimeException(e);
+            }
+        }
+    }
+
+    public static class Runner {
+
+        private final double commonChunkFraction;
+        private final int commonChunkSizeBits;
+        private final boolean fakeAdvance;
+        private final int hashtableSizeBits;
+        private final Path inputFile;
+        private final int minReservedBytesAtFileTail;
+        private final double munmapFraction;
+        private final int nThreads;
+        private final int shardSizeBits;
+
+        public Runner(
+                      Path inputFile,
+                      int nThreads,
+                      int chunkSizeBits,
+                      double commonChunkFraction,
+                      int commonChunkSizeBits,
+                      int hashtableSizeBits,
+                      int minReservedBytesAtFileTail,
+                      double munmapFraction,
+                      boolean fakeAdvance) {
+            this.inputFile = inputFile;
+            this.nThreads = nThreads;
+            this.shardSizeBits = chunkSizeBits;
+            this.commonChunkFraction = commonChunkFraction;
+            this.commonChunkSizeBits = commonChunkSizeBits;
+            this.hashtableSizeBits = hashtableSizeBits;
+            this.minReservedBytesAtFileTail = minReservedBytesAtFileTail;
+            this.munmapFraction = munmapFraction;
+            this.fakeAdvance = fakeAdvance;
+        }
+
+        AggregateResult getSummaryStatistics() throws Exception {
+            int nThreads = this.nThreads < 0 ? Runtime.getRuntime().availableProcessors() : this.nThreads;
+
+            LazyShardQueue shardQueue = new SerialLazyShardQueue(
+                    1L << shardSizeBits,
+                    inputFile,
+                    nThreads,
+                    commonChunkFraction,
+                    commonChunkSizeBits,
+                    minReservedBytesAtFileTail,
+                    munmapFraction,
+                    fakeAdvance);
+
+            ExecutorService executorService = Executors.newFixedThreadPool(
+                    nThreads,
+                    runnable -> {
+                        Thread thread = new Thread(runnable);
+                        thread.setDaemon(true);
+                        return thread;
+                    });
+
+            List<Future<AggregateResult>> results = new ArrayList<>();
+            for (int i = 0; i < nThreads; i++) {
+                final int shardIdx = i;
+                final Callable<AggregateResult> callable = () -> {
+                    Tracing.recordWorkStart("Shard", shardIdx);
+                    AggregateResult result = new ShardProcessor(shardQueue, hashtableSizeBits, shardIdx).processShard();
+                    Tracing.recordWorkEnd("Shard", shardIdx);
+                    return result;
+                };
+                results.add(executorService.submit(callable));
+            }
+            Tracing.recordEvent("Basic push time");
+
+            // This particular sequence of Futures is so that both merge and munmap() can work as shards
+            // finish their computation without blocking on the entire set of shards to complete. In
+            // particular, munmap() doesn't need to wait on merge.
+            // First, submit a task to merge the results and then submit a task to cleanup bytebuffers
+            // from completed shards.
+            Future<AggregateResult> resultFutures = executorService.submit(() -> merge(results));
+            // Note that munmap() is serial and not parallel and hence we use just one thread.
+            executorService.submit(() -> closeByteBuffers(results, shardQueue));
+
+            AggregateResult result = resultFutures.get();
+            Tracing.recordEvent("Merge results received");
+
+            Tracing.recordEvent("About to shutdown executor and wait");
+            executorService.shutdown();
+            executorService.awaitTermination(Long.MAX_VALUE, TimeUnit.MILLISECONDS);
+            Tracing.recordEvent("Executor terminated");
+
+            Tracing.analyzeWorkThreads(nThreads);
+            return result;
+        }
+
+        private void closeByteBuffers(
+                                      List<Future<AggregateResult>> results, LazyShardQueue shardQueue) {
+            int n = results.size();
+            boolean[] isDone = new boolean[n];
+            int remaining = results.size();
+            while (remaining > 0) {
+                for (int i = 0; i < n; i++) {
+                    if (!isDone[i] && results.get(i).isDone()) {
+                        remaining--;
+                        isDone[i] = true;
+                        shardQueue.close("Ending Cleaner", i);
+                    }
+                }
+            }
+        }
+
+        private AggregateResult merge(List<Future<AggregateResult>> results)
+                throws ExecutionException, InterruptedException {
+            Tracing.recordEvent("Merge start time");
+            Map<String, Stat> output = null;
+            boolean[] isDone = new boolean[results.size()];
+            int remaining = results.size();
+            // Let's be naughty and spin in a busy loop
+            while (remaining > 0) {
+                for (int i = 0; i < results.size(); i++) {
+                    if (!isDone[i] && results.get(i).isDone()) {
+                        isDone[i] = true;
+                        remaining--;
+                        if (output == null) {
+                            output = new TreeMap<>(results.get(i).get().tempStats());
+                        }
+                        else {
+                            for (Entry<String, Stat> entry : results.get(i).get().tempStats().entrySet()) {
+                                output.compute(
+                                        entry.getKey(),
+                                        (key, value) -> value == null ? entry.getValue() : Stat.merge(value, entry.getValue()));
+                            }
+                        }
+                    }
+                }
+            }
+            Tracing.recordEvent("Merge end time");
+            return new AggregateResult(output);
+        }
+    }
+
+    public static class SerialLazyShardQueue implements LazyShardQueue {
+
+        private static long roundToNearestLowerMultipleOf(long divisor, long value) {
+            return value - value % divisor; // drop the remainder; same as (value / divisor) * divisor
+        }
+
+        private final ByteRange[] byteRanges;
+        private final long chunkSize;
+        private final long commonChunkSize;
+        private final AtomicLong commonPool;
+        private final long effectiveFileSize;
+        private final boolean fakeAdvance;
+        private final long fileSize;
+        private final long[] perThreadData;
+        private final RandomAccessFile raf;
+        private final SeqLock seqLock;
+
+        public SerialLazyShardQueue(
+                                    long chunkSize,
+                                    Path filePath,
+                                    int shards,
+                                    double commonChunkFraction,
+                                    int commonChunkSizeBits,
+                                    int fileTailReservedBytes,
+                                    double munmapFraction,
+                                    boolean fakeAdvance)
+                throws IOException {
+            this.fakeAdvance = fakeAdvance;
+            Checks.checkArg(commonChunkFraction < 0.9 && commonChunkFraction >= 0);
+            Checks.checkArg(fileTailReservedBytes >= 0);
+            this.raf = new RandomAccessFile(filePath.toFile(), "r");
+            this.fileSize = raf.length();
+            fileTailReservedBytes = fileTailReservedBytes == 0
+                    ? 0
+                    : consumeToPreviousNewLineExclusive(raf, fileTailReservedBytes);
+            this.effectiveFileSize = fileSize - fileTailReservedBytes;
+
+            // Common pool
+            long commonPoolStart = Math.min(
+                    roundToNearestLowerMultipleOf(
+                            chunkSize, (long) (effectiveFileSize * (1 - commonChunkFraction))),
+                    effectiveFileSize);
+            this.commonPool = new AtomicLong(commonPoolStart);
+            this.commonChunkSize = 1L << commonChunkSizeBits;
+
+            // Distribute chunks to shards
+            this.perThreadData = new long[shards << 4]; // thread idx -> 16*idx to avoid cache line conflict
+            for (long i = 0,
+                    currentStart = 0,
+                    remainingChunks = (commonPoolStart + chunkSize - 1) / chunkSize; i < shards; i++) {
+                long remainingShards = shards - i;
+                long currentChunks = (remainingChunks + remainingShards - 1) / remainingShards;
+                // Shard i handles: [currentStart, currentStart + currentChunks * chunkSize)
+                int pos = (int) i << 4;
+                perThreadData[pos] = currentStart; // next chunk begin
+                perThreadData[pos + 1] = currentStart + currentChunks * chunkSize; // shard end
+                perThreadData[pos + 2] = currentChunks; // active chunks remaining
+                // threshold below which need to shrink
+                // 0.03 is a practical number but the optimal strategy is this:
+                // Shard number N (1-based) should unmap as soon as it completes (R/(R+1))^N fraction of
+                // its work, where R = relative speed of unmap compared to the computation.
+                // For our problem, R ~ 75 because unmap unmaps 30GB/sec (but, it is serial) while
+                // cores go through data at the rate of 400MB/sec.
+                perThreadData[pos + 3] = (long) (currentChunks * (munmapFraction * (shards - i)));
+                perThreadData[pos + 4] = 1; // true iff munmap() hasn't been triggered yet
+                currentStart += currentChunks * chunkSize;
+                remainingChunks -= currentChunks;
+            }
+            this.chunkSize = chunkSize;
+
+            this.byteRanges = new ByteRange[shards << 4];
+            for (int i = 0; i < shards; i++) {
+                byteRanges[i << 4] = new ByteRange(raf, effectiveFileSize, i);
+            }
+
+            this.seqLock = new SeqLock();
+        }
+
+        @Override
+        public void close(String closerId, int shardIdx) {
+            byteRanges[shardIdx << 4].close(closerId);
+        }
+
+        @Override
+        public Optional<ByteRange> fileTailEndWork(int idx) {
+            if (idx != 0 || effectiveFileSize >= fileSize) {
+                return Optional.empty();
+            }
+            // Start one byte early (when possible) so the newline at effectiveFileSize - 1 is consumed.
+            final long tailStart = effectiveFileSize == 0 ? 0 : effectiveFileSize - 1;
+            final ByteRange tail = new ByteRange(raf, fileSize, 0);
+            tail.setRange(tailStart, fileSize);
+            return Optional.of(tail);
+        }
+
+        @Override
+        public ByteRange take(int shardIdx) {
+            // Try the shard's own range first; its bookkeeping lives in perThreadData[pos .. pos + 4]
+            final int pos = shardIdx << 4;
+            final long rangeStart;
+            final long rangeEnd;
+
+            if (perThreadData[pos + 2] >= 1) { // the shard still has private chunks left
+                rangeStart = perThreadData[pos];
+                rangeEnd = rangeStart + chunkSize;
+                // Don't do this in the if-check; it causes negative values that trigger intermediate
+                // cleanup
+                perThreadData[pos + 2]--;
+                if (!fakeAdvance) { // with fakeAdvance set, the same start offset is handed out again
+                    perThreadData[pos] = rangeEnd;
+                }
+            }
+            else {
+                rangeStart = commonPool.getAndAdd(commonChunkSize); // claim the next slice of the shared pool
+                // If that's exhausted too, nothing remains!
+                if (rangeStart >= effectiveFileSize) {
+                    return null;
+                }
+                rangeEnd = rangeStart + commonChunkSize;
+            }
+            // Below this shard's threshold and no intermediate close done yet: try one (non-blocking).
+            if (perThreadData[pos + 2] < perThreadData[pos + 3] && perThreadData[pos + 4] > 0) {
+                if (attemptIntermediateClose(shardIdx)) {
+                    perThreadData[pos + 4]--;
+                }
+            }
+
+            ByteRange chunk = byteRanges[pos]; // the shard's single ByteRange instance is reused on every call
+            chunk.setRange(rangeStart, rangeEnd);
+            return chunk;
+        }
+
+        private boolean attemptIntermediateClose(int shardIdx) {
+            if (!seqLock.acquire()) {
+                return false; // lock is taken right now; nothing was closed
+            }
+            close("Intermediate Cleaner", shardIdx);
+            seqLock.release();
+            return true;
+        }
+
+        private int consumeToPreviousNewLineExclusive(RandomAccessFile raf, int minReservedBytes) {
+            try {
+                long pos = Math.max(raf.length() - minReservedBytes - 1, -1); // last byte before the minimal tail
+                if (pos < 0) {
+                    return (int) raf.length(); // file is no longer than the minimal tail: reserve all of it
+                }
+                // Scan backwards from pos for '\n'; only the (at most) 513 bytes ending at pos are mapped.
+                long start = Math.max(pos - 512, 0);
+                ByteBuffer buf = raf.getChannel().map(MapMode.READ_ONLY, start, pos + 1 - start);
+                while (pos >= 0 && buf.get((int) (pos - start)) != '\n') {
+                    pos--; // NOTE(review): can step below 'start' if the window holds no '\n' and start > 0 - confirm
+                }
+                pos++; // first byte after the newline that was found (0 when the scan reached the file start)
+                return (int) (raf.length() - pos); // number of bytes from there to the end of the file
+            }
+            catch (Exception e) {
+                throw new RuntimeException(e);
+            }
+        }
+    }
+
+    /** A low-traffic non-blocking lock: {@code acquire()} never waits, it only reports whether it succeeded. */
+    static class SeqLock {
+
+        private final AtomicBoolean isOccupied = new AtomicBoolean(false);
+
+        boolean acquire() {
+            return !isOccupied.get() && isOccupied.compareAndSet(false, true); // read first, CAS only if it looks free
+        }
+
+        void release() {
+            isOccupied.set(false); // unconditional: does not check that the caller actually holds the lock
+        }
+    }
+
+    public static class ShardProcessor {
+
+        private final int shardIdx;
+        private final LazyShardQueue shardQueue;
+        private final FastShardProcessorState state;
+
+        public ShardProcessor(LazyShardQueue shardQueue, int hashtableSizeBits, int shardIdx) {
+            this.shardIdx = shardIdx;
+            this.shardQueue = shardQueue;
+            this.state = new FastShardProcessorState(hashtableSizeBits);
+        }
+
+        public AggregateResult processShard() {
+            return processShardReal();
+        }
+
+        private void processRange(ByteRange range) {
+            // processLine hands back the address at which the following line starts.
+            for (long cursor = range.startAddress; cursor < range.endAddress;) {
+                cursor = state.processLine(cursor);
+            }
+        }
+
+        private void processRangeSlow(ByteRange range) {
+            // Same walk as processRange, but through the naive byte-at-a-time parser.
+            for (long cursor = range.startAddress; cursor < range.endAddress;) {
+                cursor = state.processLineSlow(cursor);
+            }
+        }
+
+        private AggregateResult processShardReal() {
+            // First process the file tail work to give ourselves freedom to go past ranges in parsing
+            shardQueue.fileTailEndWork(shardIdx).ifPresent(this::processRangeSlow);
+
+            // Then drain the queue: keep taking ranges until take() returns null.
+            for (ByteRange r = shardQueue.take(shardIdx); r != null; r = shardQueue.take(shardIdx)) {
+                processRange(r);
+            }
+            return result();
+        }
+
+        private AggregateResult result() {
+            return state.result();
+        }
+    }
+
+    public static class FastShardProcessorState {
+
+        private static final long LEADING_ONE_BIT_MASK = 0x8080808080808080L;
+        private static final long ONE_MASK = 0x0101010101010101L;
+        private static final long SEMICOLON_MASK = 0x3b3b3b3b3b3b3b3bL;
+        private final Hashtable hashtable;
+        private final Map<String, Stat> slowProcessStats = new HashMap<>();
+
+        public FastShardProcessorState(int slotsBits) {
+            this.hashtable = new Hashtable(slotsBits);
+            Checks.checkArg(slotsBits <= 16); // since this.hashes is 'short'
+        }
+
+        public long processLine(long nextPos) {
+            final long origPos = nextPos;
+
+            // Trying to extract this into a function made it slower.. so, leaving it at inlining.
+            // It's a pity since the extracted version was more elegant to read
+            long firstLong;
+            int hash = 0;
+            // Don't run Long.numberOfTrailingZeros in hasSemiColon; it is not needed to establish
+            // whether there's a semicolon; only needed for pin-pointing length of the tail.
+            long s = hasSemicolon(firstLong = Unsafely.readLong(nextPos));
+            final int trailingZeroes;
+            if (s == 0) { // no ';' in the first 8 bytes: hash the whole first word, then scan 8 bytes at a time
+                hash = doHash(firstLong);
+                do {
+                    nextPos += 8;
+                    s = hasSemicolon(Unsafely.readLong(nextPos));
+                } while (s == 0);
+                trailingZeroes = Long.numberOfTrailingZeros(s) + 1; // 8, 16, 24, .. # past ;
+            }
+            else { // ';' is inside the first word: hash only the bits below the ';' byte
+                trailingZeroes = Long.numberOfTrailingZeros(s) + 1; // 8, 16, 24, .. # past ;
+                hash = doHash(firstLong & maskOf(trailingZeroes - 8));
+            }
+            // Sometimes we do mix a tail of length 0..
+            nextPos += (trailingZeroes >> 3); // bits -> bytes; nextPos now points just past the ';'
+            // readTemperature packs (value << 3) + bytesToConsume; the bytes before the ';' form the key.
+            final int temp = readTemperature(nextPos);
+            hashtable.addDataPoint(origPos, (int) (nextPos - 1 - origPos), hash, (short) (temp >> 3));
+            return nextPos + (temp & 7);
+        }
+
+        /**
+         * Naive byte-at-a-time parser, used only for the tail part of the file. It deliberately shares
+         * no hashing code with the fast version, so the two never have to be kept in sync; readings
+         * are collected in {@code slowProcessStats}. Returns the address just past the newline.
+         */
+        public long processLineSlow(long nextPos) {
+            long cursor = nextPos;
+            final ByteArrayOutputStream name = new ByteArrayOutputStream();
+            for (byte b = Unsafely.readByte(cursor++); b != ';'; b = Unsafely.readByte(cursor++)) {
+                name.write(b);
+            }
+
+            // Every byte up to the newline other than '-' and '.' is folded in as a decimal digit.
+            final boolean negative = Unsafely.readByte(cursor) == '-';
+            int magnitude = 0;
+            for (byte b = Unsafely.readByte(cursor++); b != '\n'; b = Unsafely.readByte(cursor++)) {
+                if (b == '-' || b == '.') {
+                    continue;
+                }
+                magnitude = magnitude * 10 + (b - '0');
+            }
+
+            final int temperature = negative ? -magnitude : magnitude;
+            updateStat(slowProcessStats, name.toString(), Stat.firstReading(temperature));
+            return cursor;
+        }
+
+        public AggregateResult result() {
+            final AggregateResult fastResult = hashtable.result();
+            // Fold whatever the slow path collected into the fast-path result. This mutates the
+            // map exposed by the result in place; when the slow path saw nothing, that map is
+            // never even touched.
+            slowProcessStats.forEach(
+                    (station, stat) -> updateStat(fastResult.tempStats(), station, stat));
+
+            return fastResult;
+        }
+
+        int readTemperature(long nextPos) {
+            // This Dependency chain
+            // read -> shift -> xor -> compare -> 2 in parallel [ shift -> read ] -> add -> shift
+            // Chain latency: 2 reads + 2 add + 4 logical [assuming compare = add]
+            // vs
+            // Prior Dependency chain (slightly optimized by hand)
+            // read -> compare to '-' -> read -> compare to '.' -> 3 in parallel [read -> imul] -> add
+            // Chain latency: 3 reads + 3 add + 1 mul [assuming compare = add]
+            long data = Unsafely.readLong(nextPos);
+            long d = data ^ (data >> 4);
+            if ((data & 0xFF) == '-') {
+                return TemperatureLookup.firstNeg(d >> 8) + TemperatureLookup.secondNeg(d >> 24);
+            }
+            else {
+                return TemperatureLookup.firstPos(d >> 0) + TemperatureLookup.secondPos(d >> 16);
+            }
+        }
+
+        private int doHash(long value) {
+            long hash = 31L * (int) value + (int) (value >> 32);
+            return (int) (hash ^ (hash >> 17) ^ (hash >> 28));
+        }
+
+        private long hasSemicolon(long x) {
+            long a = (x ^ SEMICOLON_MASK);
+            return (a - ONE_MASK) & (~a) & LEADING_ONE_BIT_MASK;
+        }
+
+        private long maskOf(int bits) {
+            return ~(-1L << bits);
+        }
+
+        private void updateStat(Map<String, Stat> map, String key, Stat curValue) {
+            map.compute(key, (_, value) -> value == null ? curValue : Stat.merge(value, curValue));
+        }
+    }
+
+    /** Aggregate stats (min, max, sum, count) over integer readings; printed divided by 10. */
+    public static class Stat {
+
+        public static Stat firstReading(int temp) {
+            return new Stat(temp, temp, temp, 1);
+        }
+
+        public static Stat merge(Stat left, Stat right) {
+            final int lowest = Math.min(left.min, right.min);
+            final int highest = Math.max(left.max, right.max);
+            return new Stat(lowest, highest, left.sum + right.sum, left.count + right.count);
+        }
+
+        long count;
+        long sum;
+        int min;
+        int max;
+
+        public Stat(int min, int max, long sum, long count) {
+            this.min = min;
+            this.max = max;
+            this.sum = sum;
+            this.count = count;
+        }
+
+        // Caution: mutates this instance (and returns it) instead of allocating a new Stat
+        public Stat mergeReading(int curTemp) {
+            // Can this be improved further?
+            // Assuming random values for curTemp, min (and max) gets updated roughly log(N)/N
+            // fraction of the time (a small number), so both tests below are almost always false.
+            // A reading at or below min only replaces min; anything else is tested against max.
+            if (curTemp <= min) {
+                min = curTemp;
+            }
+            else if (curTemp > max) {
+                max = curTemp;
+            }
+            sum += curTemp;
+            count++;
+            return this;
+        }
+
+        @Override
+        public String toString() {
+            // min/mean/max with one decimal each, after dividing the stored values by 10.
+            final double mean = sum / 10.0 / count;
+            return "%.1f/%.1f/%.1f".formatted(min / 10.0, mean, max / 10.0);
+        }
+    }
+
+    /**
+     * Lookup table for temperature parsing.
+     *
+     * <pre>
+     * 0       0011-0000
+     * 9       0011-1001
+     * .       0010-1110
+     * \n      0000-1010
+     *
+     * Notice that there's no overlap in the last 4 bits. This means, if we are given two successive
+     * bytes X, Y all of which belong to the above characters, we can REVERSIBLY hash it to
+     * a single byte by doing 8-bit-hash = (last 4 bits of X) concat (last 4 bits of Y).
+     *
+     * Such a hash requires a few more operations than ideal. A more practical hash is:
+     * (X << 4) ^ Y ^ (Y >> 4), truncated to 8 bits. This means if you read 4 bytes after the '-',
+     * L = X Y Z W, where each of X Y Z W is a byte, then,
+     * L ^ (L >> 4) = D hash(X, Y) hash(Y, Z) hash(Z, W) where D = don't care. In other words, we
+     * can SWAR the hash.
+     *
+     * This has potential for minor conflicts; for e.g. (3, NewLine) collides with (0, 9). But, we
+     * don't have any collisions between two digits. That is (x, y) will never collide with (a, b)
+     * where x, y, a, b are digits (proof left as an exercise, lol). Along with a couple of other
+     * such no-conflict observations, it suffices for our purposes.
+     *
+     * If we just precompute some values like
+     * - BigTable[hash(X,Y)] = 100*X + 10*Y
+     * - SmallTable[hash(Z,W)] = 10*Z + W
+     *
+     * where potentially X, Y, Z, W can be '.' or '\n', (and the arithmetic adjusted), we can lookup
+     * the temperature pieces from BigTable and SmallTable and add them together.
+     * </pre>
+     *
+     * <p>This class is an implementation of the above idea. The lookup tables being 256 ints long
+     * will always be resident in L1 cache. What remains then is to also add the information on how
+     * much input is to be consumed; i.e. count the - and newlines too. That can be piggy backed on
+     * top of the values.
+     *
+     * <p>FWIW, this lookup appears to have reduced the temperature reading overhead substantially on
+     * a Ryzen 7950X machine. But, it wasn't done systematically; so, YMMV.
+     */
+    public static class TemperatureLookup {
+
+        // Second is the smaller (units place)
+        // First is the larger (100 & 10)
+
+        // _NEG tables simply negate the value so that call-site can always simply add the values from
+        // the first and second units. Call-sites adding-up First and Second units adds up the
+        // amount of input to consume.
+
+        // Here, 2 is the amount of bytes consumed. This informs how much the reading pointer
+        // should move.
+        // For pattern XY value = ((-100*X -10*Y) << 3) + 2 [2 = 1 for X, 1 for Y]
+        // For pattern Y. value = ((-10*Y) << 3) + 2 [2 = 1 for Y, 1 for .]
+        private static final int[] FIRST_NEG = make(true, true);
+
+        // For pattern XY value = ((100*X + 10*Y) << 3) + 2
+        // For pattern Y. value = ((10*Y) << 3) + 2
+        private static final int[] FIRST_POS = make(true, false);
+
+        // We count newline and any initial '-' as part of SECOND
+        // For pattern .Z value = (-Z << 3) + 2 + 2 [1 each for . and Z, 1 for newline, 1 for minus]
+        // For pattern Zn value = (-Z << 3) + 1 + 2 [1 for Z, 1 for newline, 1 for minus]
+        private static final int[] SECOND_NEG = make(false, true);
+
+        // For pattern .Z value = (Z << 3) + 2 + 1 [1 each for . and Z, 1 for newline]
+        // For pattern Zn value = (Z << 3) + 1 + 1 [1 for Z, 1 for newline]
+        private static final int[] SECOND_POS = make(false, false);
+
+        public static int firstNeg(long b) {
+            return FIRST_NEG[(int) (b & 255)];
+        }
+
+        public static int firstPos(long b) {
+            return FIRST_POS[(int) (b & 255)];
+        }
+
+        public static int secondNeg(long b) {
+            return SECOND_NEG[(int) (b & 255)];
+        }
+
+        public static int secondPos(long b) {
+            return SECOND_POS[(int) (b & 255)];
+        }
+
+        private static byte[] allDigits() {
+            byte[] out = new byte[10];
+            for (byte a = '0'; a <= '9'; a++) {
+                out[a - '0'] = a;
+            }
+            return out;
+        }
+
+        private static int hash(byte msb, byte lsb) {
+            // If K = [D msb lsb], then (K ^ (K>>4)) & 255 == hash(msb, lsb). D = don't care
+            return (msb << 4) ^ lsb ^ (lsb >> 4);
+        }
+
+        private static int[] make(boolean isFirst, boolean isNegative) {
+            int[] ret = new int[256];
+            boolean[] done = new boolean[256];
+
+            // Conventions: X = 100s place, Y = 10s place, Z = 1s place, n = new line
+
+            // All the cases to handle
+            // X Y . Z
+            // Y . Z n
+
+            // In little-endian order it becomes (byte-wise), shown in place value notation
+            // Z . Y X
+            // n Z . Y
+            // First = YX or .Y
+            // Second = Z. or nZ
+
+            // Pattern 'YX'
+            for (byte x : allDigits()) {
+                for (byte y : allDigits()) {
+                    int index = hash(y, x);
+                    // Shouldn't occur in Second
+                    int value = isFirst ? (y - '0') * 10 + (x - '0') * 100 : 12345;
+                    int delta = isFirst ? 2 : 12345;
+                    update(index, isNegative ? -value : value, delta, ret, done);
+                }
+            }
+
+            // Pattern 'Z.'
+            for (byte z : allDigits()) {
+                int index = hash(z, (byte) '.');
+                // shouldn't occur in First
+                int value = isFirst ? 12345 : (z - '0');
+                int delta = isFirst ? 12345 : 2;
+                update(index, isNegative ? -value : value, delta, ret, done);
+            }
+
+            // Pattern '.Y'
+            for (byte y : allDigits()) {
+                int index = hash((byte) '.', y);
+                // Shouldn't occur in Second
+                int value = isFirst ? 10 * (y - '0') : 12345;
+                int delta = isFirst ? 2 : 12345;
+                update(index, isNegative ? -value : value, delta, ret, done);
+            }
+
+            // Pattern 'nZ'
+            for (byte z : allDigits()) {
+                int index = hash((byte) '\n', z);
+                // shouldn't occur in First
+                int value = isFirst ? 12345 : (z - '0');
+                int delta = isFirst ? 12345 : 1;
+                update(index, isNegative ? -value : value, delta, ret, done);
+            }
+
+            if (!isFirst) {
+                // Adjust the deltas to reflect how much input needs to be consumed
+                // need to consume the newline and any - sign in front
+                for (int i = 0; i < ret.length; i++) {
+                    ret[i] += (isNegative ? 1 : 0) /* for - sign */ + 1 /* for new line */;
+                }
+            }
+            return ret;
+        }
+
+        private static void update(int index, int value, int delta, int[] ret, boolean[] done) {
+            index &= 255;
+            Checks.checkArg(!done[index]); // just a sanity check that our hashing is indeed reversible
+            ret[index] = (value << 3) | delta;
+            done[index] = true;
+        }
+    }
+
+    static class Tracing {
+
+        private static final Map<String, ThreadTimingsArray> knownWorkThreadEvents;
+        private static long startTime;
+
+        static {
+            // Maintain the ordering to be chronological in execution
+            // Map.of(..) screws up ordering
+            knownWorkThreadEvents = new LinkedHashMap<>();
+            for (String id : List.of("Shard", "Intermediate Cleaner", "Ending Cleaner", "Buffer Creation")) {
+                knownWorkThreadEvents.put(id, new ThreadTimingsArray(id, 1 << 10));
+            }
+        }
+
+        static void analyzeWorkThreads(int nThreads) {
+            for (ThreadTimingsArray array : knownWorkThreadEvents.values()) {
+                errPrint(array.analyze(nThreads));
+            }
+        }
+
+        static void recordAppStart() {
+            startTime = System.nanoTime();
+            printEvent("Start time", startTime);
+        }
+
+        static void recordEvent(String event) {
+            printEvent(event, System.nanoTime());
+        }
+
+        static void recordWorkEnd(String id, int threadId) {
+            knownWorkThreadEvents.get(id).recordEnd(threadId);
+        }
+
+        static void recordWorkStart(String id, int threadId) {
+            knownWorkThreadEvents.get(id).recordStart(threadId);
+        }
+
+        /////////////////////////////////////////////////////////////////////////////////////////////////
+
+        private static void errPrint(String message) {
+            System.err.println(message);
+        }
+
+    private static void printEvent(String message, long nanoTime) {
+      errPrint(STR."\{message} = \{(nanoTime - startTime) / 1_000_000}ms");
+    }
+
+        public static class ThreadTimingsArray {
+
+            private static String toString(long[] array) {
+                return Arrays.stream(array)
+                        .map(x -> x < 0 ? -1 : x)
+                        .mapToObj(x -> String.format("%6d", x))
+                        .collect(Collectors.joining(", ", "[ ", " ]"));
+            }
+
+            private final String id;
+            private final long[] timestamps;
+            private boolean hasData = false;
+
+            public ThreadTimingsArray(String id, int maxSize) {
+                this.timestamps = new long[maxSize];
+                this.id = id;
+            }
+
+      public String analyze(int nThreads) {
+        if (!hasData) {
+          return "%s has no thread timings data".formatted(id);
+        }
+        // Thread i owns slots 2*i (start) and 2*i+1 (end), so nThreads threads need 2*nThreads slots.
+        Checks.checkArg(2L * nThreads <= timestamps.length);
+        long minDuration = Long.MAX_VALUE, maxDuration = Long.MIN_VALUE;
+        long minBegin = Long.MAX_VALUE, maxCompletion = Long.MIN_VALUE;
+        long maxBegin = Long.MIN_VALUE, minCompletion = Long.MAX_VALUE;
+        long[] durationsMs = new long[nThreads];
+        long[] completionsMs = new long[nThreads];
+        long[] beginMs = new long[nThreads];
+        for (int i = 0; i < nThreads; i++) {
+          long beginNs = timestamps[2 * i] - startTime;
+          long endNs = timestamps[2 * i + 1] - startTime;
+          long durationNs = endNs - beginNs;
+          durationsMs[i] = durationNs / 1_000_000;
+          completionsMs[i] = endNs / 1_000_000;
+          beginMs[i] = beginNs / 1_000_000;
+
+          minDuration = Math.min(minDuration, durationNs);
+          maxDuration = Math.max(maxDuration, durationNs);
+          minBegin = Math.min(minBegin, beginNs);
+          maxBegin = Math.max(maxBegin, beginNs);
+          maxCompletion = Math.max(maxCompletion, endNs);
+          minCompletion = Math.min(minCompletion, endNs);
+        }
+        return STR."""
+        -------------------------------------------------------------------------------------------
+                                       \{id} Stats
+        -------------------------------------------------------------------------------------------
+        Max duration                              = \{maxDuration / 1_000_000} ms
+        Min duration                              = \{minDuration / 1_000_000} ms
+        Timespan[max(end)-min(start)]             = \{(maxCompletion - minBegin) / 1_000_000} ms [\{maxCompletion / 1_000_000} - \{minBegin / 1_000_000} ]
+        Completion Timespan[max(end)-min(end)]    = \{(maxCompletion - minCompletion) / 1_000_000} ms
+        Begin Timespan[max(begin)-min(begin)]     = \{(maxBegin - minBegin) / 1_000_000} ms
+        Average Duration                          = \{Arrays.stream(durationsMs)
+                                                            .average()
+                                                            .getAsDouble()} ms
+        Durations                                 = \{toString(durationsMs)} ms
+        Begin Timestamps                          = \{toString(beginMs)} ms
+        Completion Timestamps                     = \{toString(completionsMs)} ms
+        """;
+      }
+
+            public void recordEnd(int idx) {
+                timestamps[2 * idx + 1] = System.nanoTime();
+                hasData = true;
+            }
+
+            public void recordStart(int idx) {
+                timestamps[2 * idx] = System.nanoTime();
+                hasData = true;
+            }
+        }
+    }
+
+    static class Unsafely {
+
+        private static final Unsafe unsafe = getUnsafe();
+
+        public static long allocateZeroedCacheLineAligned(int size) {
+            long address = unsafe.allocateMemory(size + 63); // over-allocate so a 64-byte boundary fits inside
+            unsafe.setMemory(address, size + 63, (byte) 0);
+            return (address + 63) & ~63; // round up to a multiple of 64; the raw address is not kept anywhere
+        }
+
+        public static void copyMemory(long srcAddress, long destAddress, long byteCount) {
+            unsafe.copyMemory(srcAddress, destAddress, byteCount);
+        }
+
+        public static boolean matches(long srcAddr, long destAddress, int len) {
+            if (len < 8) { // one 8-byte read per side, compared on the low len*8 bits only (reads beyond len bytes)
+                return (readLong(srcAddr) & ~(-1L << (len << 3))) == (readLong(destAddress) & ~(-1L << (len << 3)));
+            }
+            if (readLong(srcAddr) != readLong(destAddress)) {
+                return false;
+            }
+            len -= 8;
+            // Same treatment for the second 8-byte word (len is now the count remaining after the first word).
+            if (len < 8) {
+                return (readLong(srcAddr + 8) & ~(-1L << (len << 3))) == (readLong(destAddress + 8) & ~(-1L << (len << 3)));
+            }
+            if (readLong(srcAddr + 8) != readLong(destAddress + 8)) {
+                return false;
+            }
+            len -= 8;
+            srcAddr += 16;
+            destAddress += 16;
+            // Beyond 16 bytes: compare whole longs, then at most one int, one short and one byte - no over-read.
+            int idx = 0;
+            for (; idx < (len & ~7); idx += 8) {
+                if (Unsafely.readLong(srcAddr + idx) != Unsafely.readLong(destAddress + idx)) {
+                    return false;
+                }
+            }
+
+            if (idx < (len & ~3)) {
+                if (Unsafely.readInt(srcAddr + idx) != Unsafely.readInt(destAddress + idx)) {
+                    return false;
+                }
+                idx += 4;
+            }
+
+            if (idx < (len & ~1)) {
+                if (Unsafely.readShort(srcAddr + idx) != Unsafely.readShort(destAddress + idx)) {
+                    return false;
+                }
+                idx += 2;
+            }
+
+            return idx >= len || Unsafely.readByte(srcAddr + idx) == Unsafely.readByte(destAddress + idx);
+        }
+
+        public static byte readByte(long address) {
+            return unsafe.getByte(address);
+        }
+
+        public static int readInt(long address) {
+            return unsafe.getInt(address);
+        }
+
+        public static long readLong(long address) {
+            return unsafe.getLong(address);
+        }
+
+        public static short readShort(long address) {
+            return unsafe.getShort(address);
+        }
+
+        public static void setByte(long address, byte len) {
+            unsafe.putByte(address, len);
+        }
+
+        public static void setInt(long address, int value) {
+            unsafe.putInt(address, value);
+        }
+
+        public static void setShort(long address, short len) {
+            unsafe.putShort(address, len);
+        }
+
+        private static Unsafe getUnsafe() {
+            try {
+                Field unsafeField = Unsafe.class.getDeclaredField("theUnsafe");
+                unsafeField.setAccessible(true);
+                return (Unsafe) unsafeField.get(null);
+            }
+            catch (NoSuchFieldException | IllegalAccessException e) {
+                throw new RuntimeException(e);
+            }
+        }
+    }
+}

From ce9455a584413b575a2eb23633eb92bb415a0618 Mon Sep 17 00:00:00 2001
From: gabrielfoo <62894711+gabrielfoo@users.noreply.github.com>
Date: Fri, 26 Jan 2024 05:46:40 +0800
Subject: [PATCH 128/268] gabrielfoo's first attempt (#556)

* first attempt

* formatting fix

---------

Co-authored-by: Gabriel <gabriel@gabriel>
---
 calculate_average_gabrielfoo.sh               |  23 +++
 prepare_gabrielfoo.sh                         |  19 ++
 .../onebrc/CalculateAverage_gabrielfoo.java   | 180 ++++++++++++++++++
 3 files changed, 222 insertions(+)
 create mode 100755 calculate_average_gabrielfoo.sh
 create mode 100755 prepare_gabrielfoo.sh
 create mode 100644 src/main/java/dev/morling/onebrc/CalculateAverage_gabrielfoo.java

diff --git a/calculate_average_gabrielfoo.sh b/calculate_average_gabrielfoo.sh
new file mode 100755
index 000000000..bc684dfd8
--- /dev/null
+++ b/calculate_average_gabrielfoo.sh
@@ -0,0 +1,23 @@
+#!/bin/sh
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+JAVA_OPTS="-Xmx64m"
+JAVA_OPTS="$JAVA_OPTS -XX:+UnlockExperimentalVMOptions"
+JAVA_OPTS="$JAVA_OPTS -XX:+AlwaysPreTouch"
+JAVA_OPTS="$JAVA_OPTS -XX:+TrustFinalNonStaticFields -XX:InlineSmallCode=10000"
+JAVA_OPTS="$JAVA_OPTS -XX:-TieredCompilation -XX:CICompilerCount=2 -XX:CompileThreshold=1000"
+java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_gabrielfoo
\ No newline at end of file
diff --git a/prepare_gabrielfoo.sh b/prepare_gabrielfoo.sh
new file mode 100755
index 000000000..e19dea509
--- /dev/null
+++ b/prepare_gabrielfoo.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+source "$HOME/.sdkman/bin/sdkman-init.sh"
+sdk use java 21.0.1-graal > /dev/null 2>&1
\ No newline at end of file
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_gabrielfoo.java b/src/main/java/dev/morling/onebrc/CalculateAverage_gabrielfoo.java
new file mode 100644
index 000000000..35e8bb36f
--- /dev/null
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_gabrielfoo.java
@@ -0,0 +1,180 @@
+/*
+ *  Copyright 2023 The original authors
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package dev.morling.onebrc;
+
+import java.io.RandomAccessFile;
+import java.nio.MappedByteBuffer;
+import java.nio.channels.FileChannel;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.TreeMap;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+import java.util.concurrent.ThreadFactory;
+
+public class CalculateAverage_gabrielfoo {
+    private static final String FILE = "./measurements.txt";
+    private static final int UTF8_MAX_LEN_100_BYTES = 400;
+    private static final int DOUBLE_DIGITS_MAX = 3;
+    private static final int UNIQUE_STATION_NAMES = 10000;
+
+    private static class ResultRow {
+        private double min = Double.POSITIVE_INFINITY;
+        private double sum = 0.0;
+        private double max = Double.NEGATIVE_INFINITY;
+        private int count = 0;
+
+        public String toString() {
+            return min + "/" + (Math.round(sum / count) / 10.0) + "/" + max;
+        }
+
+        public void updateMinMax(double incoming) {
+            min = Math.min(min, incoming);
+            max = Math.max(max, incoming);
+            sum += incoming * 10.0;
+            count += 1;
+        }
+
+        public void combine(ResultRow other) {
+            min = Math.min(min, other.min);
+            max = Math.max(max, other.max);
+            sum += other.sum;
+            count += other.count;
+        }
+    }
+
+    public static MappedByteBuffer[] mapFileToMemory(final RandomAccessFile file, final int chunkCount) throws Exception {
+        FileChannel channel = file.getChannel();
+        final long chunkSize = Math.ceilDiv(file.length(), chunkCount);
+
+        MappedByteBuffer buffers[] = new MappedByteBuffer[chunkCount];
+
+        long position = 0;
+        for (int i = 0; i < chunkCount - 1; ++i) {
+            file.seek(position + chunkSize);
+            long ptr = file.getFilePointer();
+
+            while (file.readByte() != '\n') {
+                file.seek(++ptr);
+            }
+
+            buffers[i] = channel.map(FileChannel.MapMode.READ_ONLY, position, ptr - position + 1);
+
+            position = ptr + 1;
+        }
+
+        buffers[buffers.length - 1] = channel.map(FileChannel.MapMode.READ_ONLY, position, file.length() - position);
+
+        return buffers;
+    }
+
+    public static void main(String[] args) throws Exception {
+        final RandomAccessFile file = new RandomAccessFile(FILE, "r");
+        final int coreCount = file.length() < 2147483647 ? 1 : Runtime.getRuntime().availableProcessors();
+        ArrayList<HashMap<String, ResultRow>> maps = new ArrayList<>();
+
+        final ThreadFactory threadFactory = new ThreadFactory() {
+            public Thread newThread(Runnable r) {
+                Thread t = new Thread(r);
+                t.setPriority(Thread.MAX_PRIORITY);
+                return t;
+            }
+        };
+        ExecutorService executor = Executors.newFixedThreadPool(coreCount, threadFactory);
+
+        Future<?> initFuture = executor.submit(() -> {
+            for (int i = 0; i < coreCount; ++i) {
+                maps.add(new HashMap<>(UNIQUE_STATION_NAMES, 0.9f));
+            }
+        });
+
+        MappedByteBuffer[] buffers = mapFileToMemory(file, coreCount);
+        initFuture.get();
+
+        Future<?>[] futures = new Future<?>[buffers.length];
+
+        for (int k = 0; k < buffers.length; ++k) {
+            final MappedByteBuffer buffer = buffers[k];
+            final var map = maps.get(k);
+            futures[k] = executor.submit(() -> {
+                int start = 0;
+                byte[] stationArr = new byte[UTF8_MAX_LEN_100_BYTES];
+                double[] floatArr = new double[DOUBLE_DIGITS_MAX];
+                byte currentByte;
+
+                while (buffer.hasRemaining()) {
+                    currentByte = buffer.get();
+                    stationArr[buffer.position() - start - 1] = currentByte;
+
+                    if (currentByte == ';') {
+                        final int stationEnd = buffer.position() - 1;
+                        // convert to double now
+                        currentByte = buffer.get();
+                        boolean neg = currentByte == '-';
+                        if (neg)
+                            currentByte = buffer.get();
+                        floatArr[0] = currentByte - '0';
+                        currentByte = buffer.get();
+                        if (currentByte == '.') {
+                            floatArr[1] = (buffer.get() - '0') / 10.0;
+                            floatArr[2] = 0.0;
+                        }
+                        else {
+                            floatArr[0] *= 10.0;
+                            floatArr[1] = (currentByte - '0');
+                            buffer.get();
+                            floatArr[2] = (buffer.get() - '0') / 10.0;
+                        }
+                        final double f = (neg ? -1 : 1) * (floatArr[0] + floatArr[1] + floatArr[2]);
+
+                        buffer.get(); // discard \n
+
+                        String station = new String(stationArr, 0, stationEnd - start);
+
+                        map.compute(station, (key, existingRow) -> {
+                            ResultRow row = (existingRow == null) ? new ResultRow() : existingRow;
+                            row.updateMinMax(f);
+                            return row;
+                        });
+
+                        start = buffer.position();
+                    }
+                }
+
+            });
+        }
+
+        for (Future<?> future : futures) {
+            future.get();
+        }
+
+        HashMap<String, ResultRow> resultHashMap = maps.get(0);
+
+        maps.stream().skip(1).flatMap(map -> map.entrySet().stream()).forEach(entry -> {
+            resultHashMap.merge(entry.getKey(), entry.getValue(), (oldVal, newVal) -> {
+                oldVal.combine(newVal);
+                return oldVal;
+            });
+        });
+
+        TreeMap<String, ResultRow> res = new TreeMap<>(resultHashMap);
+
+        executor.shutdown();
+
+        System.out.println(res);
+    }
+}

From 271bdfb0329df636988455e450bb48a45f5b917f Mon Sep 17 00:00:00 2001
From: Van Phu DO <abeobk@gmail.com>
Date: Fri, 26 Jan 2024 06:57:04 +0900
Subject: [PATCH 129/268] Simplify Node class with fewer fields, improve hash mix
 speed (#584)

* Simplify Node class with fewer fields, improve hash mix speed

* remove some ops, a bit faster

* more inlining, a little bit faster but not sure
---
 prepare_abeobk.sh                             |   4 +-
 .../onebrc/CalculateAverage_abeobk.java       | 126 +++++++++---------
 2 files changed, 63 insertions(+), 67 deletions(-)

diff --git a/prepare_abeobk.sh b/prepare_abeobk.sh
index d8ed86a1a..1b7374383 100755
--- a/prepare_abeobk.sh
+++ b/prepare_abeobk.sh
@@ -20,6 +20,8 @@ sdk use java 21.0.2-graal 1>&2
 
 # ./mvnw clean verify removes target/ and will re-trigger native image creation.
 if [ ! -f target/CalculateAverage_abeobk_image ]; then
-    NATIVE_IMAGE_OPTS="--gc=epsilon -O3 -march=native -R:MaxHeapSize=128m -H:InlineAllBonus=10 -H:-ParseRuntimeOptions --enable-preview --initialize-at-build-time=dev.morling.onebrc.CalculateAverage_abeobk"
+    NATIVE_IMAGE_OPTS="--gc=epsilon -O3 -dsa -march=native -R:MaxHeapSize=128m -H:-GenLoopSafepoints -H:-ParseRuntimeOptions --enable-preview --initialize-at-build-time=dev.morling.onebrc.CalculateAverage_abeobk"
     native-image $NATIVE_IMAGE_OPTS -cp target/average-1.0.0-SNAPSHOT.jar -o target/CalculateAverage_abeobk_image dev.morling.onebrc.CalculateAverage_abeobk
 fi
+
+
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java b/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java
index 293a88caf..ed859f3df 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java
@@ -39,6 +39,7 @@ public class CalculateAverage_abeobk {
     private static final int BUCKET_SIZE = 1 << 16;
     private static final int BUCKET_MASK = BUCKET_SIZE - 1;
     private static final int MAX_STR_LEN = 100;
+    private static final int MAX_STATIONS = 10000;
     private static final Unsafe UNSAFE = initUnsafe();
     private static final long[] HASH_MASKS = new long[]{
             0x0L,
@@ -66,6 +67,33 @@ private static Unsafe initUnsafe() {
         }
     }
 
+    static class Stat {
+        Node node;
+        String key;
+
+        public final String toString() {
+            return (node.min / 10.0) + "/"
+                    + (Math.round(((double) node.sum / node.count)) / 10.0) + "/"
+                    + (node.max / 10.0);
+        }
+
+        Stat(Node n) {
+            node = n;
+            byte[] sbuf = new byte[MAX_STR_LEN];
+            long word = UNSAFE.getLong(n.addr);
+            long semipos_code = getSemiPosCode(word);
+            int keylen = 0;
+            while (semipos_code == 0) {
+                keylen += 8;
+                word = UNSAFE.getLong(n.addr + keylen);
+                semipos_code = getSemiPosCode(word);
+            }
+            keylen += Long.numberOfTrailingZeros(semipos_code) >>> 3;
+            UNSAFE.copyMemory(null, n.addr, sbuf, Unsafe.ARRAY_BYTE_BASE_OFFSET, keylen);
+            key = new String(sbuf, 0, keylen, StandardCharsets.UTF_8);
+        }
+    }
+
     static class Node {
         long addr;
         long word0;
@@ -73,37 +101,23 @@ static class Node {
         long sum;
         int count;
         short min, max;
-        int keylen;
-        String key;
 
-        void calcKey() {
-            byte[] sbuf = new byte[MAX_STR_LEN];
-            UNSAFE.copyMemory(null, addr, sbuf, Unsafe.ARRAY_BYTE_BASE_OFFSET, keylen);
-            key = new String(sbuf, 0, keylen, StandardCharsets.UTF_8);
-        }
-
-        public String toString() {
-            return String.format("%.1f/%.1f/%.1f", min * 0.1, sum * 0.1 / count, max * 0.1);
-        }
-
-        Node(long a, long t, short val, int kl) {
+        Node(long a, long t, short val) {
             addr = a;
             tail = t;
-            keylen = kl;
             sum = min = max = val;
             count = 1;
         }
 
-        Node(long a, long w0, long t, short val, int kl) {
+        Node(long a, long w0, long t, short val) {
             addr = a;
             word0 = w0;
             tail = t;
-            keylen = kl;
             sum = min = max = val;
             count = 1;
         }
 
-        void add(short val) {
+        final void add(short val) {
             sum += val;
             count++;
             if (val >= max) {
@@ -115,7 +129,7 @@ void add(short val) {
             }
         }
 
-        void merge(Node other) {
+        final void merge(Node other) {
             sum += other.sum;
             count += other.count;
             if (other.max > max) {
@@ -126,8 +140,8 @@ void merge(Node other) {
             }
         }
 
-        boolean contentEquals(long other_addr, long other_word0, long other_tail) {
-            if (tail != other_tail || word0 != other_word0)
+        final boolean contentEquals(long other_addr, long other_word0, long other_tail, int keylen) {
+            if (word0 != other_word0 || tail != other_tail)
                 return false;
             // this is faster than comparision if key is short
             long xsum = 0;
@@ -161,11 +175,8 @@ static final long getSemiPosCode(final long word) {
 
     // speed/collision balance
     static final int xxh32(long hash) {
-        final int p1 = 0x85EBCA77; // prime
-        int low = (int) hash;
-        int high = (int) (hash >>> 33);
-        int h = (low * p1) ^ high;
-        return h ^ (h >>> 17);
+        long h = hash * 37;
+        return (int) (h ^ (h >>> 29));
     }
 
     // great idea from merykitty (Quan Anh Mai)
@@ -185,11 +196,10 @@ static final short parseNum(long num_word, int dot_pos) {
     static final Node[] parse(int thread_id, long start, long end) {
         int cls = 0;
         long addr = start;
-        var map = new Node[BUCKET_SIZE + 10000]; // extra space for collisions
+        var map = new Node[BUCKET_SIZE + MAX_STATIONS]; // extra space for collisions
         // parse loop
         while (addr < end) {
             long row_addr = addr;
-            long hash = 0;
 
             long word0 = UNSAFE.getLong(addr);
             long semipos_code = getSemiPosCode(word0);
@@ -202,14 +212,14 @@ static final Node[] parse(int thread_id, long start, long end) {
                 int dot_pos = Long.numberOfTrailingZeros(~num_word & 0x10101000);
                 addr += (dot_pos >>> 3) + 3;
 
-                long tail = (word0 & HASH_MASKS[semi_pos]);
+                long tail = word0 & HASH_MASKS[semi_pos];
                 int bucket = xxh32(tail) & BUCKET_MASK;
                 short val = parseNum(num_word, dot_pos);
 
                 while (true) {
                     var node = map[bucket];
                     if (node == null) {
-                        map[bucket] = new Node(row_addr, tail, val, semi_pos);
+                        map[bucket] = new Node(row_addr, tail, val);
                         break;
                     }
                     if (node.tail == tail) {
@@ -223,28 +233,25 @@ static final Node[] parse(int thread_id, long start, long end) {
                 continue;
             }
 
-            hash ^= word0;
             addr += 8;
             long word = UNSAFE.getLong(addr);
             semipos_code = getSemiPosCode(word);
             // 43% chance
             if (semipos_code != 0) {
                 int semi_pos = Long.numberOfTrailingZeros(semipos_code) >>> 3;
-                addr += semi_pos;
-                int keylen = (int) (addr - row_addr);
-                long num_word = UNSAFE.getLong(addr + 1);
+                addr += semi_pos + 1;
+                long num_word = UNSAFE.getLong(addr);
                 int dot_pos = Long.numberOfTrailingZeros(~num_word & 0x10101000);
-                addr += (dot_pos >>> 3) + 4;
+                addr += (dot_pos >>> 3) + 3;
 
                 long tail = (word & HASH_MASKS[semi_pos]);
-                hash ^= tail;
-                int bucket = xxh32(hash) & BUCKET_MASK;
+                int bucket = xxh32(word0 ^ tail) & BUCKET_MASK;
                 short val = parseNum(num_word, dot_pos);
 
                 while (true) {
                     var node = map[bucket];
                     if (node == null) {
-                        map[bucket] = new Node(row_addr, word0, tail, val, keylen);
+                        map[bucket] = new Node(row_addr, word0, tail, val);
                         break;
                     }
                     if (node.word0 == word0 && node.tail == tail) {
@@ -258,6 +265,9 @@ static final Node[] parse(int thread_id, long start, long end) {
                 continue;
             }
 
+            // why not going for more? tested, slower
+
+            long hash = word0;
             while (semipos_code == 0) {
                 hash ^= word;
                 addr += 8;
@@ -273,17 +283,16 @@ static final Node[] parse(int thread_id, long start, long end) {
             addr += (dot_pos >>> 3) + 4;
 
             long tail = (word & HASH_MASKS[semi_pos]);
-            hash ^= tail;
-            int bucket = xxh32(hash) & BUCKET_MASK;
+            int bucket = xxh32(hash ^ tail) & BUCKET_MASK;
             short val = parseNum(num_word, dot_pos);
 
             while (true) {
                 var node = map[bucket];
                 if (node == null) {
-                    map[bucket] = new Node(row_addr, word0, tail, val, keylen);
+                    map[bucket] = new Node(row_addr, word0, tail, val);
                     break;
                 }
-                if (node.contentEquals(row_addr, word0, tail)) {
+                if (node.contentEquals(row_addr, word0, tail, keylen)) {
                     node.add(val);
                     break;
                 }
@@ -292,6 +301,7 @@ static final Node[] parse(int thread_id, long start, long end) {
                     cls++;
             }
         }
+
         if (SHOW_ANALYSIS) {
             debug("Thread %d collision = %d", thread_id, cls);
         }
@@ -307,8 +317,6 @@ private static void spawnWorker() throws IOException {
         workerCommand.add("--worker");
         new ProcessBuilder()
                 .command(workerCommand)
-                .inheritIO()
-                .redirectOutput(ProcessBuilder.Redirect.PIPE)
                 .start()
                 .getInputStream()
                 .transferTo(System.out);
@@ -333,43 +341,29 @@ public static void main(String[] args) throws InterruptedException, IOException
             // processing
             var ptrs = slice(start_addr, end_addr, chunk_size, cpu_cnt);
 
-            TreeMap<String, Node> ms = new TreeMap<>();
-            int[] lenhist = new int[64]; // length histogram
-
-            List<List<Node>> maps = IntStream.range(0, cpu_cnt)
+            List<List<Stat>> maps = IntStream.range(0, cpu_cnt)
                     .mapToObj(thread_id -> parse(thread_id, ptrs[thread_id], ptrs[thread_id + 1]))
                     .map(map -> {
-                        List<Node> nodes = new ArrayList<>();
+                        List<Stat> stats = new ArrayList<>();
                         for (var node : map) {
                             if (node == null)
                                 continue;
-                            node.calcKey();
-                            nodes.add(node);
+                            stats.add(new Stat(node));
                         }
-                        return nodes;
+                        return stats;
                     })
                     .parallel()
                     .toList();
 
-            for (var nodes : maps) {
-                for (var node : nodes) {
-                    if (SHOW_ANALYSIS) {
-                        int kl = node.keylen & (lenhist.length - 1);
-                        lenhist[kl] += node.count;
-                    }
-                    var stat = ms.putIfAbsent(node.key, node);
+            TreeMap<String, Stat> ms = new TreeMap<>();
+            for (var stats : maps) {
+                for (var s : stats) {
+                    var stat = ms.putIfAbsent(s.key, s);
                     if (stat != null)
-                        stat.merge(node);
+                        stat.node.merge(s.node);
                 }
             }
 
-            if (SHOW_ANALYSIS) {
-                debug("Total = " + Arrays.stream(lenhist).sum());
-                debug("Length_histogram = "
-                        + Arrays.toString(Arrays.stream(lenhist).map(x -> (int) (x * 1.0e-7)).toArray()));
-                return;
-            }
-
             // print result
             System.out.println(ms);
             System.out.close();

From d5cedd6a35204a7cb30d354170c3eeeee35bfaf3 Mon Sep 17 00:00:00 2001
From: Arman Sharif <armandino@gmail.com>
Date: Thu, 25 Jan 2024 13:59:18 -0800
Subject: [PATCH 130/268] armandino: minimise hash collisions + other
 improvements (#585)

---
 calculate_average_armandino.sh                | 11 ++-
 prepare_armandino.sh                          | 25 +++++
 .../onebrc/CalculateAverage_armandino.java    | 96 +++++++++----------
 3 files changed, 79 insertions(+), 53 deletions(-)
 create mode 100755 prepare_armandino.sh

diff --git a/calculate_average_armandino.sh b/calculate_average_armandino.sh
index 6ac5c1654..21a4f8ccf 100755
--- a/calculate_average_armandino.sh
+++ b/calculate_average_armandino.sh
@@ -15,6 +15,11 @@
 #  limitations under the License.
 #
 
-
-JAVA_OPTS="--enable-preview -da -dsa -Xms128m -Xmx128m -XX:+UnlockExperimentalVMOptions -XX:+UseEpsilonGC -XX:+AlwaysPreTouch"
-java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_armandino
+if [ -f target/CalculateAverage_armandino_image ]; then
+    echo "Picking up existing native image 'target/CalculateAverage_armandino_image', delete the file to select JVM mode." 1>&2
+    target/CalculateAverage_armandino_image
+else
+    echo "Chosing to run the app in JVM mode as no native image was found, use prepare_armandino.sh to generate." 1>&2
+    JAVA_OPTS="--enable-preview -da -dsa -Xms128m -Xmx128m -XX:+UnlockExperimentalVMOptions -XX:+UseEpsilonGC -XX:+AlwaysPreTouch"
+    java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_armandino
+fi
diff --git a/prepare_armandino.sh b/prepare_armandino.sh
new file mode 100755
index 000000000..19a71f9ea
--- /dev/null
+++ b/prepare_armandino.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+source "$HOME/.sdkman/bin/sdkman-init.sh"
+sdk use java 21.0.2-graal 1>&2
+
+# ./mvnw clean verify removes target/ and will re-trigger native image creation.
+if [ ! -f target/CalculateAverage_armandino_image ]; then
+    NATIVE_IMAGE_OPTS="--gc=epsilon -O3 -march=native --enable-preview -H:InlineAllBonus=10 -H:-ParseRuntimeOptions --initialize-at-build-time=dev.morling.onebrc.CalculateAverage_armandino\$Scanner"
+    native-image $NATIVE_IMAGE_OPTS -cp target/average-1.0.0-SNAPSHOT.jar -o target/CalculateAverage_armandino_image dev.morling.onebrc.CalculateAverage_armandino
+fi
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_armandino.java b/src/main/java/dev/morling/onebrc/CalculateAverage_armandino.java
index dce3a3302..d825e77f9 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_armandino.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_armandino.java
@@ -45,6 +45,7 @@ public class CalculateAverage_armandino {
     private static final byte DOT = 46;
     private static final byte MINUS = 45;
     private static final byte ZERO_DIGIT = 48;
+    private static final int PRIME = 1117;
     private static final Unsafe UNSAFE = getUnsafe();
 
     public static void main(String[] args) throws Exception {
@@ -78,7 +79,7 @@ private SimpleMap process(final long chunkStart, final long chunkEnd) {
                 byte b;
 
                 while ((b = UNSAFE.getByte(i++)) != SEMICOLON) {
-                    keyHash = 31 * keyHash + b;
+                    keyHash = PRIME * keyHash + b;
                 }
 
                 final int keyLength = (int) (i - keyAddress - 1);
@@ -114,13 +115,14 @@ private SimpleMap process(final long chunkStart, final long chunkEnd) {
                 stats.sum += measurement;
                 stats.count++;
             }
+
             return map;
         }
     }
 
     private static class Stats implements Comparable<Stats> {
         private String key;
-        private final byte[] keyBytes;
+        private final long keyAddress;
         private final int keyLength;
         private final int keyHash;
         private int min = Integer.MAX_VALUE;
@@ -129,17 +131,15 @@ private static class Stats implements Comparable<Stats> {
         private long sum;
 
         private Stats(long keyAddress, int keyLength, int keyHash) {
+            this.keyAddress = keyAddress;
             this.keyLength = keyLength;
-            this.keyBytes = new byte[keyLength];
             this.keyHash = keyHash;
-
-            for (int i = 0; i < keyLength; i++) {
-                keyBytes[i] = UNSAFE.getByte(keyAddress++);
-            }
         }
 
         String getKey() {
             if (key == null) {
+                var keyBytes = new byte[keyLength];
+                UNSAFE.copyMemory(null, keyAddress, keyBytes, Unsafe.ARRAY_BYTE_BASE_OFFSET, keyLength);
                 key = new String(keyBytes, 0, keyLength, UTF_8);
             }
             return key;
@@ -230,37 +230,6 @@ Stream<Stats> stream() {
             return Arrays.stream(table).filter(Objects::nonNull);
         }
 
-        private void resize() {
-            var copy = new SimpleMap(table.length * 2);
-            for (Stats s : table) {
-                if (s != null) {
-                    final int pos = (copy.table.length - 1) & s.keyHash;
-                    int i = pos;
-
-                    if (copy.table[i] == null) {
-                        copy.table[i] = s;
-                        continue;
-                    }
-
-                    while (i < copy.table.length && copy.table[i] != null) {
-                        i++;
-                    }
-                    if (i == copy.table.length) {
-                        i = pos;
-                        while (i >= 0 && copy.table[i] != null) {
-                            i--;
-                        }
-                    }
-                    if (i < 0) {
-                        // shouldn't happen because put() is called after increasing size
-                        throw new IllegalStateException("table is full");
-                    }
-                    copy.table[i] = s;
-                }
-            }
-            table = copy.table;
-        }
-
         Stats putStats(final int keyHash, final long keyAddress, final int keyLength) {
             final int pos = (table.length - 1) & keyHash;
 
@@ -291,22 +260,49 @@ Stats putStats(final int keyHash, final long keyAddress, final int keyLength) {
             return putStats(keyHash, keyAddress, keyLength);
         }
 
-        private boolean keysEqual(Stats stats, long keyAddress, final int keyLength) {
-            if (stats.keyLength != keyLength) {
-                return false;
-            }
-            for (int i = 0; i < keyLength; i++) {
-                if (stats.keyBytes[i] != UNSAFE.getByte(keyAddress++)) {
-                    return false;
-                }
-            }
-            return true;
-        }
-
         private static Stats createAt(Stats[] table, long keyAddress, int keyLength, int key, int i) {
             Stats stats = new Stats(keyAddress, keyLength, key);
             table[i] = stats;
             return stats;
         }
+
+        private static boolean keysEqual(Stats stats, long keyAddress, final int keyLength) {
+            // credit: abeobk
+            long xsum = 0;
+            int n = keyLength & 0xF8;
+            for (int i = 0; i < n; i += 8) {
+                xsum |= (UNSAFE.getLong(stats.keyAddress + i) ^ UNSAFE.getLong(keyAddress + i));
+            }
+            return xsum == 0;
+        }
+
+        private void resize() {
+            var copy = new SimpleMap(table.length * 2);
+            for (Stats s : table) {
+                if (s != null) {
+                    final int pos = (copy.table.length - 1) & s.keyHash;
+                    int i = pos;
+                    if (copy.table[i] == null) {
+                        copy.table[i] = s;
+                        continue;
+                    }
+                    while (i < copy.table.length && copy.table[i] != null) {
+                        i++;
+                    }
+                    if (i == copy.table.length) {
+                        i = pos;
+                        while (i >= 0 && copy.table[i] != null) {
+                            i--;
+                        }
+                    }
+                    if (i < 0) {
+                        // if we reach here it's a bug!
+                        throw new IllegalStateException("table is full");
+                    }
+                    copy.table[i] = s;
+                }
+            }
+            table = copy.table;
+        }
     }
 }

From 0bd167557183922825a0a9ec3a1d347f4ba24b2f Mon Sep 17 00:00:00 2001
From: Dr Ian Preston <157221403+ianopolousfast@users.noreply.github.com>
Date: Thu, 25 Jan 2024 22:03:05 +0000
Subject: [PATCH 131/268] Down to 14s locally (#583)

Use flat array for stats.
Use simd for line termination

Co-authored-by: Ian Preston <ianopolous@protonmail.com>
---
 .../CalculateAverage_ianopolousfast.java      | 118 ++++++++++--------
 1 file changed, 69 insertions(+), 49 deletions(-)

diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_ianopolousfast.java b/src/main/java/dev/morling/onebrc/CalculateAverage_ianopolousfast.java
index f1b4e7bf5..ab960dfec 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_ianopolousfast.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_ianopolousfast.java
@@ -41,7 +41,7 @@
  *
  * Timings on 4 core i7-7500U CPU @ 2.70GHz:
  * average_baseline: 4m48s
- * ianopolous:         15s
+ * ianopolous:         14s
 */
 public class CalculateAverage_ianopolousfast {
 
@@ -60,7 +60,7 @@ public static void main(String[] args) throws Exception {
         MemorySegment mmap = channel.map(FileChannel.MapMode.READ_ONLY, 0, filesize, arena);
         int nChunks = filesize < 4 * 1024 * 1024 ? 1 : Runtime.getRuntime().availableProcessors();
         long chunkSize = (filesize + nChunks - 1) / nChunks;
-        List<List<List<Stat>>> allResults = IntStream.range(0, nChunks)
+        List<Stat[]> allResults = IntStream.range(0, nChunks)
                 .parallel()
                 .mapToObj(i -> parseStats(i * chunkSize, Math.min((i + 1) * chunkSize, filesize), mmap))
                 .toList();
@@ -69,7 +69,7 @@ public static void main(String[] args) throws Exception {
                 .parallel()
                 .flatMap(f -> {
                     try {
-                        return f.stream().filter(Objects::nonNull).flatMap(Collection::stream);
+                        return Arrays.stream(f).filter(Objects::nonNull);
                     }
                     catch (Exception e) {
                         e.printStackTrace();
@@ -104,24 +104,23 @@ public static Stat createStation(long start, long end, MemorySegment buffer) {
         return new Stat(stationBuffer);
     }
 
-    public static Stat dedupeStation(long start, long end, long hash, MemorySegment buffer, List<List<Stat>> stations) {
+    public static Stat dedupeStation(long start, long end, long hash, MemorySegment buffer, Stat[] stations) {
         int index = hashToIndex(hash, MAX_STATIONS);
-        List<Stat> matches = stations.get(index);
-        if (matches == null) {
-            List<Stat> value = new ArrayList<>();
+        Stat match = stations[index];
+        if (match == null) {
             Stat res = createStation(start, end, buffer);
-            value.add(res);
-            stations.set(index, value);
+            stations[index] = res;
             return res;
         }
         else {
-            for (int i = 0; i < matches.size(); i++) {
-                Stat s = matches.get(i);
-                if (matchingStationBytes(start, end, buffer, s))
-                    return s;
+            while (match != null) {
+                if (matchingStationBytes(start, end, buffer, match))
+                    return match;
+                index = (index + 1) % stations.length;
+                match = stations[index];
             }
             Stat res = createStation(start, end, buffer);
-            matches.add(res);
+            stations[index] = res;
             return res;
         }
     }
@@ -130,50 +129,38 @@ static long maskHighBytes(long d, int nbytes) {
         return d & (-1L << ((8 - nbytes) * 8));
     }
 
-    public static Stat parseStation(long lineStart, MemorySegment buffer, List<List<Stat>> stations) {
+    public static Stat parseStation(long lineStart, MemorySegment buffer, Stat[] stations) {
         ByteVector line = ByteVector.fromMemorySegment(BYTE_SPECIES, buffer, lineStart, ByteOrder.nativeOrder());
         int keySize = line.compare(VectorOperators.EQ, ';').firstTrue();
 
         long first8 = buffer.get(LONG_LAYOUT, lineStart);
-        if (keySize == BYTE_SPECIES.vectorByteSize()) {
+        long second8 = 0;
+        if (keySize <= 8) {
+            first8 = maskHighBytes(first8, keySize & 0x07);
+        }
+        else if (keySize <= 16) {
+            second8 = maskHighBytes(buffer.get(LONG_LAYOUT, lineStart + 8), keySize & 0x07);
+        }
+        else if (keySize == BYTE_SPECIES.vectorByteSize()) {
             while (buffer.get(JAVA_BYTE, lineStart + keySize) != ';') {
                 keySize++;
             }
-            long second8 = buffer.get(LONG_LAYOUT, lineStart + 8);
-            long hash = first8 ^ second8; // todo include other bytes
-            return dedupeStation(lineStart, lineStart + keySize, hash, buffer, stations);
+            second8 = maskHighBytes(buffer.get(LONG_LAYOUT, lineStart + 8), keySize & 0x07);
         }
-
-        if (keySize <= 8) {
-            first8 = maskHighBytes(first8, keySize & 0x07);
-        }
-        long second8 = keySize <= 8 ? 0 : maskHighBytes(buffer.get(LONG_LAYOUT, lineStart + 8), keySize & 0x07);
         long hash = first8 ^ second8; // todo include later bytes
         return dedupeStation(lineStart, lineStart + keySize, hash, buffer, stations);
     }
 
-    public static int getDot(long d) {
-        // from Hacker's Delight page 92
-        d = d ^ 0x2e2e2e2e2e2e2e2eL;
-        long y = (d & 0x7f7f7f7f7f7f7f7fL) + 0x7f7f7f7f7f7f7f7fL;
-        y = ~(y | d | 0x7f7f7f7f7f7f7f7fL);
-        return Long.numberOfLeadingZeros(y) >> 3;
-    }
-
     public static short getMinus(long d) {
-        d = d & 0xff00000000000000L;
-        d = d ^ 0x2d2d2d2d2d2d2d2dL;
-        long y = (d & 0x7f7f7f7f7f7f7f7fL) + 0x7f7f7f7f7f7f7f7fL;
-        y = ~(y | d | 0x7f7f7f7f7f7f7f7fL);
-        return (short) ((Long.numberOfLeadingZeros(y) >> 6) - 1);
+        return ((d & 0xff00000000000000L) ^ 0x2d00000000000000L) != 0 ? 0 : (short) -1;
     }
 
-    public static long processTemperature(long lineSplit, MemorySegment buffer, Stat station) {
+    public static long processTemperature(long lineSplit, int size, MemorySegment buffer, Stat station) {
         long d = buffer.get(LONG_LAYOUT, lineSplit);
         // negative is either 0 or -1
         short negative = getMinus(d);
         d = d << (negative * -8);
-        int dotIndex = getDot(d);
+        int dotIndex = size - 2 + negative;
         d = (d >> 8) | 0x30000000_00000000L; // add a leading 0 digit
         d = d >> 8 * (5 - dotIndex);
         short temperature = (short) ((byte) d - '0' +
@@ -181,10 +168,41 @@ public static long processTemperature(long lineSplit, MemorySegment buffer, Stat
                 100 * (((byte) (d >> 24)) - '0'));
         temperature = (short) ((temperature ^ negative) - negative); // negative treatment inspired by merkitty
         station.add(temperature);
-        return lineSplit - negative + dotIndex + 3;
+        return lineSplit + size + 1;
+    }
+
+    private static long parseLine(long lineStart, MemorySegment buffer, Stat[] stations) {
+        ByteVector line = ByteVector.fromMemorySegment(BYTE_SPECIES, buffer, lineStart, ByteOrder.nativeOrder());
+        int lineSize = line.compare(VectorOperators.EQ, '\n').firstTrue();
+        int index = lineSize;
+        while (index == BYTE_SPECIES.vectorByteSize()) {
+            index = ByteVector.fromMemorySegment(BYTE_SPECIES, buffer, lineStart + lineSize,
+                    ByteOrder.nativeOrder()).compare(VectorOperators.EQ, '\n').firstTrue();
+            lineSize += index;
+        }
+        int keySize = lineSize - 6 + ByteVector.fromMemorySegment(BYTE_SPECIES, buffer, lineStart + lineSize - 6,
+                ByteOrder.nativeOrder()).compare(VectorOperators.EQ, ';').firstTrue();
+
+        long first8 = buffer.get(LONG_LAYOUT, lineStart);
+        long second8 = 0;
+        if (keySize <= 8) {
+            first8 = maskHighBytes(first8, keySize & 0x07);
+        }
+        else if (keySize <= 16) {
+            second8 = maskHighBytes(buffer.get(LONG_LAYOUT, lineStart + 8), keySize & 0x07);
+        }
+        else if (keySize == BYTE_SPECIES.vectorByteSize()) {
+            while (buffer.get(JAVA_BYTE, lineStart + keySize) != ';') {
+                keySize++;
+            }
+            second8 = maskHighBytes(buffer.get(LONG_LAYOUT, lineStart + 8), keySize & 0x07);
+        }
+        long hash = first8 ^ second8; // todo include later bytes
+        Stat station = dedupeStation(lineStart, lineStart + keySize, hash, buffer, stations);
+        return processTemperature(lineStart + keySize + 1, lineSize - keySize - 1, buffer, station);
     }
 
-    public static List<List<Stat>> parseStats(long startByte, long endByte, MemorySegment buffer) {
+    public static Stat[] parseStats(long startByte, long endByte, MemorySegment buffer) {
         // read first partial line
         if (startByte > 0) {
             for (int i = 0; i < MAX_LINE_LENGTH; i++) {
@@ -195,9 +213,7 @@ public static List<List<Stat>> parseStats(long startByte, long endByte, MemorySe
             }
         }
 
-        List<List<Stat>> stations = new ArrayList<>(MAX_STATIONS);
-        for (int i = 0; i < MAX_STATIONS; i++)
-            stations.add(null);
+        Stat[] stations = new Stat[MAX_STATIONS];
 
         // Handle reading the very last few lines in the file
         // this allows us to not worry about reading beyond the end
@@ -218,7 +234,12 @@ public static List<List<Stat>> parseStats(long startByte, long endByte, MemorySe
             int index = 0;
             while (endByte + index < buffer.byteSize()) {
                 Stat station = parseStation(index, end, stations);
-                index = (int) processTemperature(index + station.namelen + 1, end, station);
+                int tempSize = 3;
+                if (end.get(JAVA_BYTE, index + station.namelen + 5) == '\n')
+                    tempSize = 4;
+                if (end.get(JAVA_BYTE, index + station.namelen + 6) == '\n')
+                    tempSize = 5;
+                index = (int) processTemperature(index + station.namelen + 1, tempSize, end, station);
             }
         }
 
@@ -226,10 +247,9 @@ public static List<List<Stat>> parseStats(long startByte, long endByte, MemorySe
         return stations;
     }
 
-    private static void innerloop(long startByte, long endByte, MemorySegment buffer, List<List<Stat>> stations) {
+    private static void innerloop(long startByte, long endByte, MemorySegment buffer, Stat[] stations) {
         while (startByte < endByte) {
-            Stat station = parseStation(startByte, buffer, stations);
-            startByte = processTemperature(startByte + station.namelen + 1, buffer, station);
+            startByte = parseLine(startByte, buffer, stations);
         }
     }
 
@@ -278,4 +298,4 @@ public String toString() {
             return round((double) min) + "/" + round(((double) total) / count) + "/" + round((double) max);
         }
     }
-}
\ No newline at end of file
+}

From 65d2c1b0c911579c0f04b9f996ed357f57ac10e3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Antonio=20Mu=C3=B1oz?= <antoniogmc@gmail.com>
Date: Thu, 25 Jan 2024 23:07:20 +0100
Subject: [PATCH 132/268] tonivade improved solution (#582)

* tonivade improved not using HashMap

* use java 21.0.2

* same hash same station

* remove unused parameter in sameStation

* use length too

* refactor parallelization

* use parallel GC

* refactor

* refactor
---
 calculate_average_tonivade.sh                 |   2 +-
 prepare_tonivade.sh                           |   2 +-
 .../onebrc/CalculateAverage_tonivade.java     | 274 ++++++++++--------
 3 files changed, 148 insertions(+), 130 deletions(-)

diff --git a/calculate_average_tonivade.sh b/calculate_average_tonivade.sh
index 5e160f9ff..a484a5343 100755
--- a/calculate_average_tonivade.sh
+++ b/calculate_average_tonivade.sh
@@ -15,5 +15,5 @@
 #  limitations under the License.
 #
 
-JAVA_OPTS="-Xmx1G -Xms1G -XX:+AlwaysPreTouch --enable-preview"
+JAVA_OPTS="-Xmx1G -Xms1G -XX:+AlwaysPreTouch -XX:+UseParallelGC -XX:-UseCompressedOops --enable-preview"
 java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_tonivade
diff --git a/prepare_tonivade.sh b/prepare_tonivade.sh
index 66b23f679..cdf474f87 100755
--- a/prepare_tonivade.sh
+++ b/prepare_tonivade.sh
@@ -17,4 +17,4 @@
 
 # Uncomment below to use sdk
 source "$HOME/.sdkman/bin/sdkman-init.sh"
-sdk use java 21.0.1-tem 1>&2
+sdk use java 21.0.2-tem 1>&2
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_tonivade.java b/src/main/java/dev/morling/onebrc/CalculateAverage_tonivade.java
index bd284888a..9deb3f229 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_tonivade.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_tonivade.java
@@ -15,9 +15,6 @@
  */
 package dev.morling.onebrc;
 
-import static java.util.Comparator.comparing;
-import static java.util.stream.Collectors.joining;
-
 import java.io.IOException;
 import java.nio.ByteBuffer;
 import java.nio.channels.FileChannel;
@@ -26,9 +23,8 @@
 import java.nio.file.Paths;
 import java.nio.file.StandardOpenOption;
 import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.HashMap;
 import java.util.Map;
+import java.util.TreeMap;
 import java.util.concurrent.ExecutionException;
 import java.util.concurrent.StructuredTaskScope;
 import java.util.concurrent.StructuredTaskScope.Subtask;
@@ -37,32 +33,16 @@ public class CalculateAverage_tonivade {
 
     private static final String FILE = "./measurements.txt";
 
-    private static final int EOL = 10;
-    private static final int MINUS = 45;
-    private static final int SEMICOLON = 59;
+    private static final int MIN_CHUNK_SIZE = 1024;
+    private static final int MAX_NAME_LENGTH = 128;
+    private static final int MAX_TEMP_LENGTH = 8;
 
     public static void main(String[] args) throws IOException, InterruptedException, ExecutionException {
-        var result = readFile();
-
-        var measurements = getMeasurements(result);
-
-        System.out.println(measurements);
-    }
-
-    static record PartialResult(int end, Map<Name, Station> map) {
-
-        void merge(Map<Name, Station> result) {
-            map.forEach((name, station) -> result.merge(name, station, Station::merge));
-        }
+        System.out.println(readFile());
     }
 
-    private static String getMeasurements(Map<Name, Station> result) {
-        return result.values().stream().sorted(comparing(Station::getName))
-                .map(Station::asString).collect(joining(", ", "{", "}"));
-    }
-
-    private static Map<Name, Station> readFile() throws IOException, InterruptedException, ExecutionException {
-        Map<Name, Station> result = HashMap.newHashMap(10_000);
+    private static Map<String, Station> readFile() throws IOException, InterruptedException, ExecutionException {
+        Map<String, Station> result = new TreeMap<>();
         try (var channel = FileChannel.open(Paths.get(FILE), StandardOpenOption.READ)) {
             long consumed = 0;
             long remaining = channel.size();
@@ -70,8 +50,11 @@ private static Map<Name, Station> readFile() throws IOException, InterruptedExce
                 var buffer = channel.map(
                         MapMode.READ_ONLY, consumed, Math.min(remaining, Integer.MAX_VALUE));
 
-                if (buffer.remaining() <= 1024) {
-                    var partialResult = readChunk(buffer, 0, buffer.remaining());
+                int chunks = Runtime.getRuntime().availableProcessors();
+                int chunkSize = buffer.remaining() / chunks;
+                int leftover = buffer.remaining() % chunks;
+                if (chunkSize < MIN_CHUNK_SIZE) {
+                    var partialResult = new Chunk(buffer, 0, buffer.remaining()).read();
 
                     consumed += partialResult.end();
                     remaining -= partialResult.end();
@@ -79,17 +62,12 @@ private static Map<Name, Station> readFile() throws IOException, InterruptedExce
                     partialResult.merge(result);
                 }
                 else {
-                    var chunks = Runtime.getRuntime().availableProcessors();
-                    var chunksSize = buffer.remaining() / chunks;
-                    var leftover = buffer.remaining() % chunks;
-
                     try (var scope = new StructuredTaskScope.ShutdownOnFailure()) {
                         var tasks = new ArrayList<Subtask<PartialResult>>(chunks);
                         for (int i = 0; i < chunks; i++) {
-                            int start = i * chunksSize;
-                            int length = chunksSize + (i < chunks ? leftover : 0);
-                            tasks.add(scope.fork(() -> readChunk(
-                                    buffer, findStart(buffer, start), start + length)));
+                            int start = i * chunkSize;
+                            int length = chunkSize + (i < chunks ? leftover : 0);
+                            tasks.add(scope.fork(new Chunk(buffer, start, length)::read));
                         }
                         scope.join();
                         scope.throwIfFailed();
@@ -106,132 +84,154 @@ private static Map<Name, Station> readFile() throws IOException, InterruptedExce
         return result;
     }
 
-    private static PartialResult readChunk(ByteBuffer buffer, int start, int end) {
-        final byte[] name = new byte[128];
-        final byte[] temp = new byte[8];
-        final Map<Name, Station> map = HashMap.newHashMap(1000);
-        int position = start;
-        while (position < end) {
-            int semicolon = readName(buffer, position, end - position, name);
-            if (semicolon < 0) {
-                break;
-            }
+    static final class Chunk {
 
-            int endOfLine = readTemp(buffer, semicolon + 1, end - semicolon - 1, temp);
-            if (endOfLine < 0) {
-                break;
-            }
+        private static final int EOL = 10;
+        private static final int MINUS = 45;
+        private static final int SEMICOLON = 59;
 
-            map.computeIfAbsent(new Name(name, semicolon - position), Station::new)
-                    .add(parseTemp(temp, endOfLine - semicolon - 1));
+        final ByteBuffer buffer;
+        final int start;
+        final int end;
 
-            // skip end of line
-            position = endOfLine + 1;
+        final byte[] name = new byte[MAX_NAME_LENGTH];
+        final byte[] temp = new byte[MAX_TEMP_LENGTH];
+        final Stations stations = new Stations();
+
+        int hash;
+
+        Chunk(ByteBuffer buffer, int start, int length) {
+            this.buffer = buffer;
+            this.start = findStart(buffer, start);
+            this.end = start + length;
         }
-        return new PartialResult(position, map);
-    }
 
-    private static int findStart(ByteBuffer buffer, int start) {
-        if (start > 0 && buffer.get(start - 1) != EOL) {
-            for (int i = start - 2; i > 0; i--) {
-                byte b = buffer.get(i);
-                if (b == EOL) {
-                    return i + 1;
+        private static int findStart(ByteBuffer buffer, int start) {
+            if (start > 0 && buffer.get(start - 1) != EOL) {
+                for (int i = start - 2; i > 0; i--) {
+                    byte b = buffer.get(i);
+                    if (b == EOL) {
+                        return i + 1;
+                    }
                 }
             }
+            return start;
         }
-        return start;
-    }
 
-    private static int readName(ByteBuffer buffer, int offset, int length, byte[] name) {
-        return readUntil(buffer, offset, length, name, SEMICOLON);
-    }
+        PartialResult read() {
+            int position = start;
+            while (position < end) {
+                int semicolon = readName(position, end - position);
+                if (semicolon < 0) {
+                    break;
+                }
 
-    private static int readTemp(ByteBuffer buffer, int offset, int length, byte[] percentage) {
-        return readUntil(buffer, offset, length, percentage, EOL);
-    }
+                int endOfLine = readTemp(semicolon + 1, end - semicolon - 1);
+                if (endOfLine < 0) {
+                    break;
+                }
+
+                stations.find(name, semicolon - position, hash)
+                        .add(parseTemp(temp, endOfLine - semicolon - 1));
 
-    private static int readUntil(ByteBuffer buffer, int offset, int length, byte[] array, int target) {
-        for (int i = 0; i < length; i++) {
-            byte b = buffer.get(i + offset);
-            if (b == target) {
-                return i + offset;
+                // skip end of line
+                position = endOfLine + 1;
             }
-            array[i] = b;
+            return new PartialResult(position, stations.buckets);
+        }
+
+        private int readName(int offset, int length) {
+            hash = 1;
+            for (int i = 0; i < length; i++) {
+                byte b = buffer.get(i + offset);
+                if (b == SEMICOLON) {
+                    return i + offset;
+                }
+                name[i] = b;
+                hash = 31 * hash + b;
+            }
+            return -1;
+        }
+
+        private int readTemp(int offset, int length) {
+            for (int i = 0; i < length; i++) {
+                byte b = buffer.get(i + offset);
+                if (b == EOL) {
+                    return i + offset;
+                }
+                temp[i] = b;
+            }
+            return -1;
         }
-        return -1;
-    }
 
-    // non null double between -99.9 (inclusive) and 99.9 (inclusive), always with one fractional digit
-    private static int parseTemp(byte[] value, int length) {
-        int period = length - 2;
-        if (value[0] == MINUS) {
-            int left = parseLeft(value, 1, period - 1);
+        // non null double between -99.9 (inclusive) and 99.9 (inclusive), always with one fractional digit
+        private static int parseTemp(byte[] value, int length) {
+            int period = length - 2;
+            if (value[0] == MINUS) {
+                int left = parseLeft(value, 1, period - 1);
+                int right = toInt(value[period + 1]);
+                return -(left + right);
+            }
+            int left = parseLeft(value, 0, period);
             int right = toInt(value[period + 1]);
-            return -(left + right);
+            return left + right;
         }
-        int left = parseLeft(value, 0, period);
-        int right = toInt(value[period + 1]);
-        return left + right;
-    }
 
-    private static int parseLeft(byte[] value, int start, int length) {
-        if (length == 1) {
-            return toInt(value[start]) * 10;
+        private static int parseLeft(byte[] value, int start, int length) {
+            if (length == 1) {
+                return toInt(value[start]) * 10;
+            }
+            // two chars
+            int a = toInt(value[start]) * 100;
+            int b = toInt(value[start + 1]) * 10;
+            return a + b;
         }
-        // two chars
-        int a = toInt(value[start]) * 100;
-        int b = toInt(value[start + 1]) * 10;
-        return a + b;
-    }
 
-    private static int toInt(byte c) {
-        return c - 48;
+        private static int toInt(byte c) {
+            return c - 48;
+        }
     }
 
-    static final class Name {
+    static final class Stations {
 
-        private final byte[] value;
+        private static final int NUMBER_OF_BUCKETS = 1000;
+        private static final int BUCKET_SIZE = 50;
 
-        Name(byte[] source, int length) {
-            value = new byte[length];
-            System.arraycopy(source, 0, value, 0, length);
-        }
-
-        @Override
-        public int hashCode() {
-            return Arrays.hashCode(value);
-        }
+        final Station[][] buckets = new Station[NUMBER_OF_BUCKETS][BUCKET_SIZE];
 
-        @Override
-        public boolean equals(Object obj) {
-            if (obj instanceof Name other) {
-                return Arrays.equals(value, other.value);
+        Station find(byte[] name, int length, int hash) {
+            var bucket = buckets[Math.abs(hash % NUMBER_OF_BUCKETS)];
+            for (int i = 0; i < BUCKET_SIZE; i++) {
+                if (bucket[i] == null) {
+                    bucket[i] = new Station(name, length, hash);
+                    return bucket[i];
+                }
+                else if (bucket[i].sameName(length, hash)) {
+                    return bucket[i];
+                }
             }
-            return false;
-        }
-
-        @Override
-        public String toString() {
-            return new String(value, StandardCharsets.UTF_8);
+            throw new IllegalStateException("no more space left");
         }
     }
 
     static final class Station {
 
-        private final Name name;
+        private final byte[] name;
+        private final int hash;
 
-        private int min = Integer.MAX_VALUE;
-        private int max = Integer.MIN_VALUE;
+        private int min = 1000;
+        private int max = -1000;
         private int sum;
         private long count;
 
-        Station(Name name) {
-            this.name = name;
+        Station(byte[] source, int length, int hash) {
+            name = new byte[length];
+            System.arraycopy(source, 0, name, 0, length);
+            this.hash = hash;
         }
 
         String getName() {
-            return name.toString();
+            return new String(name, StandardCharsets.UTF_8);
         }
 
         void add(int value) {
@@ -249,8 +249,13 @@ Station merge(Station other) {
             return this;
         }
 
-        String asString() {
-            return name + "=" + toDouble(min) + "/" + round(mean()) + "/" + toDouble(max);
+        @Override
+        public String toString() {
+            return toDouble(min) + "/" + round(mean()) + "/" + toDouble(max);
+        }
+
+        boolean sameName(int length, int hash) {
+            return name.length == length && this.hash == hash;
         }
 
         private double mean() {
@@ -265,4 +270,17 @@ private double round(double value) {
             return Math.round(value * 10.) / 10.;
         }
     }
+
+    static record PartialResult(int end, Station[][] stations) {
+
+        void merge(Map<String, Station> result) {
+            for (Station[] bucket : stations) {
+                for (Station station : bucket) {
+                    if (station != null) {
+                        result.merge(station.getName(), station, Station::merge);
+                    }
+                }
+            }
+        }
+    }
 }

From b20e7365e72c092f1800ea814e85d51f9bf53917 Mon Sep 17 00:00:00 2001
From: Dmitry Bufistov <112496477+dmitry-midokura@users.noreply.github.com>
Date: Thu, 25 Jan 2024 23:09:22 +0100
Subject: [PATCH 133/268] Second submission to keep a bit of dignity (#581)

* Dmitry challenge

* Dmitry submit 2.

Use MemorySegment of FileChannel and Unsafe
to read bytes from disk. 4 seconds speedup in local test
from 20s to 16s.
---
 calculate_average_dmitry-midokura.sh          |   1 +
 .../onebrc/CalculateAverage_bufistov.java     | 382 +++++++++---------
 2 files changed, 183 insertions(+), 200 deletions(-)

diff --git a/calculate_average_dmitry-midokura.sh b/calculate_average_dmitry-midokura.sh
index e4d1366db..1bb529b8d 100755
--- a/calculate_average_dmitry-midokura.sh
+++ b/calculate_average_dmitry-midokura.sh
@@ -17,4 +17,5 @@
 
 
 #JAVA_OPTS="-verbose:gc"
+JAVA_OPTS="--enable-preview -Xmx128m -XX:+UseSerialGC -XX:-TieredCompilation"
 java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_bufistov $1 $2
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_bufistov.java b/src/main/java/dev/morling/onebrc/CalculateAverage_bufistov.java
index db6040385..178a6e11e 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_bufistov.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_bufistov.java
@@ -15,11 +15,17 @@
  */
 package dev.morling.onebrc;
 
+import sun.misc.Unsafe;
+
 import static java.lang.Math.toIntExact;
 
+import java.lang.foreign.Arena;
+import java.lang.reflect.Field;
 import java.nio.MappedByteBuffer;
 import java.nio.channels.FileChannel;
 import java.nio.charset.StandardCharsets;
+import java.nio.file.Paths;
+import java.time.Instant;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Comparator;
@@ -32,66 +38,6 @@
 import java.io.IOException;
 import java.util.concurrent.Future;
 
-class ResultRow {
-    byte[] station;
-
-    String stationString;
-    long min, max, count, suma;
-
-    ResultRow() {
-    }
-
-    ResultRow(byte[] station, long value) {
-        this.station = new byte[station.length];
-        System.arraycopy(station, 0, this.station, 0, station.length);
-        this.min = value;
-        this.max = value;
-        this.count = 1;
-        this.suma = value;
-    }
-
-    ResultRow(long value) {
-        this.min = value;
-        this.max = value;
-        this.count = 1;
-        this.suma = value;
-    }
-
-    void setStation(MappedByteBuffer byteBuffer, int startPosition, int endPosition) {
-        this.station = new byte[endPosition - startPosition];
-        byteBuffer.slice(startPosition, station.length).get(this.station, 0, station.length);
-    }
-
-    public String toString() {
-        stationString = new String(station, StandardCharsets.UTF_8);
-        return stationString + "=" + round(min / 10.0) + "/" + round(suma / 10.0 / count) + "/" + round(max / 10.0);
-    }
-
-    private double round(double value) {
-        return Math.round(value * 10.0) / 10.0;
-    }
-
-    ResultRow update(long newValue) {
-        this.count += 1;
-        this.suma += newValue;
-        if (newValue < this.min) {
-            this.min = newValue;
-        }
-        else if (newValue > this.max) {
-            this.max = newValue;
-        }
-        return this;
-    }
-
-    ResultRow merge(ResultRow another) {
-        this.count += another.count;
-        this.suma += another.suma;
-        this.min = Math.min(this.min, another.min);
-        this.max = Math.max(this.max, another.max);
-        return this;
-    }
-}
-
 class ByteArrayWrapper {
     private final byte[] data;
 
@@ -110,100 +56,176 @@ public int hashCode() {
     }
 }
 
-class OpenHash {
-    ResultRow[] data;
-    int dataSizeMask;
+public class CalculateAverage_bufistov {
 
-    // ResultRow metrics = new ResultRow();
+    static class ResultRow {
+        byte[] station;
 
-    public OpenHash(int capacityPow2) {
-        assert capacityPow2 <= 20;
-        int dataSize = 1 << capacityPow2;
-        dataSizeMask = dataSize - 1;
-        data = new ResultRow[dataSize];
-    }
+        String stationString;
+        long min, max, count, suma;
 
-    int hashByteArray(byte[] array) {
-        int result = 0;
-        long mask = 0;
-        for (int i = 0; i < array.length; ++i, mask = ((mask + 1) & 3)) {
-            result += array[i] << mask;
+        ResultRow() {
         }
-        return result & dataSizeMask;
-    }
 
-    void merge(byte[] station, long value, int hashValue) {
-        while (data[hashValue] != null && !Arrays.equals(station, data[hashValue].station)) {
-            hashValue += 1;
-            hashValue &= dataSizeMask;
+        ResultRow(byte[] station, long value) {
+            this.station = new byte[station.length];
+            System.arraycopy(station, 0, this.station, 0, station.length);
+            this.min = value;
+            this.max = value;
+            this.count = 1;
+            this.suma = value;
         }
-        if (data[hashValue] == null) {
-            data[hashValue] = new ResultRow(station, value);
+
+        ResultRow(long value) {
+            this.min = value;
+            this.max = value;
+            this.count = 1;
+            this.suma = value;
         }
-        else {
-            data[hashValue].update(value);
+
+        void setStation(long startPosition, long endPosition) {
+            this.station = new byte[(int) (endPosition - startPosition)];
+            for (int i = 0; i < this.station.length; ++i) {
+                this.station[i] = UNSAFE.getByte(startPosition + i);
+            }
         }
-        // metrics.update(delta);
-    }
 
-    void merge(byte[] station, long value) {
-        merge(station, value, hashByteArray(station));
-    }
+        public String toString() {
+            stationString = new String(station, StandardCharsets.UTF_8);
+            return stationString + "=" + round(min / 10.0) + "/" + round(suma / 10.0 / count) + "/" + round(max / 10.0);
+        }
 
-    void merge(MappedByteBuffer byteBuffer, final int startPosition, final int endPosition, int hashValue, final long value) {
-        while (data[hashValue] != null && !equalsToStation(byteBuffer, startPosition, endPosition, data[hashValue].station)) {
-            hashValue += 1;
-            hashValue &= dataSizeMask;
+        private double round(double value) {
+            return Math.round(value * 10.0) / 10.0;
         }
-        if (data[hashValue] == null) {
-            data[hashValue] = new ResultRow(value);
-            data[hashValue].setStation(byteBuffer, startPosition, endPosition);
+
+        void update(long newValue) {
+            this.count += 1;
+            this.suma += newValue;
+            if (newValue < this.min) {
+                this.min = newValue;
+            }
+            else if (newValue > this.max) {
+                this.max = newValue;
+            }
         }
-        else {
-            data[hashValue].update(value);
+
+        ResultRow merge(ResultRow another) {
+            this.count += another.count;
+            this.suma += another.suma;
+            this.min = Math.min(this.min, another.min);
+            this.max = Math.max(this.max, another.max);
+            return this;
         }
     }
 
-    boolean equalsToStation(MappedByteBuffer byteBuffer, int startPosition, int endPosition, byte[] station) {
-        if (endPosition - startPosition != station.length) {
-            return false;
+    static class OpenHash {
+        ResultRow[] data;
+        int dataSizeMask;
+
+        // ResultRow metrics = new ResultRow();
+
+        public OpenHash(int capacityPow2) {
+            assert capacityPow2 <= 20;
+            int dataSize = 1 << capacityPow2;
+            dataSizeMask = dataSize - 1;
+            data = new ResultRow[dataSize];
         }
-        for (int i = 0; i < station.length; ++i, ++startPosition) {
-            if (byteBuffer.get(startPosition) != station[i])
+
+        int hashByteArray(byte[] array) {
+            int result = 0;
+            long mask = 0;
+            for (int i = 0; i < array.length; ++i, mask = ((mask + 1) & 3)) {
+                result += array[i] << mask;
+            }
+            return result & dataSizeMask;
+        }
+
+        void merge(byte[] station, long value, int hashValue) {
+            while (data[hashValue] != null && !Arrays.equals(station, data[hashValue].station)) {
+                hashValue += 1;
+                hashValue &= dataSizeMask;
+            }
+            if (data[hashValue] == null) {
+                data[hashValue] = new ResultRow(station, value);
+            }
+            else {
+                data[hashValue].update(value);
+            }
+            // metrics.update(delta);
+        }
+
+        void merge(byte[] station, long value) {
+            merge(station, value, hashByteArray(station));
+        }
+
+        void merge(final long startPosition, long endPosition, int hashValue, long value) {
+            while (data[hashValue] != null && !equalsToStation(startPosition, endPosition, data[hashValue].station)) {
+                hashValue += 1;
+                hashValue &= dataSizeMask;
+            }
+            if (data[hashValue] == null) {
+                data[hashValue] = new ResultRow(value);
+                data[hashValue].setStation(startPosition, endPosition);
+            }
+            else {
+                data[hashValue].update(value);
+            }
+        }
+
+        boolean equalsToStation(long startPosition, long endPosition, byte[] station) {
+            if (endPosition - startPosition != station.length) {
                 return false;
+            }
+            for (int i = 0; i < station.length; ++i, ++startPosition) {
+                if (UNSAFE.getByte(startPosition) != station[i])
+                    return false;
+            }
+            return true;
         }
-        return true;
-    }
 
-    HashMap<ByteArrayWrapper, ResultRow> toJavaHashMap() {
-        HashMap<ByteArrayWrapper, ResultRow> result = new HashMap<>(20000);
-        for (int i = 0; i < data.length; ++i) {
-            if (data[i] != null) {
-                var key = new ByteArrayWrapper(data[i].station);
-                result.put(key, data[i]);
+        HashMap<ByteArrayWrapper, ResultRow> toJavaHashMap() {
+            HashMap<ByteArrayWrapper, ResultRow> result = new HashMap<>(20000);
+            for (int i = 0; i < data.length; ++i) {
+                if (data[i] != null) {
+                    var key = new ByteArrayWrapper(data[i].station);
+                    result.put(key, data[i]);
+                }
             }
+            return result;
         }
-        return result;
     }
-}
 
-public class CalculateAverage_bufistov {
+    static final Unsafe UNSAFE;
+
+    static {
+        try {
+            Field unsafe = Unsafe.class.getDeclaredField("theUnsafe");
+            unsafe.setAccessible(true);
+            UNSAFE = (Unsafe) unsafe.get(Unsafe.class);
+        }
+        catch (Throwable e) {
+            throw new RuntimeException(e);
+        }
+    }
 
     static final long LINE_SEPARATOR = '\n';
 
     public static class FileRead implements Callable<HashMap<ByteArrayWrapper, ResultRow>> {
 
         private final FileChannel fileChannel;
+
         private long currentLocation;
-        private int bytesToRead;
+        private long bytesToRead;
+
+        private static final int hashCapacityPow2 = 18;
 
-        private final int hashCapacityPow2 = 18;
-        private final int hashCapacityMask = (1 << hashCapacityPow2) - 1;
+        static final int hashCapacityMask = (1 << hashCapacityPow2) - 1;
 
-        public FileRead(long startLocation, int bytesToRead, FileChannel fileChannel) {
+        public FileRead(FileChannel fileChannel, long startLocation, long bytesToRead, boolean firstSegment) {
+            this.fileChannel = fileChannel;
             this.currentLocation = startLocation;
             this.bytesToRead = bytesToRead;
-            this.fileChannel = fileChannel;
         }
 
         @Override
@@ -211,21 +233,13 @@ public HashMap<ByteArrayWrapper, ResultRow> call() throws IOException {
             try {
                 OpenHash openHash = new OpenHash(hashCapacityPow2);
                 log("Reading the channel: " + currentLocation + ":" + bytesToRead);
-                byte[] suffix = new byte[128];
                 if (currentLocation > 0) {
-                    toLineBegin(suffix);
-                }
-                while (bytesToRead > 0) {
-                    int bufferSize = Math.min(1 << 24, bytesToRead);
-                    MappedByteBuffer byteBuffer = fileChannel.map(FileChannel.MapMode.READ_ONLY, currentLocation, bufferSize);
-                    bytesToRead -= bufferSize;
-                    currentLocation += bufferSize;
-                    int suffixBytes = 0;
-                    if (currentLocation < fileChannel.size()) {
-                        suffixBytes = toLineBegin(suffix);
-                    }
-                    processChunk(byteBuffer, bufferSize, suffix, suffixBytes, openHash);
+                    toLineBeginPrefix();
                 }
+                toLineBeginSuffix();
+                var memorySegment = fileChannel.map(FileChannel.MapMode.READ_ONLY, currentLocation, bytesToRead, Arena.global());
+                currentLocation = memorySegment.address();
+                processChunk(openHash);
                 log("Done Reading the channel: " + currentLocation + ":" + bytesToRead);
                 return openHash.toJavaHashMap();
             }
@@ -240,39 +254,40 @@ byte getByte(long position) throws IOException {
             return byteBuffer.get();
         }
 
-        int toLineBegin(byte[] suffix) throws IOException {
-            int bytesConsumed = 0;
-            if (getByte(currentLocation - 1) != LINE_SEPARATOR) {
-                while (getByte(currentLocation) != LINE_SEPARATOR) { // Small bug here if last chunk is less than a line and has no '\n' at the end. Valid input should have '\n' for all rows.
-                    suffix[bytesConsumed++] = getByte(currentLocation);
-                    ++currentLocation;
-                    --bytesToRead;
-                }
+        void toLineBeginPrefix() throws IOException {
+            while (getByte(currentLocation - 1) != LINE_SEPARATOR) {
                 ++currentLocation;
                 --bytesToRead;
             }
-            return bytesConsumed;
         }
 
-        void processChunk(MappedByteBuffer byteBuffer, int bufferSize, byte[] suffix, int suffixBytes, OpenHash result) {
-            int nameBegin = 0;
-            int nameEnd = -1;
-            int numberBegin = -1;
+        void toLineBeginSuffix() throws IOException {
+            while (getByte(currentLocation + bytesToRead - 1) != LINE_SEPARATOR) {
+                ++bytesToRead;
+            }
+        }
+
+        void processChunk(OpenHash result) {
+            long nameBegin = currentLocation;
+            long nameEnd = -1;
+            long numberBegin = -1;
             int currentHash = 0;
             int currentMask = 0;
             int nameHash = 0;
-            for (int currentPosition = 0; currentPosition < bufferSize; ++currentPosition) {
-                byte nextByte = byteBuffer.get(currentPosition);
+            long end = currentLocation + bytesToRead;
+            byte nextByte;
+            for (; currentLocation < end; ++currentLocation) {
+                nextByte = UNSAFE.getByte(currentLocation);
                 if (nextByte == ';') {
-                    nameEnd = currentPosition;
-                    numberBegin = currentPosition + 1;
+                    nameEnd = currentLocation;
+                    numberBegin = currentLocation + 1;
                     nameHash = currentHash & hashCapacityMask;
                 }
                 else if (nextByte == LINE_SEPARATOR) {
-                    long value = getValue(byteBuffer, numberBegin, currentPosition);
-                    // log("Station name: '" + getStationName(byteBuffer, nameBegin, nameEnd) + "' value: " + value + " hash: " + nameHash);
-                    result.merge(byteBuffer, nameBegin, nameEnd, nameHash, value);
-                    nameBegin = currentPosition + 1;
+                    long value = getValue(numberBegin, currentLocation);
+                    // log("Station name: '" + getStationName(nameBegin, nameEnd) + "' value: " + value + " hash: " + nameHash);
+                    result.merge(nameBegin, nameEnd, nameHash, value);
+                    nameBegin = currentLocation + 1;
                     currentHash = 0;
                     currentMask = 0;
                 }
@@ -281,38 +296,14 @@ else if (nextByte == LINE_SEPARATOR) {
                     currentMask = (currentMask + 1) & 3;
                 }
             }
-            if (nameBegin < bufferSize) {
-                byte[] lastLine = new byte[bufferSize - nameBegin + suffixBytes];
-                byte[] prefix = new byte[bufferSize - nameBegin];
-                byteBuffer.slice(nameBegin, prefix.length).get(prefix, 0, prefix.length);
-                System.arraycopy(prefix, 0, lastLine, 0, prefix.length);
-                System.arraycopy(suffix, 0, lastLine, prefix.length, suffixBytes);
-                processLastLine(lastLine, result);
-            }
         }
 
-        void processLastLine(byte[] lastLine, OpenHash result) {
-            int numberBegin = -1;
-            byte[] stationName = null;
-            for (int i = 0; i < lastLine.length; ++i) {
-                if (lastLine[i] == ';') {
-                    stationName = new byte[i];
-                    System.arraycopy(lastLine, 0, stationName, 0, stationName.length);
-                    numberBegin = i + 1;
-                    break;
-                }
-            }
-            long value = getValue(lastLine, numberBegin);
-            // log("Station name: '" + new String(stationName, StandardCharsets.UTF_8) + "' value: " + value);
-            result.merge(stationName, value);
-        }
-
-        long getValue(MappedByteBuffer byteBuffer, int startLocation, int endLocation) {
-            byte nextByte = byteBuffer.get(startLocation);
+        long getValue(long startLocation, long endLocation) {
+            byte nextByte = UNSAFE.getByte(startLocation);
             boolean negate = nextByte == '-';
             long result = negate ? 0 : nextByte - '0';
-            for (int i = startLocation + 1; i < endLocation; ++i) {
-                nextByte = byteBuffer.get(i);
+            for (long i = startLocation + 1; i < endLocation; ++i) {
+                nextByte = UNSAFE.getByte(i);
                 if (nextByte != '.') {
                     result *= 10;
                     result += nextByte - '0';
@@ -321,23 +312,11 @@ long getValue(MappedByteBuffer byteBuffer, int startLocation, int endLocation) {
             return negate ? -result : result;
         }
 
-        long getValue(byte[] lastLine, int startLocation) {
-            byte nextByte = lastLine[startLocation];
-            boolean negate = nextByte == '-';
-            long result = negate ? 0 : nextByte - '0';
-            for (int i = startLocation + 1; i < lastLine.length; ++i) {
-                nextByte = lastLine[i];
-                if (nextByte != '.') {
-                    result *= 10;
-                    result += nextByte - '0';
-                }
+        String getStationName(long from, long to) {
+            byte[] bytes = new byte[(int) (to - from)];
+            for (int i = 0; i < bytes.length; ++i) {
+                bytes[i] = UNSAFE.getByte(from + i);
             }
-            return negate ? -result : result;
-        }
-
-        String getStationName(MappedByteBuffer byteBuffer, int from, int to) {
-            byte[] bytes = new byte[to - from];
-            byteBuffer.slice(from, to - from).get(0, bytes);
             return new String(bytes, StandardCharsets.UTF_8);
         }
     }
@@ -349,7 +328,7 @@ public static void main(String[] args) throws Exception {
         }
         log("InputFile: " + fileName);
         FileInputStream fileInputStream = new FileInputStream(fileName);
-        int numThreads = 32;
+        int numThreads = 2 * Runtime.getRuntime().availableProcessors();
         if (args.length > 1) {
             numThreads = Integer.parseInt(args[1]);
         }
@@ -363,9 +342,12 @@ public static void main(String[] args) throws Exception {
 
         long startLocation = 0;
         ArrayList<Future<HashMap<ByteArrayWrapper, ResultRow>>> results = new ArrayList<>(numThreads);
+        var fileChannel = FileChannel.open(Paths.get(fileName));
+        boolean firstSegment = true;
         while (remaining_size > 0) {
             long actualSize = Math.min(chunk_size, remaining_size);
-            results.add(executor.submit(new FileRead(startLocation, toIntExact(actualSize), channel)));
+            results.add(executor.submit(new FileRead(fileChannel, startLocation, toIntExact(actualSize), firstSegment)));
+            firstSegment = false;
             remaining_size -= actualSize;
             startLocation += actualSize;
         }

From 94e29982f909b96fc57204acadd9fe31d6f37def Mon Sep 17 00:00:00 2001
From: Roman Stoffel <roman.stoffel@gamlor.info>
Date: Thu, 25 Jan 2024 23:12:10 +0100
Subject: [PATCH 134/268] Updates for gamlerhart: Simpler & Faster (#580)

* Update with Rounding Bugfix

* Simplification of Merging Results

* More Plain Java Code for Value Storage

* Improve Performance by Stupid Hash

Drop around 3 seconds on my machine by
simplifying the hash to be ridiculously stupid,
but faster.

* Fix outdated comment
---
 .../onebrc/CalculateAverage_gamlerhart.java   | 217 +++++++-----------
 1 file changed, 86 insertions(+), 131 deletions(-)

diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_gamlerhart.java b/src/main/java/dev/morling/onebrc/CalculateAverage_gamlerhart.java
index 2f73a3348..5d0a4bdbb 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_gamlerhart.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_gamlerhart.java
@@ -24,7 +24,10 @@
 import java.nio.channels.FileChannel;
 import java.nio.file.Path;
 import java.util.ArrayList;
+import java.util.Iterator;
 import java.util.TreeMap;
+import java.util.stream.Collector;
+import java.util.stream.Collectors;
 
 import static java.lang.Double.doubleToRawLongBits;
 import static java.lang.Double.longBitsToDouble;
@@ -69,19 +72,16 @@ public static void main(String[] args) throws Exception {
             ArrayList<Section> sections = splitFileIntoSections(fileSize, fileContent);
 
             var loopBound = byteVec.loopBound(fileSize) - vecLen;
-            PrivateHashMap result = sections.stream()
+            var result = sections.stream()
                     .parallel()
                     .map(s -> {
                         return parseSection(s.start, s.end, loopBound, fileContent);
-                    }).reduce((mine, other) -> {
-                        assert mine != other;
-                        mine.mergeFrom(fileContent, other);
-                        return mine;
-                    })
-                    .get();
+                    });
 
             var measurements = new TreeMap<String, ResultRow>();
-            result.fill(fileContent, measurements);
+            result.forEachOrdered(m -> {
+                m.fillMerge(fileContent, measurements);
+            });
             System.out.println(measurements);
         }
     }
@@ -160,11 +160,22 @@ private static class PrivateHashMap {
         // Encoding:
         // - Key: long
         // - 48 bits index, 16 bits length
-        // - min: double
-        // - max: double
-        // - sum: double
-        // - double: double
-        final long[] keyValues = new long[SIZE * 5];
+        final long[] keys = new long[SIZE];
+        final Value[] values = new Value[SIZE];
+
+        private class Value {
+            public Value(double min, double max, double sum, long count) {
+                this.min = min;
+                this.max = max;
+                this.sum = sum;
+                this.count = count;
+            }
+
+            public double min;
+            public double max;
+            public double sum;
+            public long count;
+        }
 
         // int debug_size = 0;
 
@@ -179,43 +190,40 @@ public void add(MemorySegment file, long pos, int len, double val) {
         }
 
         private static int calculateHash(MemorySegment file, long pos, int len) {
-            int hashCode = 1;
-            int i = 0;
-            int intBound = (len / 4) * 4;
-            for (; i < intBound; i += 4) {
-                int v = file.get(INT_UNALIGNED_BIG_ENDIAN, pos + i);
-                hashCode = 31 * hashCode + v;
+            if (len > 4) {
+                return file.get(INT_UNALIGNED_BIG_ENDIAN, pos) + 31 * len;
             }
-            for (; i < len; i++) {
-                int v = file.get(JAVA_BYTE, pos + i);
-                hashCode = 31 * hashCode + v;
+            else {
+                int hashCode = len;
+                int i = 0;
+                for (; i < len; i++) {
+                    int v = file.get(JAVA_BYTE, pos + i);
+                    hashCode = 31 * hashCode + v;
+                }
+                return hashCode;
             }
-            return hashCode;
         }
 
         private void doAdd(MemorySegment file, int hash, long pos, int len, double val) {
             int slot = hash & MASK;
             for (var probe = 0; probe < 20000; probe++) {
-                var iSl = ((slot + probe) & MASK) * 5;
-                var slotEntry = keyValues[iSl];
+                var iSl = ((slot + probe) & MASK);
+                var slotEntry = keys[iSl];
 
                 var emtpy = slotEntry == 0;
                 if (emtpy) {
                     long keyInfo = pos << SHIFT_POS | len;
-                    long valueBits = doubleToRawLongBits(val);
-                    keyValues[iSl] = keyInfo;
-                    keyValues[iSl + 1] = valueBits;
-                    keyValues[iSl + 2] = valueBits;
-                    keyValues[iSl + 3] = valueBits;
-                    keyValues[iSl + 4] = 1;
+                    keys[iSl] = keyInfo;
+                    values[iSl] = new Value(val, val, val, 1);
                     // debug_size++;
                     return;
                 }
                 else if (isSameEntry(file, slotEntry, pos, len)) {
-                    keyValues[iSl + 1] = doubleToRawLongBits(Math.min(longBitsToDouble(keyValues[iSl + 1]), val));
-                    keyValues[iSl + 2] = doubleToRawLongBits(Math.max(longBitsToDouble(keyValues[iSl + 2]), val));
-                    keyValues[iSl + 3] = doubleToRawLongBits(longBitsToDouble(keyValues[iSl + 3]) + val);
-                    keyValues[iSl + 4] = keyValues[iSl + 4] + 1;
+                    var vE = values[iSl];
+                    vE.min = Math.min(vE.min, val);
+                    vE.max = Math.max(vE.max, val);
+                    vE.sum = vE.sum + val;
+                    vE.count++;
                     return;
                 }
                 else {
@@ -268,118 +276,65 @@ private static boolean isSame(MemorySegment file, long i1, long i2, int len) {
             return true;
         }
 
-        public PrivateHashMap mergeFrom(MemorySegment file, PrivateHashMap other) {
-            for (int slot = 0; slot < other.keyValues.length / 5; slot++) {
-                int srcI = slot * 5;
-                long keyE = other.keyValues[srcI];
-                if (keyE != 0) {
-                    long oPos = (keyE & MASK_POS) >> SHIFT_POS;
-                    int oLen = (int) (keyE & MASK_LEN);
-                    addMerge(file, other, srcI, oPos, oLen);
-                }
-            }
-            return this;
-        }
-
-        private void addMerge(MemorySegment file, PrivateHashMap other, int srcI, long oPos, int oLen) {
-            int slot = calculateHash(file, oPos, oLen) & MASK;
-            for (var probe = 0; probe < 20000; probe++) {
-                var iSl = ((slot + probe) & MASK) * 5;
-                var slotEntry = keyValues[iSl];
-
-                var emtpy = slotEntry == 0;
-                // var debugKey = new String(file.asSlice(oPos, oLen).toArray(JAVA_BYTE));
-                if (emtpy) {
-                    // if (debugKey.equals("Cabo San Lucas")) {
-                    // System.out.println("=> VALUES (init) " + debugKey + "@" + iSl + " max: " + longBitsToDouble(other.keyValues[srcI + 2]) + "," + longBitsToDouble(keyValues[iSl + 2]));
-                    // }
-                    keyValues[iSl] = other.keyValues[srcI];
-                    keyValues[iSl + 1] = other.keyValues[srcI + 1];
-                    keyValues[iSl + 2] = other.keyValues[srcI + 2];
-                    keyValues[iSl + 3] = other.keyValues[srcI + 3];
-                    keyValues[iSl + 4] = other.keyValues[srcI + 4];
-                    // debug_size++;
-                    return;
-                }
-                else if (isSameEntry(file, slotEntry, oPos, oLen)) {
-                    // if (debugKey.equals("Cabo San Lucas")) {
-                    // System.out.println("=> VALUES (merge) " + "@" + iSl + debugKey + " max: " + longBitsToDouble(other.keyValues[srcI + 2]) + ","
-                    // + longBitsToDouble(keyValues[iSl + 2]) + "=> "
-                    // + Math.max(longBitsToDouble(keyValues[iSl + 2]), longBitsToDouble(other.keyValues[srcI + 2])));
-                    // }
-                    keyValues[iSl + 1] = doubleToRawLongBits(Math.min(longBitsToDouble(keyValues[iSl + 1]), longBitsToDouble(other.keyValues[srcI + 1])));
-                    keyValues[iSl + 2] = doubleToRawLongBits(Math.max(longBitsToDouble(keyValues[iSl + 2]), longBitsToDouble(other.keyValues[srcI + 2])));
-                    keyValues[iSl + 3] = doubleToRawLongBits(longBitsToDouble(keyValues[iSl + 3]) + longBitsToDouble(other.keyValues[srcI + 3]));
-                    keyValues[iSl + 4] = keyValues[iSl + 4] + other.keyValues[srcI + 4];
-                    // if (debugKey.equals("Cabo San Lucas")) {
-                    // System.out.println("=> VALUES (after-merge) self: "+ "@" + iSl + System.identityHashCode(this) + ":"+ debugKey + " max: " +
-                    // + longBitsToDouble(keyValues[iSl + 2]) + "=> ");
-                    // }
-                    return;
-                }
-                else {
-                    // long keyPos = (slotEntry & MASK_POS) >> SHIFT_POS;
-                    // int keyLen = (int) (slotEntry & MASK_LEN);
-                    // System.out.println("Colliding " + new String(file.asSlice(pos,len).toArray(ValueLayout.JAVA_BYTE)) +
-                    // " with key" + new String(file.asSlice(keyPos,keyLen).toArray(ValueLayout.JAVA_BYTE)) +
-                    // " hash " + hash + " slot " + slot + "+" + probe + " at " + iSl);
-                    // debug_reprobeMax = Math.max(debug_reprobeMax, probe);
-                }
-            }
-            throw new IllegalStateException("More than 20000 reprobes");
-        }
-
-        public void fill(MemorySegment file, TreeMap<String, ResultRow> treeMap) {
-            for (int i = 0; i < keyValues.length / 5; i++) {
-                var ji = i * 5;
-                long keyE = keyValues[ji];
+        public void fillMerge(MemorySegment file, TreeMap<String, ResultRow> treeMap) {
+            for (int i = 0; i < keys.length; i++) {
+                var ji = i;
+                long keyE = keys[ji];
                 if (keyE != 0) {
                     long keyPos = (keyE & MASK_POS) >> SHIFT_POS;
                     int keyLen = (int) (keyE & MASK_LEN);
                     byte[] keyBytes = new byte[keyLen];
                     MemorySegment.copy(file, JAVA_BYTE, keyPos, keyBytes, 0, keyLen);
                     var key = new String(keyBytes);
-                    var min = longBitsToDouble(keyValues[ji + 1]);
-                    var max = longBitsToDouble(keyValues[ji + 2]);
-                    var sum = longBitsToDouble(keyValues[ji + 3]);
-                    var count = keyValues[ji + 4];
-                    treeMap.put(key, new ResultRow(min, sum / count, max));
+                    var vE = values[ji];
+                    var min = vE.min;
+                    var max = vE.max;
+                    var sum = vE.sum;
+                    var count = vE.count;
+                    treeMap.compute(key, (k, e) -> {
+                        if (e == null) {
+                            return new ResultRow(min, max, sum, count);
+                        }
+                        else {
+                            return new ResultRow(Math.min(e.min, min), Math.max(e.max, max), e.sum + sum, e.count + count);
+                        }
+                    });
                 }
             }
         }
 
-        public String debugPrint(MemorySegment file) {
-            StringBuilder b = new StringBuilder();
-            for (int i = 0; i < keyValues.length / 5; i++) {
-                var ji = i * 5;
-                long keyE = keyValues[ji];
-                if (keyE != 0) {
-                    long keyPos = (keyE & MASK_POS) >> SHIFT_POS;
-                    int keyLen = (int) (keyE & MASK_LEN);
-                    byte[] keyBytes = new byte[keyLen];
-                    MemorySegment.copy(file, JAVA_BYTE, keyPos, keyBytes, 0, keyLen);
-                    var key = new String(keyBytes);
-                    var min = longBitsToDouble(keyValues[ji + 1]);
-                    var max = longBitsToDouble(keyValues[ji + 2]);
-                    var sum = longBitsToDouble(keyValues[ji + 3]);
-                    var count = keyValues[ji + 4];
-                    b.append("{").append(key).append("@").append(ji)
-                            .append(",").append(min)
-                            .append(",").append(max)
-                            .append(",").append(sum)
-                            .append(",").append(count).append("},");
-                }
-            }
-            return b.toString();
-        }
+        // public String debugPrint(MemorySegment file) {
+        // StringBuilder b = new StringBuilder();
+        // for (int i = 0; i < keyValues.length / 5; i++) {
+        // var ji = i * 5;
+        // long keyE = keyValues[ji];
+        // if (keyE != 0) {
+        // long keyPos = (keyE & MASK_POS) >> SHIFT_POS;
+        // int keyLen = (int) (keyE & MASK_LEN);
+        // byte[] keyBytes = new byte[keyLen];
+        // MemorySegment.copy(file, JAVA_BYTE, keyPos, keyBytes, 0, keyLen);
+        // var key = new String(keyBytes);
+        // var min = longBitsToDouble(keyValues[ji + 1]);
+        // var max = longBitsToDouble(keyValues[ji + 2]);
+        // var sum = longBitsToDouble(keyValues[ji + 3]);
+        // var count = keyValues[ji + 4];
+        // b.append("{").append(key).append("@").append(ji)
+        // .append(",").append(min)
+        // .append(",").append(max)
+        // .append(",").append(sum)
+        // .append(",").append(count).append("},");
+        // }
+        // }
+        // return b.toString();
+        // }
     }
 
     record Section(long start, long end) {
     }
 
-    private static record ResultRow(double min, double mean, double max) {
+    private static record ResultRow(double min, double max, double sum, long count) {
         public String toString() {
-            return round(min) + "/" + round(mean) + "/" + round(max);
+            return round(min) + "/" + round(((Math.round(sum * 10.0) / 10.0) / count)) + "/" + round(max);
         }
 
         private double round(double value) {

From cb7423d386dda99bd1df6ca7005c1ce4edc9444b Mon Sep 17 00:00:00 2001
From: Alberto Venturini <aventurini@gmail.com>
Date: Fri, 26 Jan 2024 00:17:39 +0200
Subject: [PATCH 135/268] Contribution by albertoventurini (#578)

* Contribution by albertoventurini

* Shave off a couple of hundred milliseconds by making an assumption about temperature readings

* Parse reading without loop, inspired by other solutions

* Use all cores

* Small improvements, only allocate 247 positions instead of 256

---------

Co-authored-by: Alberto Venturini <alberto.venturini@accso.de>
---
 calculate_average_albertoventurini.sh         |  19 ++
 .../CalculateAverage_albertoventurini.java    | 299 ++++++++++++++++++
 2 files changed, 318 insertions(+)
 create mode 100755 calculate_average_albertoventurini.sh
 create mode 100644 src/main/java/dev/morling/onebrc/CalculateAverage_albertoventurini.java

diff --git a/calculate_average_albertoventurini.sh b/calculate_average_albertoventurini.sh
new file mode 100755
index 000000000..d997264b0
--- /dev/null
+++ b/calculate_average_albertoventurini.sh
@@ -0,0 +1,19 @@
+#!/bin/sh
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+JAVA_OPTS="-server -Xnoclassgc" # select the server VM and disable garbage collection of classes
+java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_albertoventurini
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_albertoventurini.java b/src/main/java/dev/morling/onebrc/CalculateAverage_albertoventurini.java
new file mode 100644
index 000000000..406c75985
--- /dev/null
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_albertoventurini.java
@@ -0,0 +1,299 @@
+/*
+ *  Copyright 2023 The original authors
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package dev.morling.onebrc;
+
+import java.io.EOFException;
+import java.io.IOException;
+import java.io.RandomAccessFile;
+import java.util.concurrent.CountDownLatch;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+
+/**
+ * == File reading ==
+ * The file is read using RandomAccessFile, and split into chunks. Each thread is assigned a chunk.
+ * E.g. if the file size is 100, and we have two threads, the first thread will read from 0 to 49,
+ * the second from 50 to 99.
+ * Each chunk is aligned to the next end-of-line (or to the end-of-file), so that each thread
+ * consumes full input lines.
+ * Further, each file chunk is split into smaller pieces (byte arrays), with each piece up to 2^22 bytes.
+ * This particular size seems to work best on my machine.
+ * == Data structure ==
+ * Each thread stores its results in a prefix tree (trie). Each node in the trie represents
+ * one byte of a location's name. Non-ASCII characters are represented by multiple nodes in the trie.
+ * The node reached by a name's last byte (not necessarily a leaf) holds the statistics for that location.
+ */
+public class CalculateAverage_albertoventurini {
+
+    // Number of child slots per trie node (byte values 0..0xF6). Well-formed UTF-8 never contains bytes above 0xF4, so this covers every byte of a valid name.
+    private static final int MAX_UTF8_BYTE_VALUE = 0xF7;
+
+    // Node of the prefix tree (trie) in which each thread stores its results.
+    // Each edge is one byte (NOT character) of a location name, so traversing the
+    // children in index order visits the names in byte-wise ascending order.
+    // 'sum' is a long so that accumulating many readings (kept in tenths) cannot overflow.
+    private static final class TrieNode {
+        final TrieNode[] children = new TrieNode[MAX_UTF8_BYTE_VALUE];
+        int min = Integer.MAX_VALUE;
+        int max = Integer.MIN_VALUE;
+        long sum;
+        int count;
+    }
+
+    private static final int TWO_BYTE_TO_INT = 480 + 48; // '0' * 10 + '0': removes the ASCII offset of a two-digit reading
+    private static final int THREE_BYTE_TO_INT = 4800 + 480 + 48; // '0' * 100 + '0' * 10 + '0': same for three digits
+
+    // Consume every line of the chunk behind 'cr' and accumulate the per-location
+    // statistics (readings are kept as integers in tenths) in the trie rooted at 'root'.
+    private static void processChunk(final TrieNode root, final ChunkReader cr) {
+        while (cr.hasNext()) {
+            // Walk the trie along the bytes of the location name, creating nodes on demand
+            TrieNode current = root;
+            for (int key = cr.getNext() & 0xFF; key != ';'; key = cr.getNext() & 0xFF) {
+                TrieNode child = current.children[key];
+                if (child == null) {
+                    child = new TrieNode();
+                    current.children[key] = child;
+                }
+                current = child;
+            }
+
+            // A reading looks like n.n, nn.n, -n.n or -nn.n, so at least four bytes
+            // (including the line terminator) follow the ';' and identify its shape
+            final byte c1 = cr.getNext();
+            final byte c2 = cr.getNext();
+            final byte c3 = cr.getNext();
+            final byte c4 = cr.getNext();
+
+            final int value;
+            if (c2 == '.') { // n.n, c4 is the line terminator
+                value = c1 * 10 + c3 - TWO_BYTE_TO_INT;
+            }
+            else {
+                if (c4 == '.') { // -nn.n, the fractional digit is still unread
+                    value = -(c2 * 100 + c3 * 10 + cr.getNext() - THREE_BYTE_TO_INT);
+                }
+                else if (c1 == '-') { // -n.n
+                    value = -(c2 * 10 + c4 - TWO_BYTE_TO_INT);
+                }
+                else { // nn.n
+                    value = c1 * 100 + c2 * 10 + c4 - THREE_BYTE_TO_INT;
+                }
+                cr.getNext(); // consume the line terminator
+            }
+
+            current.min = Math.min(current.min, value);
+            current.max = Math.max(current.max, value);
+            current.sum += value;
+            current.count++;
+        }
+    }
+
+    // Aggregates the per-thread tries and prints the final result.
+    // All tries are walked in lockstep, so the statistics that different threads
+    // collected for the same location are merged right before they are printed.
+    static class ResultPrinter {
+        // Contains the bytes for the current location name. 100 bytes should be enough
+        // to represent each location name encoded in UTF-8.
+        final byte[] bytes = new byte[100];
+
+        boolean firstOutput = true;
+
+        void printResults(final TrieNode[] roots) {
+            System.out.print("{");
+            printResultsRec(roots, bytes, 0);
+            System.out.println("}");
+        }
+
+        private static double round(long value) {
+            return value / 10.0; // tenths -> units; Math.round(long) would only detour through a lossy float
+        }
+
+        // Merge the statistics found at the same path of every trie, print them, then descend into the children.
+        private void printResultsRec(final TrieNode[] nodes, final byte[] bytes, final int index) {
+            long min = Long.MAX_VALUE;
+            long max = Long.MIN_VALUE;
+            long sum = 0;
+            long count = 0;
+
+            for (final TrieNode node : nodes) {
+                if (node != null && node.count > 0) {
+                    min = Math.min(min, node.min);
+                    max = Math.max(max, node.max);
+                    sum += node.sum;
+                    count += node.count;
+                }
+            }
+
+            if (count > 0) {
+                final String location = new String(bytes, 0, index, java.nio.charset.StandardCharsets.UTF_8); // names are UTF-8 regardless of the platform default
+                if (firstOutput) {
+                    firstOutput = false;
+                }
+                else {
+                    System.out.print(", ");
+                }
+                double mean = Math.round((double) sum / (double) count) / 10.0;
+                System.out.print(location + "=" + round(min) + "/" + mean + "/" + round(max));
+            }
+
+            for (int i = 0; i < MAX_UTF8_BYTE_VALUE; i++) {
+                final TrieNode[] childNodes = new TrieNode[nodes.length];
+                boolean shouldRecurse = false;
+                for (int j = 0; j < nodes.length; j++) {
+                    if (nodes[j] != null && nodes[j].children[i] != null) {
+                        childNodes[j] = nodes[j].children[i];
+
+                        // Only recurse if there's at least one trie that has non-null child for index 'i'.
+                        shouldRecurse = true;
+                    }
+                }
+                if (shouldRecurse) {
+                    bytes[index] = (byte) i;
+                    printResultsRec(childNodes, bytes, index + 1);
+                }
+
+            }
+        }
+    }
+
+    private static final String FILE = "./measurements.txt"; // input file, resolved against the working directory
+
+    // Sequential byte reader over the slice of 'file' that starts at 'chunkBegin'.
+    private static final class ChunkReader {
+        // Byte arrays of size 2^22 seemed to perform best on the author's machine.
+        private static final int BYTE_ARRAY_SIZE = 1 << 22;
+
+        private final RandomAccessFile file;
+        private final long chunkBegin;
+        private final long chunkLength;
+        private final byte[] bytes;
+
+        private long offset = 0; // bytes of the chunk consumed by all buffer fills before the current one
+        private int readBytes = 0; // number of bytes delivered by the most recent buffer fill
+        private int cursor = 0; // position of the next unread byte inside 'bytes'
+
+        ChunkReader(
+                    final RandomAccessFile file,
+                    final long chunkBegin,
+                    final long chunkLength) {
+            this.file = file;
+            this.chunkBegin = chunkBegin;
+            this.chunkLength = chunkLength;
+            this.bytes = new byte[(int) Math.min(chunkLength, BYTE_ARRAY_SIZE)];
+            readNextBytes();
+        }
+
+        // True while unread bytes of the chunk remain
+        boolean hasNext() {
+            return chunkLength > offset + cursor;
+        }
+
+        // Returns the next byte, refilling the buffer first when it is exhausted
+        byte getNext() {
+            if (readBytes <= cursor) {
+                readNextBytes();
+            }
+            return bytes[cursor++];
+        }
+
+        // Refills the buffer from the shared file; seek and read happen under the file's monitor
+        private void readNextBytes() {
+            offset += readBytes;
+            try {
+                synchronized (file) {
+                    file.seek(chunkBegin + offset);
+                    readBytes = file.read(bytes);
+                }
+            }
+            catch (IOException e) {
+                throw new RuntimeException(e);
+            }
+            cursor = 0;
+        }
+    }
+
+    // Split the file into 'count' consecutive chunks. Every boundary is placed right after a '\n'
+    // (or at end-of-file) so that each reader consumes full lines, and the chunks cover the whole file.
+    private static ChunkReader[] makeChunkReaders(
+                                                  final int count,
+                                                  final RandomAccessFile file)
+            throws Exception {
+        final ChunkReader[] chunkReaders = new ChunkReader[count];
+        final long fileLength = file.length();
+
+        // The nominal (unaligned) size of each chunk
+        final long chunkReaderSize = fileLength / count;
+
+        long previousPosition = 0;
+        for (int i = 0; i < count; i++) {
+            // The last chunk always runs to end-of-file: 'fileLength / count' rounds down, so aligning
+            // from 'chunkReaderSize * count' could stop at a '\n' that is followed by further lines.
+            long currentPosition = fileLength;
+            if (i < count - 1) {
+                // Go to the nominal end of the chunk, but never back before the previous boundary
+                file.seek(Math.max(previousPosition, chunkReaderSize * (i + 1)));
+                try {
+                    while (file.readByte() != '\n')
+                        ;
+                }
+                catch (EOFException ignored) {
+                    // No further newline: the chunk simply ends at end-of-file
+                }
+                currentPosition = file.getFilePointer();
+            }
+            chunkReaders[i] = new ChunkReader(file, previousPosition, currentPosition - previousPosition);
+            previousPosition = currentPosition;
+        }
+
+        return chunkReaders;
+    }
+
+    // Process every file chunk on its own worker thread, then use the 'ResultPrinter'
+    // class to aggregate and print the results. A failure in any worker is rethrown here.
+    private static void processWithChunkReaders() throws Exception {
+        try (var randomAccessFile = new RandomAccessFile(FILE, "r")) {
+            // Inputs smaller than 2^20 bytes are handled by a single thread
+            final int nThreads = randomAccessFile.length() < 1 << 20 ? 1 : Runtime.getRuntime().availableProcessors();
+
+            final ChunkReader[] chunkReaders = makeChunkReaders(nThreads, randomAccessFile);
+            final TrieNode[] roots = new TrieNode[nThreads];
+            for (int i = 0; i < nThreads; i++) {
+                roots[i] = new TrieNode();
+            }
+
+            final var results = new java.util.concurrent.Future<?>[nThreads];
+            // close() (called by try-with-resources) waits until all submitted tasks have finished
+            try (ExecutorService executorService = Executors.newFixedThreadPool(nThreads)) {
+                for (int i = 0; i < nThreads; i++) {
+                    final int idx = i;
+                    results[i] = executorService.submit(() -> processChunk(roots[idx], chunkReaders[idx]));
+                }
+            }
+            for (final var result : results) {
+                // get() throws an ExecutionException if the worker failed, instead of
+                // silently printing partial results (or waiting forever on a latch)
+                result.get();
+            }
+
+            new ResultPrinter().printResults(roots);
+        }
+    }
+
+    public static void main(String[] args) throws Exception {
+        processWithChunkReaders(); // reads ./measurements.txt and prints the aggregated results to standard output
+    }
+}
\ No newline at end of file

From 27b867d10d601180a8adcbde09f6c530ab22c9c0 Mon Sep 17 00:00:00 2001
From: gonix <d.giedrius+github@gmail.com>
Date: Fri, 26 Jan 2024 00:37:20 +0200
Subject: [PATCH 136/268] CalculateAverage_gonix update (#579)

Minor updates here and there, shaves off ~5% of execution time on my machine.
---
 .../onebrc/CalculateAverage_gonix.java        | 44 +++++++++----------
 1 file changed, 21 insertions(+), 23 deletions(-)

diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_gonix.java b/src/main/java/dev/morling/onebrc/CalculateAverage_gonix.java
index 90f43601d..572c272ca 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_gonix.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_gonix.java
@@ -79,13 +79,13 @@ private static List<MappedByteBuffer> buildChunks(RandomAccessFile file) throws
 
 class Aggregator {
     private static final int MAX_STATIONS = 10_000;
-    private static final int MAX_STATION_SIZE = (100 * 4) / 8 + 5;
+    private static final int MAX_STATION_SIZE = Math.ceilDiv(100, 8) + 5;
     private static final int INDEX_SIZE = 1024 * 1024;
     private static final int INDEX_MASK = INDEX_SIZE - 1;
-    private static final int FLD_MAX = 0;
-    private static final int FLD_MIN = 1;
-    private static final int FLD_SUM = 2;
-    private static final int FLD_COUNT = 3;
+    private static final int FLD_COUNT = 0;
+    private static final int FLD_SUM = 1;
+    private static final int FLD_MIN = 2;
+    private static final int FLD_MAX = 3;
 
     // Poor man's hash map: hash code to offset in `mem`.
     private final int[] index;
@@ -205,27 +205,15 @@ public Stream<Entry> stream() {
 
     private void add(ByteBuffer buf, int start, long tailAndLen, int hash, int measurement) {
         int idx = hash & INDEX_MASK;
-        while (true) {
-            if (index[idx] != 0) {
-                int offset = index[idx];
-                if (keyEqual(offset, buf, start, tailAndLen)) {
-                    int pos = offset + (int) (tailAndLen & 0xFF) + 1;
-                    mem[pos + FLD_MIN] = Math.min((int) measurement, (int) mem[pos + FLD_MIN]);
-                    mem[pos + FLD_MAX] = Math.max((int) measurement, (int) mem[pos + FLD_MAX]);
-                    mem[pos + FLD_SUM] += measurement;
-                    mem[pos + FLD_COUNT] += 1;
-                    return;
-                }
-            }
-            else {
-                index[idx] = create(buf, start, tailAndLen, hash, measurement);
+        for (; index[idx] != 0; idx = (idx + 1) & INDEX_MASK) {
+            if (update(index[idx], buf, start, tailAndLen, measurement)) {
                 return;
             }
-            idx = (idx + 1) & INDEX_MASK;
         }
+        index[idx] = create(buf, start, tailAndLen, measurement);
     }
 
-    private int create(ByteBuffer buf, int start, long tailAndLen, int hash, int measurement) {
+    private int create(ByteBuffer buf, int start, long tailAndLen, int measurement) {
         int offset = memUsed;
 
         mem[offset] = tailAndLen;
@@ -248,8 +236,8 @@ private int create(ByteBuffer buf, int start, long tailAndLen, int hash, int mea
         return offset;
     }
 
-    private boolean keyEqual(int offset, ByteBuffer buf, int start, long tailAndLen) {
-
+    private boolean update(int offset, ByteBuffer buf, int start, long tailAndLen, int measurement) {
+        var mem = this.mem;
         if (mem[offset] != tailAndLen) {
             return false;
         }
@@ -263,6 +251,16 @@ private boolean keyEqual(int offset, ByteBuffer buf, int start, long tailAndLen)
             memPos += 1;
             bufPos += 8;
         }
+
+        mem[memPos + FLD_COUNT] += 1;
+        mem[memPos + FLD_SUM] += measurement;
+        if (measurement < mem[memPos + FLD_MIN]) {
+            mem[memPos + FLD_MIN] = measurement;
+        }
+        if (measurement > mem[memPos + FLD_MAX]) {
+            mem[memPos + FLD_MAX] = measurement;
+        }
+
         return true;
     }
 

From 09b0d75477bf20de9a53a17cd793c34088c09304 Mon Sep 17 00:00:00 2001
From: Gunnar Morling <gunnar.morling@googlemail.com>
Date: Thu, 25 Jan 2024 23:37:52 +0100
Subject: [PATCH 137/268] Leaderboard update

---
 README.md | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index 4cfd4f45c..da6c33ee2 100644
--- a/README.md
+++ b/README.md
@@ -42,22 +42,23 @@ These are the results from running all entries into the challenge on eight cores
 | # | Result (m:s.ms) | Implementation     | JDK | Submitter     | Notes     |
 |---|-----------------|--------------------|-----|---------------|-----------|
 | 1 | 00:02.019 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java)| 21.0.2-graal | [Artsiom Korzun](https://github.com/artsiomkorzun) | GraalVM native binary, uses Unsafe |
-| 2* | 00:02.195 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_thomaswue.java)| 21.0.2-graal | [Thomas Wuerthinger](https://github.com/thomaswue), [Quan Anh Mai](https://github.com/merykitty), [Alfonso² Peterssen](https://github.com/mukel) | GraalVM native binary, uses Unsafe |
-| 2* | 00:02.196 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_royvanrijn.java)| 21.0.2-graal | [Roy van Rijn](https://github.com/royvanrijn) | GraalVM native binary, uses Unsafe |
-| 3 | 00:02.305 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java)| 21.0.2-graal | [Van Phu DO](https://github.com/abeobk) | GraalVM native binary, uses Unsafe |
+|   | 00:02.169 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java)| 21.0.2-graal | [Van Phu DO](https://github.com/abeobk) | GraalVM native binary, uses Unsafe |
+| 3* | 00:02.195 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_thomaswue.java)| 21.0.2-graal | [Thomas Wuerthinger](https://github.com/thomaswue), [Quan Anh Mai](https://github.com/merykitty), [Alfonso² Peterssen](https://github.com/mukel) | GraalVM native binary, uses Unsafe |
+| 3* | 00:02.196 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_royvanrijn.java)| 21.0.2-graal | [Roy van Rijn](https://github.com/royvanrijn) | GraalVM native binary, uses Unsafe |
 |   | 00:02.374 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jerrinot.java)| 21.0.1-graal | [Jaromir Hamala](https://github.com/jerrinot) | uses Unsafe |
 |   | 00:02.575 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_merykittyunsafe.java)| 21.0.1-open | [Quan Anh Mai](https://github.com/merykitty) | uses Unsafe |
 |   | 00:02.984 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_yourwass.java)| 21.0.1-open | [yourwass](https://github.com/yourwass) | uses Unsafe |
 |   | 00:03.258 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_merykitty.java)| 21.0.1-open | [Quan Anh Mai](https://github.com/merykitty) |  |
+|   | 00:03.298 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_vemana.java)| 21.0.1-graal | [Subrahmanyam (non-idiomatic)](https://github.com/vemana) | uses Unsafe |
 |   | 00:03.376 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_mtopolnik.java)| 21.0.1-graal | [Marko Topolnik](https://github.com/mtopolnik) | uses Unsafe |
 |   | 00:03.510 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_roman_r_m.java)| 21.0.1-graal | [Roman Musin](https://github.com/roman-r-m) | GraalVM native binary |
 |   | 00:03.518 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_JamalMulla.java)| 21.0.1-graal | [Jamal Mulla](https://github.com/JamalMulla) | GraalVM native binary, uses Unsafe |
 |   | 00:03.594 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_yavuztas.java)| 21.0.2-graal | [Yavuz Tas](https://github.com/yavuztas) | GraalVM native binary, uses Unsafe |
 |   | 00:03.714 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_hundredwatt.java)| 21.0.1-graal | [Jason Nochlin](https://github.com/hundredwatt) |  |
 |   | 00:03.718 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_zerninv.java)| 21.0.1-graal | [zerninv](https://github.com/zerninv) | uses Unsafe |
+|   | 00:03.824 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_gonix.java)| 21.0.1-open | [gonix](https://github.com/gonix) |  |
 |   | 00:03.854 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ebarlas.java)| 21.0.1-graal | [Elliot Barlas](https://github.com/ebarlas) | uses Unsafe |
 |   | 00:03.902 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jparera.java)| 21.0.1-open | [Juan Parera](https://github.com/jparera) |  |
-|   | 00:03.959 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_gonix.java)| 21.0.1-open | [gonix](https://github.com/gonix) |  |
 |   | 00:03.966 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jincongho.java)| 21.0.1-open | [Jin Cong Ho](https://github.com/jincongho) | uses Unsafe |
 |   | 00:04.066 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_JesseVanRooy.java)| 21.0.1-open | [JesseVanRooy](https://github.com/JesseVanRooy) | uses Unsafe |
 |   | 00:04.154 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_iziamos.java)| 21.0.1-open | [John Ziamos](https://github.com/iziamos) | uses Unsafe |
@@ -67,22 +68,24 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:05.142 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_arjenw.java)| 21.0.1-open | [Arjen Wisse](https://github.com/arjenw) |  |
 |   | 00:05.235 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_unbounded.java)| 21.0.1-open | [unbounded](https://github.com/unbounded) |  |
 |   | 00:05.336 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_plevart.java)| 21.0.1-tem | [Peter Levart](https://github.com/plevart) |  |
+|   | 00:05.400 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ianopolousfast.java)| 21.0.1-open | [Dr Ian Preston](https://github.com/ianopolousfast) |  |
 |   | 00:05.478 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_obourgain.java)| 21.0.1-open | [Olivier Bourgain](https://github.com/obourgain) | uses Unsafe |
+|   | 00:05.705 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_gamlerhart.java)| 21.0.1-open | [Roman Stoffel](https://github.com/gamlerhart) |  |
+|   | 00:05.709 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_armandino.java)| 21.0.2-graal | [Arman Sharif](https://github.com/armandino) | GraalVM native binary, uses Unsafe |
 |   | 00:05.887 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_charlibot.java)| 21.0.1-graal | [Charlie Evans](https://github.com/charlibot) | uses Unsafe |
 |   | 00:05.960 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_vaidhy.java)| 21.0.1-graal | [Vaidhy Mayilrangam](https://github.com/vaidhy) | uses Unsafe |
 |   | 00:05.979 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_spullara.java)| 21.0.1-graal | [Sam Pullara](https://github.com/spullara) |  |
 |   | 00:06.166 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_isolgpus.java)| 21.0.1-open | [Jamie Stansfield](https://github.com/isolgpus) |  |
 |   | 00:06.257 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_flippingbits.java)| 21.0.1-graal | [Stefan Sprenger](https://github.com/flippingbits) | uses Unsafe |
-|   | 00:06.415 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_armandino.java)| 21.0.1-open | [Arman Sharif](https://github.com/armandino) | uses Unsafe |
 |   | 00:06.576 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_as-com.java)| 21.0.1-open | [Andrew Sun](https://github.com/as-com) | uses Unsafe |
 |   | 00:06.635 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_justplainlaake.java)| 21.0.1-graal | [Laake Scates-Gervasi](https://github.com/justplainlaake) | GraalVM native binary, uses Unsafe |
 |   | 00:06.654 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jbachorik.java)| 21.0.1-graal | [Jaroslav Bachorik](https://github.com/jbachorik) |  |
-|   | 00:06.670 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ianopolousfast.java)| 21.0.1-open | [Dr Ian Preston](https://github.com/ianopolousfast) |  |
 |   | 00:06.715 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_algirdasrascius.java)| 21.0.1-open | [Algirdas Raščius](https://github.com/algirdasrascius) |  |
 |   | 00:07.240 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_giovannicuccu.java)| java | [giovannicuccu](https://github.com/giovannicuccu) |  |
 |   | 00:07.563 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_3j5a.java)| 21.0.1-graal | [3j5a](https://github.com/3j5a) |  |
 |   | 00:07.680 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_C5H12O5.java)| 21.0.1-graal | [Xylitol](https://github.com/C5H12O5) | uses Unsafe |
 |   | 00:07.730 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jotschi.java)| 21.0.1-open | [Johannes Schüth](https://github.com/jotschi) |  |
+|   | 00:07.894 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_tonivade.java)| 21.0.2-tem | [Antonio Muñoz](https://github.com/tonivade) |  |
 |   | 00:07.925 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ricardopieper.java)| 21.0.1-graal | [Ricardo Pieper](https://github.com/ricardopieper) |  |
 |   | 00:08.167 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ddimtirov.java)| 21.0.1-tem | [Dimitar Dimitrov](https://github.com/ddimtirov) |  |
 |   | 00:08.214 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_deemkeen.java)| 21.0.1-open | [deemkeen](https://github.com/deemkeen) |  |
@@ -92,7 +95,6 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:08.517 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ags313.java)| 21.0.1-graal | [ags](https://github.com/ags313) | uses Unsafe |
 |   | 00:08.557 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_adriacabeza.java)| 21.0.1-graal | [Adrià Cabeza](https://github.com/adriacabeza) |  |
 |   | 00:08.622 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_kuduwa-keshavram.java)| 21.0.1-graal | [Keshavram Kuduwa](https://github.com/kuduwa-keshavram) |  |
-|   | 00:08.689 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_gamlerhart.java)| 21.0.1-open | [Roman Stoffel](https://github.com/gamlerhart) |  |
 |   | 00:08.752 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_anitasv.java)| 21.0.1-graal | [Anita SV](https://github.com/anitasv) |  |
 |   | 00:08.892 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_fatroom.java)| 21.0.1-open | [Roman Romanchuk](https://github.com/fatroom) |  |
 |   | 00:09.020 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_yemreinci.java)| 21.0.1-open | [yemreinci](https://github.com/yemreinci) |  |
@@ -109,17 +111,17 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:11.167 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_palmr.java)| 21.0.1-open | [Nick Palmer](https://github.com/palmr) |  |
 |   | 00:11.352 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_karthikeyan97.java)| 21.0.1-open | [karthikeyan97](https://github.com/karthikeyan97) | uses Unsafe |
 |   | 00:11.405 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_imrafaelmerino.java)| 21.0.1-graal | [Rafael Merino García](https://github.com/imrafaelmerino) |  |
+|   | 00:11.406 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_gabrielfoo.java)| 21.0.1-graal | [gabrielfoo](https://github.com/gabrielfoo) |  |
 |   | 00:11.433 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jatingala.java)| 21.0.1-graal | [Jatin Gala](https://github.com/jatingala) |  |
+|   | 00:11.505 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_dmitry-midokura.java)| 21.0.1-open | [Dmitry Bufistov](https://github.com/dmitry-midokura) |  |
 |   | 00:11.805 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_coolmineman.java)| 21.0.1-graal | [Cool_Mineman](https://github.com/coolmineman) |  |
 |   | 00:11.934 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_arjenvaneerde.java)| 21.0.1-open | [arjenvaneerde](https://github.com/arjenvaneerde) |  |
-|   | 00:12.051 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_dmitry-midokura.java)| 21.0.1-open | [Dmitry Bufistov](https://github.com/dmitry-midokura) |  |
 |   | 00:12.220 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_richardstartin.java)| 21.0.1-open | [Richard Startin](https://github.com/richardstartin) |  |
 |   | 00:12.495 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_SamuelYvon.java)| 21.0.1-graal | [Samuel Yvon](https://github.com/SamuelYvon) | GraalVM native binary |
 |   | 00:12.568 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_MeanderingProgrammer.java)| 21.0.1-graal | [Vlad](https://github.com/MeanderingProgrammer) |  |
 |   | 00:12.800 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_yonatang.java)| java | [Yonatan Graber](https://github.com/yonatang) |  |
 |   | 00:13.013 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_thanhtrinity.java)| 21.0.1-graal | [Thanh Duong](https://github.com/thanhtrinity) |  |
 |   | 00:13.071 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ianopolous.java)| 21.0.1-open | [Dr Ian Preston](https://github.com/ianopolous) |  |
-|   | 00:13.498 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_tonivade.java)| 21.0.1-tem | [Antonio Muñoz](https://github.com/tonivade) |  |
 |   | 00:13.817 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_entangled90.java)| 21.0.1-open | [Carlo](https://github.com/entangled90) |  |
 |   | 00:14.502 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_eriklumme.java)| 21.0.1-graal | [eriklumme](https://github.com/eriklumme) |  |
 |   | 00:14.772 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_kevinmcmurtrie.java)| 21.0.1-open | [Kevin McMurtrie](https://github.com/kevinmcmurtrie) |  |
@@ -147,6 +149,7 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:26.500 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_felix19350.java)| 21.0.1-open | [Bruno Félix](https://github.com/felix19350) |  |
 |   | 00:28.381 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_bjhara.java)| 21.0.1-open | [Hampus](https://github.com/bjhara) |  |
 |   | 00:29.741 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_xpmatteo.java)| 21.0.1-open | [Matteo Vaccari](https://github.com/xpmatteo) |  |
+|   | 00:30.635 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_albertoventurini.java)| 21.0.1-open | [Alberto Venturini](https://github.com/albertoventurini) |  |
 |   | 00:32.018 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_padreati.java)| 21.0.1-open | [Aurelian Tutuianu](https://github.com/padreati) |  |
 |   | 00:34.388 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_twobiers.java)| 21.0.1-tem | [Tobi](https://github.com/twobiers) |  |
 |   | 00:35.875 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_MahmoudFawzyKhalil.java)| 21.0.1-open | [MahmoudFawzyKhalil](https://github.com/MahmoudFawzyKhalil) |  |

From 457a36be639aae2186e641b6db1090965d9951f2 Mon Sep 17 00:00:00 2001
From: Jason Nochlin <91577+hundredwatt@users.noreply.github.com>
Date: Fri, 26 Jan 2024 10:22:35 -0700
Subject: [PATCH 138/268] Fix hundredwatt's entry on 10k dataset (#558)

* Improve hash function

* remove limit on number of cores

* fix calculation of boundaries between chunks

* fix IOOBE

---------

Co-authored-by: Jason Nochlin <hundredwatt@users.noreply.github.com>
---
 .../onebrc/CalculateAverage_hundredwatt.java  | 21 ++++++++++++-------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_hundredwatt.java b/src/main/java/dev/morling/onebrc/CalculateAverage_hundredwatt.java
index 9d935ffce..24a173ae0 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_hundredwatt.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_hundredwatt.java
@@ -31,7 +31,7 @@
 public class CalculateAverage_hundredwatt {
     private static final String FILE = "./measurements.txt";
     private static final int MAX_ROW_SIZE = 100 + 1 + 5 + 1; // 100 for city name, 1 for ;, 5 for temperature, 1 for \n
-    private static final int THREAD_COUNT = Math.min(8, Runtime.getRuntime().availableProcessors());
+    private static final int THREAD_COUNT = Runtime.getRuntime().availableProcessors();
     private static final long BUFFER_SIZE = 128 * 1024 * 1024; // 128MB
     private static final long CHUNK_SIZE = BUFFER_SIZE / THREAD_COUNT;
     private static final long FILE_CHUNK_SIZE = CHUNK_SIZE - MAX_ROW_SIZE;
@@ -209,10 +209,10 @@ private static int processChunk(ByteBuffer bb, HashTable hashTable, long start,
         short temperature_value;
         int hashInt;
 
-        int i = 0;
+        int rc = 0;
         int end = (int) (size - MAX_ROW_SIZE);
-        while (position < end) {
-            i++;
+        while (position <= end) {
+            // rc++;
             offset = -1;
 
             // Parse city name
@@ -257,11 +257,11 @@ private static int processChunk(ByteBuffer bb, HashTable hashTable, long start,
 
             position = position + newlinePos / 8 + 2; // +1 for \n
 
-            hashInt = (int) (hash ^ (hash >> 32));
+            hashInt = (int) (hash ^ (hash >> 32) ^ (hash >> 17));
 
             hashTable.putOrMerge(hashInt, offset + 1, key, temperature_value);
         }
-        return i;
+        return rc;
     }
 
     public static void main(String[] args) throws IOException {
@@ -282,7 +282,7 @@ public static void main(String[] args) throws IOException {
                         byte[] trailing = new byte[MAX_ROW_SIZE * 2];
                         fileChannel.read(ByteBuffer.wrap(trailing), Math.max(0, fileSize - MAX_ROW_SIZE));
                         var rc = processChunk(ByteBuffer.wrap(trailing), hashTable, Math.max(0, fileSize - MAX_ROW_SIZE),
-                                MAX_ROW_SIZE + Math.min(fileSize, MAX_ROW_SIZE));
+                                MAX_ROW_SIZE + Math.min(fileSize, MAX_ROW_SIZE) - 1);
                         // rowCount.addAndGet(rc);
                         return hashTable;
 
@@ -292,11 +292,16 @@ public static void main(String[] args) throws IOException {
                     }
                 }
 
+                // if file is smaller than max row size, we're done b/c the trailing bytes handler processed the whole file
+                if (fileSize <= MAX_ROW_SIZE) {
+                    return hashTable;
+                }
+
                 while (start < fileSize) {
                     long end = Math.min(start + CHUNK_SIZE, fileSize);
                     MappedByteBuffer bb = null;
                     try {
-                        bb = fileChannel.map(FileChannel.MapMode.READ_ONLY, start, end - start);
+                        bb = fileChannel.map(FileChannel.MapMode.READ_ONLY, start, Math.min(end - start + 8, fileSize - start));
                     }
                     catch (IOException e) {
                         throw new RuntimeException(e);

From 22c188b148ac09d32a3dc34cb45d83695ae50a96 Mon Sep 17 00:00:00 2001
From: Gunnar Morling <gunnar.morling@googlemail.com>
Date: Fri, 26 Jan 2024 18:23:07 +0100
Subject: [PATCH 139/268] Leaderboard update

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index da6c33ee2..e43beb96c 100644
--- a/README.md
+++ b/README.md
@@ -54,7 +54,7 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:03.510 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_roman_r_m.java)| 21.0.1-graal | [Roman Musin](https://github.com/roman-r-m) | GraalVM native binary |
 |   | 00:03.518 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_JamalMulla.java)| 21.0.1-graal | [Jamal Mulla](https://github.com/JamalMulla) | GraalVM native binary, uses Unsafe |
 |   | 00:03.594 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_yavuztas.java)| 21.0.2-graal | [Yavuz Tas](https://github.com/yavuztas) | GraalVM native binary, uses Unsafe |
-|   | 00:03.714 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_hundredwatt.java)| 21.0.1-graal | [Jason Nochlin](https://github.com/hundredwatt) |  |
+|   | 00:03.698 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_hundredwatt.java)| 21.0.1-graal | [Jason Nochlin](https://github.com/hundredwatt) |  |
 |   | 00:03.718 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_zerninv.java)| 21.0.1-graal | [zerninv](https://github.com/zerninv) | uses Unsafe |
 |   | 00:03.824 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_gonix.java)| 21.0.1-open | [gonix](https://github.com/gonix) |  |
 |   | 00:03.854 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ebarlas.java)| 21.0.1-graal | [Elliot Barlas](https://github.com/ebarlas) | uses Unsafe |

From 769884426b3c204152b5cf2524acfd05d2e4d293 Mon Sep 17 00:00:00 2001
From: rcasteltrione <38444917+rcasteltrione@users.noreply.github.com>
Date: Sat, 27 Jan 2024 14:43:51 +0100
Subject: [PATCH 140/268] Initial submission (#588)

* Initial submission

* fixed not executable scripts
---
 calculate_average_rcasteltrione.sh            |  20 ++
 prepare_rcasteltrione.sh                      |  19 ++
 .../CalculateAverage_rcasteltrione.java       | 309 ++++++++++++++++++
 3 files changed, 348 insertions(+)
 create mode 100755 calculate_average_rcasteltrione.sh
 create mode 100755 prepare_rcasteltrione.sh
 create mode 100644 src/main/java/dev/morling/onebrc/CalculateAverage_rcasteltrione.java

diff --git a/calculate_average_rcasteltrione.sh b/calculate_average_rcasteltrione.sh
new file mode 100755
index 000000000..e68a2482b
--- /dev/null
+++ b/calculate_average_rcasteltrione.sh
@@ -0,0 +1,20 @@
+#!/bin/sh
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+# The JVM is started without a "time" prefix: time is a bash keyword, not a POSIX sh builtin, and may be missing under /bin/sh.
+JAVA_OPTS="--enable-preview"
+java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_rcasteltrione
diff --git a/prepare_rcasteltrione.sh b/prepare_rcasteltrione.sh
new file mode 100755
index 000000000..f83a3ff69
--- /dev/null
+++ b/prepare_rcasteltrione.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+source "$HOME/.sdkman/bin/sdkman-init.sh" # load SDKMAN! into this shell so that the sdk command below is available
+sdk use java 21.0.1-graal 1>&2 # select the JDK for this shell; sdk's stdout is redirected to stderr
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_rcasteltrione.java b/src/main/java/dev/morling/onebrc/CalculateAverage_rcasteltrione.java
new file mode 100644
index 000000000..d7d93e548
--- /dev/null
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_rcasteltrione.java
@@ -0,0 +1,309 @@
+/*
+ *  Copyright 2023 The original authors
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package dev.morling.onebrc;
+
+import java.io.IOException;
+import java.io.RandomAccessFile;
+import java.nio.MappedByteBuffer;
+import java.nio.channels.FileChannel;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.nio.file.StandardOpenOption;
+import java.time.Duration;
+import java.time.Instant;
+import java.util.*;
+
+import static java.util.stream.Collectors.toMap;
+
+//baseline: 266s
+
+public class CalculateAverage_rcasteltrione {
+    private static final String FILE = "./measurements.txt";
+    // private static final String FILE = "./backup/measurements.txt";
+
+    /**
+     * Splits the input file into segments, aggregates each segment on its own platform thread and
+     * prints the merged per-station statistics, sorted by station name, to standard output.
+     */
+    public static void main(String[] args) throws IOException, InterruptedException {
+        Path path = Paths.get(FILE);
+        var segList = FileSegment.forFile(path, Runtime.getRuntime().availableProcessors());
+        var results = new ByteArrayToMeasurementMap[segList.size()];
+        var threads = new Thread[segList.size()];
+        try (var channel = FileChannel.open(path, StandardOpenOption.READ)) {
+            for (int i = 0; i < threads.length; i++) {
+                int segmentIdx = i; // a lambda can only capture effectively final locals
+                FileSegment fileSegment = segList.get(segmentIdx);
+                threads[i] = Thread.ofPlatform().start(() -> results[segmentIdx] = processSegment(channel, fileSegment));
+            }
+            // Joining inside the try block keeps the channel open while the workers use it.
+            for (Thread thread : threads) {
+                thread.join();
+            }
+        }
+        // toMap merges the per-segment statistics of a station; the TreeMap keeps names sorted.
+        Map<String, Measurement> aggregatedMap = Arrays.stream(results)
+                .flatMap(m -> m.entries().stream())
+                .collect(toMap(
+                        ByteArrayToMeasurementMap.Entry::key,
+                        ByteArrayToMeasurementMap.Entry::value,
+                        Measurement::merge,
+                        TreeMap::new));
+        System.out.println(aggregatedMap);
+    }
+
+    /**
+     * Aggregates every measurement line of one file segment into a fresh map. The segment is copied
+     * chunk by chunk into a line buffer; each pass parses the complete lines of its chunk and the
+     * next pass resumes right after the last newline. An unterminated final line is parsed as well.
+     *
+     * @throws IllegalStateException if a single line does not fit into the line buffer
+     */
+    private static ByteArrayToMeasurementMap processSegment(FileChannel channel, FileSegment seg) {
+        try {
+            MappedByteBuffer mbb = channel.map(FileChannel.MapMode.READ_ONLY, seg.start(), seg.size());
+            byte b;
+            var result = new ByteArrayToMeasurementMap();
+            var lineBuffer = new byte[1 << 13];
+            var segmentPosition = mbb.position();
+            var limit = mbb.limit();
+            while (segmentPosition < limit) {
+                int chunk = Math.min(limit - segmentPosition, lineBuffer.length);
+                mbb.get(segmentPosition, lineBuffer, 0, chunk);
+                // Find the last newline of THIS chunk. The offset is recomputed on every pass so that a
+                // chunk without any newline can never reuse the offset found in an earlier chunk.
+                int lastLineOffset = chunk - 1;
+                while (lastLineOffset >= 0 && lineBuffer[lastLineOffset] != '\n') {
+                    lastLineOffset--;
+                }
+                if (lastLineOffset < 0) {
+                    if (chunk == lineBuffer.length) {
+                        throw new IllegalStateException("Line at offset " + (seg.start() + segmentPosition) + " exceeds " + chunk + " bytes");
+                    }
+                    lineBuffer[chunk] = '\n'; // end of segment without newline: terminate the line in the buffer
+                    lastLineOffset = chunk;
+                }
+                for (int lineBufferOffset = 0; lineBufferOffset < lastLineOffset;) {
+                    int nameHash = 0;
+                    int nameLength = 0;
+                    int nameStart = lineBufferOffset;
+                    while ((b = lineBuffer[lineBufferOffset++]) != ';') {
+                        nameHash = 31 * nameHash + b;
+                        nameLength++;
+                    }
+                    int negative = 1;
+                    if (lineBuffer[lineBufferOffset] == '-') {
+                        lineBufferOffset++;
+                        negative = -1;
+                    }
+                    // One fractional digit and one or two integer digits ("d.d" or "dd.d"), kept as integer tenths.
+                    int temp;
+                    if (lineBuffer[lineBufferOffset + 1] == '.') {
+                        temp = (lineBuffer[lineBufferOffset] - '0') * 10 + (lineBuffer[lineBufferOffset + 2] - '0');
+                        lineBufferOffset += 3;
+                    }
+                    else {
+                        temp = (lineBuffer[lineBufferOffset] - '0') * 100
+                                + (lineBuffer[lineBufferOffset + 1] - '0') * 10
+                                + (lineBuffer[lineBufferOffset + 3] - '0');
+                        lineBufferOffset += 4;
+                    }
+                    if (lineBuffer[lineBufferOffset] == '\r') {
+                        lineBufferOffset++; // tolerate CRLF line endings
+                    }
+                    lineBufferOffset++; // step over the '\n'
+                    result.mergeOrCreate(lineBuffer, nameStart, nameLength, nameHash, temp * negative);
+                }
+                segmentPosition += lastLineOffset + 1;
+            }
+            return result;
+        }
+        catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    record FileSegment(long start, long size) {
+        /** Splits the file into newline-aligned segments small enough for FileChannel.map: one if the file is under 1,000,000 bytes, else desiredSegmentsCount or more. */
+        public static List<FileSegment> forFile(Path file, int desiredSegmentsCount) throws IOException {
+            try (var raf = new RandomAccessFile(file.toFile(), "r")) {
+                var fileSize = raf.length();
+                if (fileSize < 1000000) {
+                    return Collections.singletonList(new FileSegment(0, fileSize));
+                }
+                long maxSegmentSize = Integer.MAX_VALUE - (1L << 20); // FileChannel.map limit, minus 1 MiB of slack for newline alignment
+                int segmentsCount = (int) Math.max(desiredSegmentsCount, (fileSize + maxSegmentSize - 1) / maxSegmentSize);
+                var segmentSize = fileSize / segmentsCount;
+                var segments = new ArrayList<FileSegment>(segmentsCount);
+                for (int segmentIdx = 0; segmentIdx < segmentsCount; segmentIdx++) {
+                    var rawStart = segmentIdx * segmentSize;
+                    var rawEnd = (segmentIdx == segmentsCount - 1) ? fileSize : rawStart + segmentSize;
+                    var segStart = findSegmentBoundary(raf, segmentIdx, 0, rawStart, rawEnd);
+                    var segEnd = findSegmentBoundary(raf, segmentIdx, segmentsCount - 1, rawEnd, fileSize);
+                    segments.add(new FileSegment(segStart, segEnd - segStart));
+                }
+                return segments;
+            }
+        }
+
+        // Returns the offset just past the first newline at or after location (the search stops at fileSize); segment skipForSegment keeps location unchanged.
+        private static long findSegmentBoundary(RandomAccessFile raf, int i, int skipForSegment, long location, long fileSize) throws IOException {
+            if (i == skipForSegment) return location;
+            raf.seek(location);
+            while (location < fileSize) {
+                location++;
+                if (raf.read() == '\n') break;
+            }
+            return location;
+        }
+    }
+
+    /** Running min/max/sum/count of one station; round() implies the values are tenths of a degree. */
+    static class Measurement {
+        int min, max, n;
+        long sum;
+
+        /** Creates the statistics of a station from its first reading. */
+        public Measurement(int temp) {
+            min = max = temp;
+            sum = temp;
+            n = 1;
+        }
+
+        /** Folds other into this instance and returns this instance. */
+        final Measurement merge(Measurement other) {
+            this.min = Math.min(other.min, this.min);
+            this.max = Math.max(other.max, this.max);
+            this.sum += other.sum;
+            this.n += other.n;
+            return this;
+        }
+
+        /** Formats as min/mean/max. Plain concatenation is used because string templates (STR) were only a preview feature and were removed again in JDK 23. */
+        @Override
+        public String toString() {
+            return round(min) + "/" + round((double) sum / n) + "/" + round(max);
+        }
+
+        // Rounds a value given in tenths to the nearest integer and converts it to whole units.
+        double round(double v) {
+            return Math.round(v) / 10.0;
+        }
+    }
+
+    /**
+     * Open-addressing hash map with linear probing, keyed by the raw bytes of a station name.
+     * The table length is always a power of two so a bit mask can stand in for the modulo, and
+     * the table is doubled once the entry count exceeds LOAD_FACTOR times its length.
+     */
+    static class ByteArrayToMeasurementMap {
+
+        public static final int DEFAULT_CAPACITY = 1024;
+        public static final float LOAD_FACTOR = 0.75f;
+        MeasurementSlot[] slots = new MeasurementSlot[DEFAULT_CAPACITY];
+        int threshold = (int) (DEFAULT_CAPACITY * LOAD_FACTOR);
+        int size = 0;
+
+        // city is the key decoded as UTF-8 once, at insertion time.
+        private record MeasurementSlot(int hash, byte[] key, String city, Measurement measurement) {
+        }
+
+        /**
+         * Adds a reading to the station named by the nameLength bytes of line starting at nameStart,
+         * creating the entry on first sight. The name bytes are copied, so line may be reused.
+         *
+         * @param hash        hash of the name bytes; it selects the first slot that is probed
+         * @param temperature reading to fold into the station's min, max, sum and count
+         */
+        public final void mergeOrCreate(byte[] line, int nameStart, int nameLength, int hash, int temperature) {
+            int hashMask = slots.length - 1;
+            int idx = hash & hashMask;
+            MeasurementSlot slot;
+            while ((slot = slots[idx]) != null) {
+                if (slot.hash == hash && arrayEquals(slot.key, line, nameStart, nameLength)) {
+                    Measurement value = slot.measurement;
+                    value.min = Math.min(value.min, temperature);
+                    value.max = Math.max(value.max, temperature);
+                    value.sum += temperature;
+                    value.n++;
+                    return;
+                }
+                idx = (idx + 1) & hashMask; // occupied by another station: probe the next slot
+            }
+
+            // An empty slot ends the probe sequence, so the station is new.
+            size++;
+            if (size > threshold) {
+                idx = resize(hash); // the table was rebuilt; insert at the index it reports
+            }
+            byte[] nameBuffer = new byte[nameLength];
+            System.arraycopy(line, nameStart, nameBuffer, 0, nameLength);
+            slots[idx] = new MeasurementSlot(hash, nameBuffer, new String(nameBuffer, StandardCharsets.UTF_8), new Measurement(temperature));
+        }
+
+        /** Doubles the table, re-inserts every slot and returns the free index at which hash belongs. */
+        private int resize(int hash) {
+            var newSlots = new MeasurementSlot[slots.length << 1];
+            for (MeasurementSlot oldSlot : slots) {
+                if (oldSlot != null) {
+                    newSlots[firstFreeIndex(newSlots, oldSlot.hash)] = oldSlot;
+                }
+            }
+            slots = newSlots;
+            threshold = (int) (newSlots.length * LOAD_FACTOR);
+            return firstFreeIndex(newSlots, hash);
+        }
+
+        // Walks the probe sequence of hash in table and returns its first empty index.
+        private static int firstFreeIndex(MeasurementSlot[] table, int hash) {
+            int mask = table.length - 1;
+            int idx = hash & mask;
+            while (table[idx] != null) {
+                idx = (idx + 1) & mask;
+            }
+            return idx;
+        }
+
+        // True when storedKey holds exactly the nameLength bytes of line that start at nameStart.
+        private boolean arrayEquals(byte[] storedKey, byte[] line, int nameStart, int nameLength) {
+            if (storedKey.length != nameLength) {
+                return false;
+            }
+            for (int i = 0; i < storedKey.length; i++) {
+                if (storedKey[i] != line[nameStart + i]) {
+                    return false;
+                }
+            }
+            return true;
+        }
+
+        /** Returns a new list with one (decoded name, statistics) entry per station; the statistics objects are shared, not copied. */
+        public final List<Entry> entries() {
+            var result = new ArrayList<Entry>(size);
+            for (MeasurementSlot slot : slots) {
+                if (slot != null) {
+                    result.add(new Entry(slot.city, slot.measurement));
+                }
+            }
+            return result;
+        }
+
+        public record Entry(String key, Measurement value) {
+        }
+    }
+
+}

From 5092eb44d1962671b57dcbdc65530e598ca38853 Mon Sep 17 00:00:00 2001
From: Hieu Dao Quang <63568218+dqhieuu@users.noreply.github.com>
Date: Sat, 27 Jan 2024 20:49:59 +0700
Subject: [PATCH 141/268] First attempt with Java-managed concurrency (#590)

Co-authored-by: Quang Hieu Dao <hieu_dq@flinters.vn>
---
 calculate_average_dqhieuu.sh                  |  19 +++
 prepare_dqhieuu.sh                            |  20 +++
 .../onebrc/CalculateAverage_dqhieuu.java      | 117 ++++++++++++++++++
 3 files changed, 156 insertions(+)
 create mode 100755 calculate_average_dqhieuu.sh
 create mode 100755 prepare_dqhieuu.sh
 create mode 100644 src/main/java/dev/morling/onebrc/CalculateAverage_dqhieuu.java

diff --git a/calculate_average_dqhieuu.sh b/calculate_average_dqhieuu.sh
new file mode 100755
index 000000000..f0d7fd7ad
--- /dev/null
+++ b/calculate_average_dqhieuu.sh
@@ -0,0 +1,19 @@
+#!/bin/sh
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+JAVA_OPTS="" # no extra JVM flags for this entry
+java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_dqhieuu # $JAVA_OPTS is unquoted so that an empty value adds no argument
diff --git a/prepare_dqhieuu.sh b/prepare_dqhieuu.sh
new file mode 100755
index 000000000..4cda7b411
--- /dev/null
+++ b/prepare_dqhieuu.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+# Uncomment below to use sdk
+# source "$HOME/.sdkman/bin/sdkman-init.sh"
+# sdk use java 21.0.1-graal 1>&2
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_dqhieuu.java b/src/main/java/dev/morling/onebrc/CalculateAverage_dqhieuu.java
new file mode 100644
index 000000000..8c155773e
--- /dev/null
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_dqhieuu.java
@@ -0,0 +1,117 @@
+/*
+ *  Copyright 2023 The original authors
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package dev.morling.onebrc;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Paths;
+import java.util.Map;
+import java.util.TreeMap;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.locks.Lock;
+import java.util.concurrent.locks.ReentrantLock;
+
+public class CalculateAverage_dqhieuu {
+    private static final String FILE = "measurements.txt";
+
+    // Rounds to one decimal place.
+    private static double round(double value) {
+        return Math.round(value * 10.0) / 10.0;
+    }
+
+    /** Running statistics of one station; the fields may only be touched while lock is held. */
+    private static class MeasurementAggregator {
+        private final Lock lock = new ReentrantLock();
+        private double min = Double.POSITIVE_INFINITY;
+        private double max = Double.NEGATIVE_INFINITY;
+        private double sum = 0;
+        private int count = 0;
+
+        /** Folds one reading into the statistics; may be called from several threads at once. */
+        private void add(double value) {
+            lock.lock();
+            try {
+                if (value < min) {
+                    min = value;
+                }
+                if (value > max) {
+                    max = value;
+                }
+                sum += value;
+                count++;
+            }
+            finally {
+                lock.unlock(); // always released, even if the update throws
+            }
+        }
+
+        @Override
+        public String toString() {
+            return round(min) + "/" + round(round(sum) / count) + "/" + round(max);
+        }
+    }
+
+    /**
+     * Reads the measurements file on a parallel line stream, aggregates min, mean and max per
+     * station and prints the stations sorted by name as {name=min/mean/max, ...}.
+     *
+     * @throws IOException if the measurements file cannot be opened
+     */
+    public static void main(String[] args) throws IOException {
+        Map<String, MeasurementAggregator> measurements = new ConcurrentHashMap<>(10_000);
+
+        // Files.lines keeps the file open until the stream is closed, hence try-with-resources.
+        try (var lineStream = Files.lines(Paths.get(FILE))) {
+            lineStream.parallel().forEach(l -> {
+                var sepIdx = 0;
+                while (l.charAt(sepIdx) != ';') {
+                    sepIdx++;
+                }
+                var station = l.substring(0, sepIdx);
+
+                // Accumulate the digits as an integer number of tenths; '-' and '.' are skipped, the sign is applied afterwards.
+                int valueInt = 0;
+                int sign = l.charAt(sepIdx + 1) == '-' ? -1 : 1;
+                var lineLength = l.length();
+                for (var i = sepIdx + 1; i < lineLength; i++) {
+                    var c = l.charAt(i);
+                    if (c == '-' || c == '.') {
+                        continue;
+                    }
+                    valueInt = valueInt * 10 + (c - '0');
+                }
+                var value = ((double) valueInt / 10.0) * sign;
+
+                measurements.computeIfAbsent(station, k -> new MeasurementAggregator()).add(value);
+            });
+        }
+
+        // A TreeMap orders the stations by name for the output.
+        var res = new StringBuilder("{");
+        var first = true;
+        for (var entry : new TreeMap<>(measurements).entrySet()) {
+            if (first) {
+                first = false;
+            }
+            else {
+                res.append(", ");
+            }
+            res.append(entry.getKey()).append('=').append(entry.getValue());
+        }
+        res.append("}");
+        System.out.println(res);
+    }
+}

From c228633b5753e2565e93833cf0ef8af23c66ac77 Mon Sep 17 00:00:00 2001
From: Van Phu DO <abeobk@gmail.com>
Date: Sat, 27 Jan 2024 22:54:43 +0900
Subject: [PATCH 142/268] improve hard disk access locality, another 8% (#591)

* improve hard disk access locality, another 8%

* add some comments & credit

* fixed format
---
 .../onebrc/CalculateAverage_abeobk.java       | 327 +++++++++---------
 1 file changed, 172 insertions(+), 155 deletions(-)

diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java b/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java
index ed859f3df..06cbc1748 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java
@@ -28,18 +28,21 @@
 import java.util.Arrays;
 import java.util.List;
 import java.util.TreeMap;
+import java.util.concurrent.atomic.AtomicInteger;
 import java.util.stream.IntStream;
 
 import sun.misc.Unsafe;
 
 public class CalculateAverage_abeobk {
     private static final boolean SHOW_ANALYSIS = false;
+    private static final int CPU_CNT = Runtime.getRuntime().availableProcessors();
 
     private static final String FILE = "./measurements.txt";
     private static final int BUCKET_SIZE = 1 << 16;
     private static final int BUCKET_MASK = BUCKET_SIZE - 1;
     private static final int MAX_STR_LEN = 100;
     private static final int MAX_STATIONS = 10000;
+    private static final long CHUNK_SZ = 1 << 22; // 4MB chunk
     private static final Unsafe UNSAFE = initUnsafe();
     private static final long[] HASH_MASKS = new long[]{
             0x0L,
@@ -52,6 +55,11 @@ public class CalculateAverage_abeobk {
             0xffffffffffffffL,
             0xffffffffffffffffL, };
 
+    private static AtomicInteger chunk_id = new AtomicInteger(0);
+    private static int chunk_cnt;
+    private static long start_addr, end_addr;
+    private static Stat[][] all_res;
+
     private static final void debug(String s, Object... args) {
         System.out.println(String.format(s, args));
     }
@@ -153,20 +161,6 @@ final boolean contentEquals(long other_addr, long other_word0, long other_tail,
         }
     }
 
-    // split into chunks
-    static long[] slice(long start_addr, long end_addr, long chunk_size, int cpu_cnt) {
-        long[] ptrs = new long[cpu_cnt + 1];
-        ptrs[0] = start_addr;
-        for (int i = 1; i < cpu_cnt; i++) {
-            long addr = start_addr + i * chunk_size;
-            while (addr < end_addr && UNSAFE.getByte(addr++) != '\n')
-                ;
-            ptrs[i] = Math.min(addr, end_addr);
-        }
-        ptrs[cpu_cnt] = end_addr;
-        return ptrs;
-    }
-
     // idea from royvanrijn
     static final long getSemiPosCode(final long word) {
         long xor_semi = word ^ 0x3b3b3b3b3b3b3b3bL; // xor with ;;;;;;;;
@@ -189,123 +183,158 @@ static final short parseNum(long num_word, int dot_pos) {
         return (short) ((abs_val ^ signed) - signed);
     }
 
-    // optimize for contest
-    // save as much slow memory access as possible
-    // about 50% key < 8chars, 25% key bettween 8-10 chars
-    // keylength histogram (%) = [0, 0, 0, 0, 4, 10, 21, 15, 13, 11, 6, 6, 4, 2...
-    static final Node[] parse(int thread_id, long start, long end) {
-        int cls = 0;
-        long addr = start;
-        var map = new Node[BUCKET_SIZE + MAX_STATIONS]; // extra space for collisions
-        // parse loop
-        while (addr < end) {
-            long row_addr = addr;
-
-            long word0 = UNSAFE.getLong(addr);
-            long semipos_code = getSemiPosCode(word0);
-
-            // about 50% chance key < 8 chars
-            if (semipos_code != 0) {
-                int semi_pos = Long.numberOfTrailingZeros(semipos_code) >>> 3;
-                addr += semi_pos + 1;
-                long num_word = UNSAFE.getLong(addr);
-                int dot_pos = Long.numberOfTrailingZeros(~num_word & 0x10101000);
-                addr += (dot_pos >>> 3) + 3;
-
-                long tail = word0 & HASH_MASKS[semi_pos];
-                int bucket = xxh32(tail) & BUCKET_MASK;
-                short val = parseNum(num_word, dot_pos);
-
-                while (true) {
-                    var node = map[bucket];
-                    if (node == null) {
-                        map[bucket] = new Node(row_addr, tail, val);
-                        break;
+    // Thread pool worker
+    static final class Worker extends Thread {
+        final int thread_id;
+
+        Worker(int i) {
+            thread_id = i;
+            this.start();
+        }
+
+        @Override
+        public void run() {
+            var map = new Node[BUCKET_SIZE + MAX_STATIONS]; // extra space for collisions
+            int cnt = 0;
+            int id;
+            int cls = 0;
+
+            // process in small chunks to maintain disk locality (artsiomkorzun trick)
+            // but keep going instead of merging
+            while ((id = chunk_id.getAndIncrement()) < chunk_cnt) {
+                long addr = start_addr + id * CHUNK_SZ;
+                long end = Math.min(addr + CHUNK_SZ, end_addr);
+                // adjust start
+                if (id > 0) {
+                    while (UNSAFE.getByte(addr++) != '\n')
+                        ;
+                }
+
+                // parse loop
+                // optimize for contest
+                // save as much slow memory access as possible
+                // about 50% key < 8chars, 25% key between 8-10 chars
+                // keylength histogram (%) = [0, 0, 0, 0, 4, 10, 21, 15, 13, 11, 6, 6, 4, 2...
+                while (addr < end) {
+                    long row_addr = addr;
+
+                    long word0 = UNSAFE.getLong(addr);
+                    long semipos_code = getSemiPosCode(word0);
+
+                    // about 50% chance key < 8 chars
+                    if (semipos_code != 0) {
+                        int semi_pos = Long.numberOfTrailingZeros(semipos_code) >>> 3;
+                        addr += semi_pos + 1;
+                        long num_word = UNSAFE.getLong(addr);
+                        int dot_pos = Long.numberOfTrailingZeros(~num_word & 0x10101000);
+                        addr += (dot_pos >>> 3) + 3;
+
+                        long tail = word0 & HASH_MASKS[semi_pos];
+                        int bucket = xxh32(tail) & BUCKET_MASK;
+                        short val = parseNum(num_word, dot_pos);
+
+                        while (true) {
+                            var node = map[bucket];
+                            if (node == null) {
+                                map[bucket] = new Node(row_addr, tail, val);
+                                cnt++;
+                                break;
+                            }
+                            if (node.tail == tail) {
+                                node.add(val);
+                                break;
+                            }
+                            bucket++;
+                            if (SHOW_ANALYSIS)
+                                cls++;
+                        }
+                        continue;
                     }
-                    if (node.tail == tail) {
-                        node.add(val);
-                        break;
+
+                    addr += 8;
+                    long word = UNSAFE.getLong(addr);
+                    semipos_code = getSemiPosCode(word);
+                    // 43% chance
+                    if (semipos_code != 0) {
+                        int semi_pos = Long.numberOfTrailingZeros(semipos_code) >>> 3;
+                        addr += semi_pos + 1;
+                        long num_word = UNSAFE.getLong(addr);
+                        int dot_pos = Long.numberOfTrailingZeros(~num_word & 0x10101000);
+                        addr += (dot_pos >>> 3) + 3;
+
+                        long tail = (word & HASH_MASKS[semi_pos]);
+                        int bucket = xxh32(word0 ^ tail) & BUCKET_MASK;
+                        short val = parseNum(num_word, dot_pos);
+
+                        while (true) {
+                            var node = map[bucket];
+                            if (node == null) {
+                                map[bucket] = new Node(row_addr, word0, tail, val);
+                                cnt++;
+                                break;
+                            }
+                            if (node.word0 == word0 && node.tail == tail) {
+                                node.add(val);
+                                break;
+                            }
+                            bucket++;
+                            if (SHOW_ANALYSIS)
+                                cls++;
+                        }
+                        continue;
                     }
-                    bucket++;
-                    if (SHOW_ANALYSIS)
-                        cls++;
-                }
-                continue;
-            }
 
-            addr += 8;
-            long word = UNSAFE.getLong(addr);
-            semipos_code = getSemiPosCode(word);
-            // 43% chance
-            if (semipos_code != 0) {
-                int semi_pos = Long.numberOfTrailingZeros(semipos_code) >>> 3;
-                addr += semi_pos + 1;
-                long num_word = UNSAFE.getLong(addr);
-                int dot_pos = Long.numberOfTrailingZeros(~num_word & 0x10101000);
-                addr += (dot_pos >>> 3) + 3;
-
-                long tail = (word & HASH_MASKS[semi_pos]);
-                int bucket = xxh32(word0 ^ tail) & BUCKET_MASK;
-                short val = parseNum(num_word, dot_pos);
-
-                while (true) {
-                    var node = map[bucket];
-                    if (node == null) {
-                        map[bucket] = new Node(row_addr, word0, tail, val);
-                        break;
+                    // why not going for more? tested, slower
+                    long hash = word0;
+                    while (semipos_code == 0) {
+                        hash ^= word;
+                        addr += 8;
+                        word = UNSAFE.getLong(addr);
+                        semipos_code = getSemiPosCode(word);
                     }
-                    if (node.word0 == word0 && node.tail == tail) {
-                        node.add(val);
-                        break;
+
+                    int semi_pos = Long.numberOfTrailingZeros(semipos_code) >>> 3;
+                    addr += semi_pos;
+                    int keylen = (int) (addr - row_addr);
+                    long num_word = UNSAFE.getLong(addr + 1);
+                    int dot_pos = Long.numberOfTrailingZeros(~num_word & 0x10101000);
+                    addr += (dot_pos >>> 3) + 4;
+
+                    long tail = (word & HASH_MASKS[semi_pos]);
+                    int bucket = xxh32(hash ^ tail) & BUCKET_MASK;
+                    short val = parseNum(num_word, dot_pos);
+
+                    while (true) {
+                        var node = map[bucket];
+                        if (node == null) {
+                            map[bucket] = new Node(row_addr, word0, tail, val);
+                            cnt++;
+                            break;
+                        }
+                        if (node.contentEquals(row_addr, word0, tail, keylen)) {
+                            node.add(val);
+                            break;
+                        }
+                        bucket++;
+                        if (SHOW_ANALYSIS)
+                            cls++;
                     }
-                    bucket++;
-                    if (SHOW_ANALYSIS)
-                        cls++;
                 }
-                continue;
             }
 
-            // why not going for more? tested, slower
-
-            long hash = word0;
-            while (semipos_code == 0) {
-                hash ^= word;
-                addr += 8;
-                word = UNSAFE.getLong(addr);
-                semipos_code = getSemiPosCode(word);
+            if (SHOW_ANALYSIS) {
+                debug("Thread %d collision = %d", thread_id, cls);
             }
 
-            int semi_pos = Long.numberOfTrailingZeros(semipos_code) >>> 3;
-            addr += semi_pos;
-            int keylen = (int) (addr - row_addr);
-            long num_word = UNSAFE.getLong(addr + 1);
-            int dot_pos = Long.numberOfTrailingZeros(~num_word & 0x10101000);
-            addr += (dot_pos >>> 3) + 4;
-
-            long tail = (word & HASH_MASKS[semi_pos]);
-            int bucket = xxh32(hash ^ tail) & BUCKET_MASK;
-            short val = parseNum(num_word, dot_pos);
-
-            while (true) {
-                var node = map[bucket];
-                if (node == null) {
-                    map[bucket] = new Node(row_addr, word0, tail, val);
-                    break;
-                }
-                if (node.contentEquals(row_addr, word0, tail, keylen)) {
-                    node.add(val);
-                    break;
+            Stat[] stats = new Stat[cnt];
+            int i = 0;
+            for (var node : map) {
+                if (node != null) {
+                    stats[i++] = new Stat(node);
                 }
-                bucket++;
-                if (SHOW_ANALYSIS)
-                    cls++;
             }
+            all_res[thread_id] = stats;
         }
-
-        if (SHOW_ANALYSIS) {
-            debug("Thread %d collision = %d", thread_id, cls);
-        }
-        return map;
     }
 
     // thomaswue trick
@@ -329,44 +358,32 @@ public static void main(String[] args) throws InterruptedException, IOException
             return;
         }
 
-        try (var file = FileChannel.open(Path.of(FILE), StandardOpenOption.READ)) {
-            long start_addr = file.map(MapMode.READ_ONLY, 0, file.size(), Arena.global()).address();
-            long file_size = file.size();
-            long end_addr = start_addr + file_size;
-
-            // only use all cpus on large file
-            int cpu_cnt = file_size < 1e6 ? 1 : Runtime.getRuntime().availableProcessors();
-            long chunk_size = Math.ceilDiv(file_size, cpu_cnt);
-
-            // processing
-            var ptrs = slice(start_addr, end_addr, chunk_size, cpu_cnt);
-
-            List<List<Stat>> maps = IntStream.range(0, cpu_cnt)
-                    .mapToObj(thread_id -> parse(thread_id, ptrs[thread_id], ptrs[thread_id + 1]))
-                    .map(map -> {
-                        List<Stat> stats = new ArrayList<>();
-                        for (var node : map) {
-                            if (node == null)
-                                continue;
-                            stats.add(new Stat(node));
-                        }
-                        return stats;
-                    })
-                    .parallel()
-                    .toList();
-
-            TreeMap<String, Stat> ms = new TreeMap<>();
-            for (var stats : maps) {
-                for (var s : stats) {
-                    var stat = ms.putIfAbsent(s.key, s);
-                    if (stat != null)
-                        stat.node.merge(s.node);
-                }
+        var file = FileChannel.open(Path.of(FILE), StandardOpenOption.READ);
+        long file_size = file.size();
+        start_addr = file.map(MapMode.READ_ONLY, 0, file.size(), Arena.global()).address();
+        end_addr = start_addr + file_size;
+
+        // only use all cpus on large file
+        int cpu_cnt = file_size < 1e6 ? 1 : CPU_CNT;
+        chunk_cnt = (int) Math.ceilDiv(file_size, CHUNK_SZ);
+        all_res = new Stat[cpu_cnt][];
+
+        List<Worker> workers = IntStream.range(0, cpu_cnt).mapToObj(i -> new Worker(i)).toList();
+        for (var w : workers)
+            w.join();
+
+        // collect all results
+        TreeMap<String, Stat> ms = new TreeMap<>();
+        for (var res : all_res) {
+            for (var s : res) {
+                var stat = ms.putIfAbsent(s.key, s);
+                if (stat != null)
+                    stat.node.merge(s.node);
             }
-
-            // print result
-            System.out.println(ms);
-            System.out.close();
         }
+
+        // print output
+        System.out.println(ms);
+        System.out.close();
     }
 }
\ No newline at end of file

From f9c58414da2c647e6d7df5cf49aa21cd30684f38 Mon Sep 17 00:00:00 2001
From: Roman Musin <995612+roman-r-m@users.noreply.github.com>
Date: Sat, 27 Jan 2024 14:17:55 +0000
Subject: [PATCH 143/268] Next version (#596)

* cleanup prepare script

* native image options

* fix quadratic probing (no change to perf)

* mask to get the last chunk of the name

* extract hash functions

* tweak the probing loop (-100ms)

* fiddle with native image options

* Reorder conditions in hope it makes branch predictor happier

* extracted constant
---
 calculate_average_roman-r-m.sh                | 21 ++---
 prepare_roman-r-m.sh                          |  2 +
 .../onebrc/CalculateAverage_roman_r_m.java    | 78 ++++++++++++-------
 3 files changed, 59 insertions(+), 42 deletions(-)

diff --git a/calculate_average_roman-r-m.sh b/calculate_average_roman-r-m.sh
index acf9864ca..5ba132f12 100755
--- a/calculate_average_roman-r-m.sh
+++ b/calculate_average_roman-r-m.sh
@@ -15,21 +15,16 @@
 #  limitations under the License.
 #
 
-JAVA_OPTS="--enable-preview -XX:+UseTransparentHugePages"
-
-# epsilon GC needs enough memory or it makes things worse
-# see https://stackoverflow.com/questions/58087596/why-are-repeated-memory-allocations-observed-to-be-slower-using-epsilon-vs-g1
-JAVA_OPTS="$JAVA_OPTS -XX:+UnlockExperimentalVMOptions -XX:-EnableJVMCI -XX:+UseEpsilonGC -Xmx1G -Xms1G -XX:+AlwaysPreTouch"
-
 if [ -f target/CalculateAverage_roman_r_m_image ]; then
-    echo "Picking up existing native image 'target/CalculateAverage_roman_r_m_image', delete the file to select JVM mode." 1>&2
+    echo "Running native image 'target/CalculateAverage_roman_r_m_image'." 1>&2
     target/CalculateAverage_roman_r_m_image
 else
-    JAVA_OPTS="--enable-preview -XX:+UnlockExperimentalVMOptions -XX:+TrustFinalNonStaticFields -dsa -XX:+UseNUMA"
-    if [[ ! "$(uname -s)" = "Darwin" ]]; then
-        # On OS/X, my machine, this errors:
-        JAVA_OPTS="$JAVA_OPTS -XX:+UseTransparentHugePages"
-    fi
-    echo "Choosing to run the app in JVM mode as no native image was found, use additional_build_step_roman_r_m.sh to generate." 1>&2
+    JAVA_OPTS="--enable-preview -XX:+UseTransparentHugePages"
+    JAVA_OPTS="$JAVA_OPTS -XX:+UnlockExperimentalVMOptions -XX:+TrustFinalNonStaticFields -dsa -XX:+UseNUMA"
+    # epsilon GC needs enough memory or it makes things worse
+    # see https://stackoverflow.com/questions/58087596/why-are-repeated-memory-allocations-observed-to-be-slower-using-epsilon-vs-g1
+    JAVA_OPTS="$JAVA_OPTS -XX:+UnlockExperimentalVMOptions -XX:-EnableJVMCI -XX:+UseEpsilonGC -Xmx1G -Xms1G -XX:+AlwaysPreTouch"
+
+    echo "Running on JVM" 1>&2
     java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_roman_r_m
 fi
diff --git a/prepare_roman-r-m.sh b/prepare_roman-r-m.sh
index a0593b2cf..dcd5500df 100755
--- a/prepare_roman-r-m.sh
+++ b/prepare_roman-r-m.sh
@@ -23,6 +23,8 @@ if [ ! -f target/CalculateAverage_roman_r_m_image ]; then
 
     JAVA_OPTS="--enable-preview -dsa"
     NATIVE_IMAGE_OPTS="--initialize-at-build-time=dev.morling.onebrc.CalculateAverage_roman_r_m --gc=epsilon -Ob -O3 -march=native --strict-image-heap $JAVA_OPTS"
+    NATIVE_IMAGE_OPTS="$NATIVE_IMAGE_OPTS -R:MaxHeapSize=128m"
+    NATIVE_IMAGE_OPTS="$NATIVE_IMAGE_OPTS -H:+UnlockExperimentalVMOptions -H:-GenLoopSafepoints -H:InlineAllBonus=10 -H:-ParseRuntimeOptions"
 
     native-image $NATIVE_IMAGE_OPTS -cp target/average-1.0.0-SNAPSHOT.jar -o target/CalculateAverage_roman_r_m_image dev.morling.onebrc.CalculateAverage_roman_r_m
 fi
\ No newline at end of file
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_roman_r_m.java b/src/main/java/dev/morling/onebrc/CalculateAverage_roman_r_m.java
index 896616d02..7529e8ad8 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_roman_r_m.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_roman_r_m.java
@@ -40,6 +40,7 @@ private static long broadcast(byte b) {
     private static final long SEMICOLON_MASK = broadcast((byte) ';');
     private static final long LINE_END_MASK = broadcast((byte) '\n');
     private static final long DOT_MASK = broadcast((byte) '.');
+    private static final long ZEROES_MASK = broadcast((byte) '0');
 
     // from netty
 
@@ -64,6 +65,15 @@ static long nextNewline(long from, MemorySegment ms) {
         return start + Long.numberOfTrailingZeros(i) / 8;
     }
 
+    static int hashFull(long word) {
+        return (int) (word ^ (word >>> 32));
+    }
+
+    static int hashPartial(long word, int bytes) {
+        long h = Long.reverseBytes(word) >>> (8 * (8 - bytes));
+        return (int) (h ^ (h >>> 32));
+    }
+
     public static void main(String[] args) throws Exception {
         Field f = Unsafe.class.getDeclaredField("theUnsafe");
         f.setAccessible(true);
@@ -96,34 +106,37 @@ public static void main(String[] args) throws Exception {
                         var station = new ByteString(segment);
                         long offset = segment.address();
                         long end = offset + segment.byteSize();
+                        long tailMask;
                         while (offset < end) {
                             // parsing station name
                             long start = offset;
                             long next = UNSAFE.getLong(offset);
                             long pattern = applyPattern(next, SEMICOLON_MASK);
                             int bytes;
-                            if (pattern != 0) {
+                            if (pattern == 0) {
+                                station.hash = hashFull(next);
+                                do {
+                                    offset += 8;
+                                    next = UNSAFE.getLong(offset);
+                                    pattern = applyPattern(next, SEMICOLON_MASK);
+                                } while (pattern == 0);
+
                                 bytes = Long.numberOfTrailingZeros(pattern) / 8;
                                 offset += bytes;
-                                long h = Long.reverseBytes(next) >>> (8 * (8 - bytes));
-                                station.hash = (int) (h ^ (h >>> 32));
+                                tailMask = ((1L << (8 * bytes)) - 1);
                             }
                             else {
-                                long h = next;
-                                station.hash = (int) (h ^ (h >>> 32));
-                                while (pattern == 0) {
-                                    offset += 8;
-                                    next = UNSAFE.getLong(offset);
-                                    pattern = applyPattern(next, SEMICOLON_MASK);
-                                }
                                 bytes = Long.numberOfTrailingZeros(pattern) / 8;
                                 offset += bytes;
+                                tailMask = ((1L << (8 * bytes)) - 1);
+
+                                station.hash = hashPartial(next, bytes);
                             }
 
                             int len = (int) (offset - start);
                             station.offset = start;
                             station.len = len;
-                            station.tail = next & ((1L << (8 * bytes)) - 1);
+                            station.tail = next & tailMask;
 
                             offset++;
 
@@ -140,7 +153,7 @@ public static void main(String[] args) throws Exception {
                                 long numLen = applyPattern(encodedVal, DOT_MASK);
                                 numLen = Long.numberOfTrailingZeros(numLen) / 8;
 
-                                encodedVal ^= broadcast((byte) 0x30);
+                                encodedVal ^= ZEROES_MASK;
 
                                 int intPart = (int) (encodedVal & ((1 << (8 * numLen)) - 1));
                                 intPart <<= 8 * (2 - numLen);
@@ -285,24 +298,31 @@ void update(ByteString s, int value) {
             int h = s.hashCode();
             int idx = (SIZE - 1) & h;
 
+            var keys = this.keys;
+
+            int idx0 = idx;
             int i = 0;
-            while (keys[idx] != null && !keys[idx].equals(s)) {
-                i++;
-                idx = (idx + i * i) % SIZE;
-            }
-            if (keys[idx] == null) {
-                keys[idx] = s.copy();
-                values[idx] = new int[4];
-                values[idx][0] = value;
-                values[idx][1] = value;
-                values[idx][2] = value;
-                values[idx][3] = 1;
-            }
-            else {
-                values[idx][0] = Math.min(values[idx][0], value);
-                values[idx][1] = Math.max(values[idx][1], value);
-                values[idx][2] += value;
-                values[idx][3] += 1;
+            while (true) {
+                if (keys[idx] != null && keys[idx].equals(s)) {
+                    values[idx][0] = Math.min(values[idx][0], value);
+                    values[idx][1] = Math.max(values[idx][1], value);
+                    values[idx][2] += value;
+                    values[idx][3] += 1;
+                    return;
+                }
+                else if (keys[idx] == null) {
+                    keys[idx] = s.copy();
+                    values[idx] = new int[4];
+                    values[idx][0] = value;
+                    values[idx][1] = value;
+                    values[idx][2] = value;
+                    values[idx][3] = 1;
+                    return;
+                }
+                else {
+                    i++;
+                    idx = (idx0 + i * i) % SIZE;
+                }
             }
         }
 

From 84f6331b835dd2f1c74c49ca9a01c63e87779fda Mon Sep 17 00:00:00 2001
From: Florin Blanaru <florin.blanaru96@gmail.com>
Date: Sat, 27 Jan 2024 16:20:02 +0200
Subject: [PATCH 144/268] 1BRC gigiblender (#595)

* Dirty implementation gigiblender

* Final impl gigiblender
---
 calculate_average_gigiblender.sh              |  19 +
 .../onebrc/CalculateAverage_gigiblender.java  | 501 ++++++++++++++++++
 2 files changed, 520 insertions(+)
 create mode 100755 calculate_average_gigiblender.sh
 create mode 100644 src/main/java/dev/morling/onebrc/CalculateAverage_gigiblender.java

diff --git a/calculate_average_gigiblender.sh b/calculate_average_gigiblender.sh
new file mode 100755
index 000000000..7d51bdc36
--- /dev/null
+++ b/calculate_average_gigiblender.sh
@@ -0,0 +1,19 @@
+#!/bin/sh
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+JAVA_OPTS="--enable-preview"
+java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_gigiblender
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_gigiblender.java b/src/main/java/dev/morling/onebrc/CalculateAverage_gigiblender.java
new file mode 100644
index 000000000..162d71209
--- /dev/null
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_gigiblender.java
@@ -0,0 +1,501 @@
+/*
+ *  Copyright 2023 The original authors
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package dev.morling.onebrc;
+
+import sun.misc.Unsafe;
+
+import java.io.IOException;
+import java.lang.foreign.Arena;
+import java.lang.reflect.Field;
+import java.nio.channels.FileChannel;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Paths;
+import java.nio.file.StandardOpenOption;
+import java.util.TreeMap;
+
+public class CalculateAverage_gigiblender {
+    private static final int AVAIL_CORES = Runtime.getRuntime().availableProcessors();
+    private static final HashTable[] tables = new HashTable[AVAIL_CORES];
+
+    private static Unsafe unsafe;
+    static {
+        Field theUnsafe = null;
+        try {
+            theUnsafe = Unsafe.class.getDeclaredField("theUnsafe");
+            theUnsafe.setAccessible(true);
+            unsafe = (Unsafe) theUnsafe.get(Unsafe.class);
+        }
+        catch (IllegalAccessException | NoSuchFieldException ignored) {
+        }
+    }
+
+    private static final String FILE = "./measurements.txt";
+
+    static class HashTable {
+
+        // 10_000 unique hashes ->
+        private static final int ENTRY_SIZE = 32;
+        private static final int NUM_ENTRIES = 16384;
+        private static final int DATA_SIZE = NUM_ENTRIES * ENTRY_SIZE;
+
+        /*
+         * data[i -> i + 7] = 8 bytes hash
+         * data[i + 8 -> i + 15] = 7 bytes masked address of the string in the file. 1 byte for the length of the string
+         * data[i + 16 -> i + 19] = 4 bytes count
+         * data[i + 20 -> i + 21] = 2 bytes max
+         * data[i + 22 -> i + 23] = 2 bytes min -- sign preserved
+         * data[i + 24 -> i + 31] = 8 bytes sum
+         */
+        byte[] data;
+
+        private static final int HASH_OFFSET = 0;
+
+        private static final int ADDR_OFFSET = 8;
+        private static final long ADDR_MASK = 0x00FFFFFFFFFFFFFFL;
+        private static final int STRING_LENGTH_SHIFT = 56;
+
+        private static final int COUNT_OFFSET = 16;
+
+        private static final int SUM_OFFSET = 24;
+
+        private int reprobe_count;
+
+        public HashTable() {
+            data = new byte[DATA_SIZE];
+            // reprobe_count = 0;
+        }
+
+        private long string_addr_and_length(long hash) {
+            return unsafe.getLong(data, Unsafe.ARRAY_BYTE_BASE_OFFSET + hash + ADDR_OFFSET);
+        }
+
+        private static long string_addr(long encoded_str_addr) {
+            return (encoded_str_addr & ADDR_MASK);
+        }
+
+        private static long string_length(long encoded_str_addr) {
+            return encoded_str_addr >>> STRING_LENGTH_SHIFT;
+        }
+
+        private long count_max_min(long hash) {
+            return unsafe.getLong(data, Unsafe.ARRAY_BYTE_BASE_OFFSET + hash + COUNT_OFFSET);
+        }
+
+        private static short mask_min(long count_max_min) {
+            // Preserve the sign
+            return (short) (count_max_min >> 6 * Byte.SIZE);
+        }
+
+        private static short mask_max(long count_max_min) {
+            return (short) (count_max_min >>> 4 * Byte.SIZE);
+        }
+
+        private static int mask_count(long count_max_min) {
+            return (int) count_max_min;
+        }
+
+        private static long encode_count_max_min(int count, short max, short min) {
+            return ((long) count) | ((((long) max) & 0xFFFF) << 4 * Byte.SIZE) | (((long) min) << 6 * Byte.SIZE);
+        }
+
+        private long sum(long hash) {
+            return unsafe.getLong(data, Unsafe.ARRAY_BYTE_BASE_OFFSET + hash + SUM_OFFSET);
+        }
+
+        private static boolean string_equals(long string_addr, long entry_string_addr, int size_bytes) {
+            int remaining_bytes = size_bytes % 8;
+            int i = 0;
+            for (; i < size_bytes - remaining_bytes; i += 8) {
+                long entry_bytes = unsafe.getLong(entry_string_addr + i);
+                long string_bytes = unsafe.getLong(string_addr + i);
+                if (entry_bytes != string_bytes) {
+                    return false;
+                }
+            }
+            // The hash function is not great, so I end up in this case a lot, so I take some risks.
+            // This never caused a SIGSEGV even though it might :) If it does, fall back to the commented version below.
+            // I will try to improve on the hash function
+            if (remaining_bytes != 0) {
+                long entry_bytes = unsafe.getLong(entry_string_addr + i);
+                long string_bytes = unsafe.getLong(string_addr + i);
+                // mask the bytes we care about
+                long mask = (1L << (remaining_bytes * Byte.SIZE)) - 1;
+                entry_bytes &= mask;
+                string_bytes &= mask;
+                return entry_bytes == string_bytes;
+            }
+            // for (; i < size_bytes; i++) {
+            // byte entry_byte = unsafe.getByte(entry_string_addr + i);
+            // byte string_byte = unsafe.getByte(string_addr + i);
+            // if (entry_byte != string_byte) {
+            // return false;
+            // }
+            // }
+            return true;
+        }
+
+        public void insert(long hash, long string_addr, byte string_size, long final_number) { // records one measurement for the name stored at string_addr
+            assert string_addr >>> 56 == 0 : String.format("Expected final 8 bytes to be 0, got %s", Long.toBinaryString(string_addr));
+            // The top 8 bits of the address must be zero (checked above); presumably STRING_LENGTH_SHIFT is 56 so the length fits there - confirm
+            long encoded_string_addr_and_length = string_addr | ((long) string_size << STRING_LENGTH_SHIFT);
+            assert string_addr(encoded_string_addr_and_length) == string_addr : String.format("Expected string addr to be %s, got %s", Long.toHexString(string_addr),
+                    Long.toHexString(string_addr(encoded_string_addr_and_length)));
+            assert string_length(encoded_string_addr_and_length) == string_size
+                    : String.format("Expected string length to be %s, got %s", string_size, string_length(encoded_string_addr_and_length));
+            // map_entry is a byte offset into data (not a slot index); probing starts at the slot selected by the hash
+            long map_entry = apply_mask(hash * ENTRY_SIZE);
+            while (true) { // linear probing; never terminates if the name is absent and no slot is empty
+                int entry_count0 = unsafe.getInt(data, Unsafe.ARRAY_BYTE_BASE_OFFSET + map_entry + COUNT_OFFSET); // NOTE(review): reads 4 bytes of the 8-byte word written below; presumably these hold the count - confirm
+                if (entry_count0 == 0) {
+                    // dump_insert(map_entry, hash, string_addr, string_size, final_number);
+                    // Found an empty slot. Insert the entry here
+                    unsafe.putLong(data, Unsafe.ARRAY_BYTE_BASE_OFFSET + map_entry + HASH_OFFSET, hash);
+                    unsafe.putLong(data, Unsafe.ARRAY_BYTE_BASE_OFFSET + map_entry + ADDR_OFFSET, encoded_string_addr_and_length);
+                    unsafe.putLong(data, Unsafe.ARRAY_BYTE_BASE_OFFSET + map_entry + COUNT_OFFSET, encode_count_max_min(1, (short) final_number, (short) final_number));
+                    unsafe.putLong(data, Unsafe.ARRAY_BYTE_BASE_OFFSET + map_entry + SUM_OFFSET, final_number);
+                    // Sanity checks (only evaluated with -ea): the packed word must decode back to count 1 and max == min == value
+                    assert mask_count(encode_count_max_min(1, (short) final_number, (short) final_number)) == 1 : String.format("Expected count to be 1, got %s",
+                            Integer.toBinaryString(mask_count(encode_count_max_min(1, (short) final_number, (short) final_number))));
+                    assert mask_max(encode_count_max_min(1, (short) final_number, (short) final_number)) == (short) final_number
+                            : String.format("Expected max to be %s, got %s", final_number,
+                                    Integer.toBinaryString(mask_max(encode_count_max_min(1, (short) final_number, (short) final_number))));
+                    assert mask_min(encode_count_max_min(1, (short) final_number, (short) final_number)) == (short) final_number
+                            : String.format("Expected min to be %s, got %s", final_number,
+                                    Integer.toBinaryString(mask_min(encode_count_max_min(1, (short) final_number, (short) final_number))));
+                    return;
+                }
+                else {
+                    // Check if strings match. If yes, update. Otherwise, look for the next available slot
+                    long entry_string_addr_and_length = string_addr_and_length(map_entry);
+                    long entry_str_size = string_length(entry_string_addr_and_length);
+                    // The stored hash is not consulted here: a slot is identified by name length first, then by the name bytes
+                    if (string_size != entry_str_size) {
+                        // Strings are not the same size. Continue looking for the next slot
+                        map_entry = apply_mask(map_entry + ENTRY_SIZE);
+                        // reprobe_count++;
+                    }
+                    else {
+                        long entry_string_addr = string_addr(entry_string_addr_and_length);
+                        if (string_equals(string_addr, entry_string_addr, string_size)) {
+                            // Strings are the same. Update the entry
+                            long entry_count_max_min = count_max_min(map_entry);
+                            int entry_count = mask_count(entry_count_max_min);
+                            short entry_max = mask_max(entry_count_max_min);
+                            short entry_min = mask_min(entry_count_max_min);
+
+                            entry_count++;
+                            assert (int) final_number == final_number : String.format("Expected final number to be an int, got %s", final_number);
+                            entry_max = (short) Math.max(entry_max, (int) final_number);
+                            entry_min = (short) Math.min(entry_min, (int) final_number);
+
+                            long entry_sum = sum(map_entry);
+                            entry_sum += final_number;
+                            // Count, max and min are re-packed into one long; the sum lives in its own long
+                            unsafe.putLong(data, Unsafe.ARRAY_BYTE_BASE_OFFSET + map_entry + COUNT_OFFSET, encode_count_max_min(entry_count, entry_max, entry_min));
+                            unsafe.putLong(data, Unsafe.ARRAY_BYTE_BASE_OFFSET + map_entry + SUM_OFFSET, entry_sum);
+                            return;
+                        }
+                        else {
+                            // Strings are not the same. Continue looking for the next slot
+                            map_entry = apply_mask(map_entry + ENTRY_SIZE);
+                            // reprobe_count++;
+                        }
+                    }
+                }
+            }
+        }
+
+        private static long apply_mask(long hash) { // wraps a byte offset into the table
+            return hash & (DATA_SIZE - 1); // equals hash mod DATA_SIZE only if DATA_SIZE is a power of two - TODO confirm at its declaration
+        }
+
+        public void update_res(TreeMap<String, Result> result_map) { // folds every occupied slot of this table into result_map
+            // System.err.println("Reprobe count: " + reprobe_count);
+            Result r = new Result();
+            // r is a spare instance offered to putIfAbsent; it is replaced by a fresh one only after the map has kept it
+            for (int i = 0; i < NUM_ENTRIES; i++) {
+                long entry_addr_offset = (long) i * ENTRY_SIZE;
+                long entry_count_max_min = count_max_min(entry_addr_offset);
+                int entry_count = mask_count(entry_count_max_min);
+                if (entry_count == 0) {
+                    continue; // a count of 0 marks an unused slot (insert stores 1 for a new entry)
+                }
+                long entry_string_addr_and_length = string_addr_and_length(entry_addr_offset);
+                long entry_string_addr = string_addr(entry_string_addr_and_length);
+                long entry_string_length = string_length(entry_string_addr_and_length);
+
+                // no reason to copy the byte array twice here but what can you do...
+                byte[] bytes = new byte[(int) entry_string_length];
+                unsafe.copyMemory(null, entry_string_addr, bytes, Unsafe.ARRAY_BYTE_BASE_OFFSET, entry_string_length);
+                String s = new String(bytes, StandardCharsets.UTF_8);
+
+                short entry_max = mask_max(entry_count_max_min);
+                short entry_min = mask_min(entry_count_max_min);
+
+                long entry_sum = sum(entry_addr_offset);
+                // First sighting of this name: fill in the instance that was just stored; otherwise merge into the existing one
+                Result ret = result_map.putIfAbsent(s, r);
+                if (ret == null) {
+                    r.count = entry_count;
+                    r.max = entry_max;
+                    r.min = entry_min;
+                    r.sum = entry_sum;
+                    r = new Result();
+                }
+                else {
+                    ret.count += entry_count;
+                    ret.max = (short) Math.max(ret.max, entry_max);
+                    ret.min = (short) Math.min(ret.min, entry_min);
+                    ret.sum += entry_sum;
+                }
+            }
+        }
+
+        public void dump_insert(long map_entry, long hash, long string_addr, byte string_size, long final_number) { // debugging aid: prints the arguments of an insert to stdout
+            System.out.println("START dump_insert");
+            System.out.println("Inserting " + final_number + " with hash " + hash);
+            System.out.println("Map entry: " + map_entry);
+            System.out.println("String addr: " + string_addr + " with length " + string_size);
+            dump(string_addr, string_addr + string_size); // prints the name bytes as text, binary and hex
+            System.out.println("END dump_insert");
+        }
+    }
+
+    static class Result { // aggregated statistics of one station; min, max and sum are held in tenths of a degree
+        public int count;
+        public short max;
+        public short min;
+        public long sum;
+        // Rounds to one decimal place using Math.round
+        private double round(double value) {
+            return Math.round(value * 10.0) / 10.0;
+        }
+        // Formats as min/mean/max in degrees. The divisor is built in double: the int product 10 * count wraps once count > 214_748_364
+        @Override
+        public String toString() {
+            return round(min / 10.) + "/" + round(sum / (10.0 * count)) + "/" + round(max / 10.);
+        }
+    }
+
+    private static void compute_slice(final long base_addr, final long slice_size, final long file_size, final int thread_index) { // parses one slice of the mapped file into a HashTable
+        HashTable my_table;
+        if (!SINGLE_CORE) {
+            my_table = new HashTable();
+            tables[thread_index] = my_table;
+        }
+        else {
+            if (tables[0] == null) {
+                tables[0] = new HashTable();
+            }
+            my_table = tables[0];
+        }
+        // The slice nominally starts at thread_index * slice_size; both of its ends are moved forward to the start of a line
+        long cur_addr = base_addr + (long) thread_index * slice_size;
+        // Lookup the next newline. If thread_index == 0 then start right away
+        if (thread_index != 0) {
+            while (unsafe.getByte(cur_addr) != '\n') {
+                cur_addr++;
+            }
+            cur_addr++;
+        }
+
+        long end_addr = base_addr + (long) (thread_index + 1) * slice_size;
+        if (thread_index == (AVAIL_CORES - 1)) {
+            // Last thread. We need to read until the end of the file
+            end_addr = base_addr + file_size;
+        }
+        else {
+            // look ahead for the next newline
+            while (unsafe.getByte(end_addr) != '\n') {
+                end_addr++;
+            }
+            end_addr++;
+        }
+
+        // We now have a well-defined interval [cur_addr, end_addr) to work on
+        long hash = -2346162244362633811L;
+        byte string_size = 0; // a byte: the running name length wraps for names longer than 127 bytes
+        long string_addr = cur_addr;
+        while (cur_addr < end_addr) {
+            long value_mem = unsafe.getLong(cur_addr); // NOTE(review): always loads 8 bytes, so near end_addr it reads past the slice and, for the last slice, possibly past the file
+            int semicolon_byte_index = get_semicolon_index(value_mem);
+            // get_semicolon_index returns 8 when the word holds no ';', so this adds a full word or the bytes before the ';'
+            string_size += (byte) semicolon_byte_index;
+
+            // dump(cur_addr, cur_addr + semicolon_byte_index);
+
+            if (semicolon_byte_index != 8) {
+                long value_mem_up_to_semicolon = value_mem & ((1L << (semicolon_byte_index * Byte.SIZE)) - 1); // keeps only the low-order bytes in front of the ';'
+
+                // We have a semicolon, so the hash is complete now. We can construct the number
+                // and insert it into the hash table
+                long start_num_addr = cur_addr + semicolon_byte_index + 1;
+
+                // Always read the next 8 bytes for the number. It seems that this is faster than
+                // checking if the whole number is in the current 8 bytes and only reading if it is not
+                long number_mem_value = unsafe.getLong(start_num_addr);
+                long number_len_bytes = get_newline_index(number_mem_value);
+
+                long final_number = extract_number(number_mem_value, number_len_bytes);
+
+                // 0.2421196 % reprobe rate
+                hash = compute_hash(hash ^ value_mem_up_to_semicolon);
+
+                // We have the final number now. We can insert it into the hash table
+                my_table.insert(hash, string_addr, string_size, final_number);
+                // Now we can move on to the next line
+                hash = -2346162244362633811L;
+                string_size = 0;
+                cur_addr = start_num_addr + number_len_bytes + 1;
+                string_addr = cur_addr;
+            }
+            else {
+                // No semicolon in the 8 bytes read. Continue reading
+                hash = hash ^ value_mem;
+                cur_addr += 8;
+            }
+        }
+        assert cur_addr == end_addr : String.format("Expected cur_addr to be %s, got %s", end_addr, cur_addr);
+    }
+
+    private static long extract_number(long number_mem_value, long number_len_bytes) { // decodes the text held in number_mem_value into a signed count of tenths
+        // Pray for GVN/CSE and Sea of Nodes moving the mess below in the proper places because
+        // I don't want to spend the time to do it properly :)
+        long number_mem_dot_index = get_dot_index(number_mem_value);
+        // fractional_part: digit in byte number_len_bytes - 1; sign: +1 or -1; skip_sign: 1 when the first byte is '-', else 0
+        int fractional_part = get_fractional_part(number_mem_value, number_len_bytes);
+        int sign = get_sign(number_mem_value);
+        int skip_sign = skip_sign(number_mem_value);
+        // Shift a leading '-' out (8 bits per skipped byte) so the first digit sits in the lowest byte
+        long number_mem_value_no_sign = number_mem_value >>> (skip_sign << 3);
+        // Two cases: either there's a single digit before the dot, or there's two
+        // Start from the dot index and go backwards
+        long new_number_mem_dot_index = number_mem_dot_index - skip_sign;
+        long read_byte_mask = 0xFFL << ((new_number_mem_dot_index - 1) * Byte.SIZE);
+        long ones = ((number_mem_value_no_sign & read_byte_mask) >>> ((new_number_mem_dot_index - 1) * Byte.SIZE)) - 0x30;
+        // Should be 0 due to the multiplication if there's only one digit before the dot
+        long tens = ((number_mem_value_no_sign & 0xFFL) - 0x30) * (new_number_mem_dot_index - 1);
+        // Value in tenths: e.g. the text "-12.3" gives (1 * 100 + 2 * 10 + 3) * -1 = -123
+        long final_number = (tens * 100 + ones * 10 + fractional_part) * sign;
+        return final_number;
+    }
+
+    private static int get_fractional_part(long number_mem_value, long number_len_bytes) { // digit value of the byte at index number_len_bytes - 1
+        return (int) ((number_mem_value >>> ((number_len_bytes - 1) << 3)) & 0xFFL) - '0';
+    }
+
+    private static int skip_sign(long number_mem_value) {
+        // return 1 if char is '-', 0 if it is not
+        long diff = (number_mem_value & 0xFF) - 0x2D;
+        long sign = (diff | -diff) >>> 63;
+        return (int) ((sign - 1) * -1);
+    }
+
+    private static int get_sign(long number_mem_value) {
+        // return 1 if char is not '-', -1 if it is
+        long diff = (number_mem_value & 0xFF) - 0x2D;
+        long sign = (diff | -diff) >>> 63;
+        return (int) (-2 * sign + 1) * -1;
+    }
+
+    private static long compute_hash(long x) { // Hash burrowed from artsiomkorzun and slightly changed
+        long h = x * -7046029254386353131L;
+        long h1 = h ^ (h >>> 32);
+        h = h ^ (h << 32);
+        return h1 ^ h;
+    }
+
+    private static void dump(long startAddr, long endAddr) { // debugging aid: prints the bytes in [startAddr, endAddr) as text, binary and hex
+        byte[] bytes = new byte[(int) (endAddr - startAddr)];
+        unsafe.copyMemory(null, startAddr, bytes, Unsafe.ARRAY_BYTE_BASE_OFFSET, bytes.length);
+        String s = new String(bytes, StandardCharsets.UTF_8);
+        System.out.println(s);
+        // Dump the bytes to binary form
+        for (byte b : bytes) {
+            System.out.print(Integer.toBinaryString(b & 0xFF)); // & 0xFF makes the byte unsigned; the output is not zero-padded
+            System.out.print(" ");
+        }
+        System.out.println();
+        // Dump the bytes to hex form
+        for (byte b : bytes) {
+            System.out.print(Integer.toHexString(b & 0xFF)); // likewise unsigned and not zero-padded
+            System.out.print(" ");
+        }
+        System.out.println();
+    }
+
+    private static int get_byte_0_index(long value) {
+        long res = (value - 0x0101010101010101L) & (~value & 0x8080808080808080L);
+        res = Long.numberOfTrailingZeros(res) >> 3;
+        return (int) res;
+    }
+
+    private static int get_dot_index(long value) {
+        // XOR with '.' (0x2E) in every byte turns each matching byte into 0x00, which get_byte_0_index then locates
+        return get_byte_0_index(value ^ 0x2E2E2E2E2E2E2E2EL);
+    }
+
+    private static int get_newline_index(long value) {
+        // XOR with '\n' (0x0A) in every byte turns each matching byte into 0x00, which get_byte_0_index then locates
+        return get_byte_0_index(value ^ 0x0A0A0A0A0A0A0A0AL);
+    }
+
+    private static int get_semicolon_index(long value) {
+        // XOR with ';' (0x3B) in every byte turns each matching byte into 0x00, which get_byte_0_index then locates
+        return get_byte_0_index(value ^ 0x3B3B3B3B3B3B3B3BL);
+    }
+
+    private static final boolean SINGLE_CORE = false; // when true, main processes the slices sequentially and compute_slice shares tables[0] for all of them
+
+    public static void main(String[] args) throws IOException, InterruptedException { // maps the file, processes one slice per core and prints the merged result
+        FileChannel file_channel = FileChannel.open(Paths.get(FILE), StandardOpenOption.READ);
+        long file_size = file_channel.size();
+        long base_addr = file_channel.map(FileChannel.MapMode.READ_ONLY, 0, file_size, Arena.global()).address();
+        // From here on the file is only accessed through raw addresses starting at base_addr; file_channel is never closed
+        if (!SINGLE_CORE) {
+            int num_threads = AVAIL_CORES;
+            Thread[] threads = new Thread[num_threads];
+            for (int i = 0; i < num_threads; i++) {
+                int finalI = i;
+                threads[i] = new Thread(() -> {
+                    long slice_size = file_size / AVAIL_CORES; // integer division; compute_slice lets the last slice run to the end of the file
+                    compute_slice(base_addr, slice_size, file_size, finalI);
+                });
+                threads[i].start();
+            }
+            // Each worker stores its own table in tables[i], so the table is read only after that worker has been joined
+            TreeMap<String, Result> result_map = new TreeMap<>();
+            for (int i = 0; i < num_threads; i++) {
+                threads[i].join();
+                tables[i].update_res(result_map);
+            }
+
+            System.out.println(result_map);
+        }
+        else {
+            for (int i = 0; i < AVAIL_CORES; i++) {
+                int finalI = i;
+                long slice_size = file_size / AVAIL_CORES;
+                compute_slice(base_addr, slice_size, file_size, finalI);
+            }
+            // In this mode compute_slice accumulates every slice into tables[0]
+            TreeMap<String, Result> result_map = new TreeMap<>();
+            tables[0].update_res(result_map);
+
+            System.out.println(result_map);
+        }
+    }
+}

From 489ec9e3b1bc0e26bbbef7135d40ccfb1ce05f02 Mon Sep 17 00:00:00 2001
From: Roy van Rijn <roy.van.rijn@gmail.com>
Date: Sat, 27 Jan 2024 06:24:06 -0800
Subject: [PATCH 145/268] Larger heap, small tweaks (#593)

More small tweaks, perf from 775~ to 738~
---
 prepare_royvanrijn.sh                         |  4 +-
 .../onebrc/CalculateAverage_royvanrijn.java   | 84 +++++++++++--------
 2 files changed, 52 insertions(+), 36 deletions(-)

diff --git a/prepare_royvanrijn.sh b/prepare_royvanrijn.sh
index a9789d6c0..81672e8f9 100755
--- a/prepare_royvanrijn.sh
+++ b/prepare_royvanrijn.sh
@@ -21,8 +21,8 @@ sdk use java 21.0.2-graal 1>&2
 # ./mvnw clean verify removes target/ and will re-trigger native image creation.
 if [ ! -f target/CalculateAverage_royvanrijn_image ]; then
 
-    JAVA_OPTS="--enable-preview -dsa"
-    NATIVE_IMAGE_OPTS="--initialize-at-build-time=dev.morling.onebrc.CalculateAverage_royvanrijn --gc=epsilon -Ob -O3 -march=native --strict-image-heap $JAVA_OPTS"
+    JAVA_OPTS="--enable-preview"
+    NATIVE_IMAGE_OPTS="-H:+UnlockExperimentalVMOptions --initialize-at-build-time=dev.morling.onebrc.CalculateAverage_royvanrijn --gc=epsilon -O3 -march=native -R:MaxHeapSize=515m -H:-GenLoopSafepoints -H:InlineAllBonus=10 -H:-ParseRuntimeOptions $JAVA_OPTS"
 
     native-image $NATIVE_IMAGE_OPTS -cp target/average-1.0.0-SNAPSHOT.jar -o target/CalculateAverage_royvanrijn_image dev.morling.onebrc.CalculateAverage_royvanrijn
 fi
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_royvanrijn.java b/src/main/java/dev/morling/onebrc/CalculateAverage_royvanrijn.java
index 1cd70e42c..14c40e2e2 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_royvanrijn.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_royvanrijn.java
@@ -62,8 +62,9 @@
  * Unrolling scan-loop:              1200 ms (seems to help, perhaps even more on target machine)
  * Adding more readable reader:      1300 ms (scores got worse on target machine anyway)
  *
- * Using old x86 MacBook and perf:   3500 ms (different scoring)
+ * Using old x86 MacBook and perf:   3500 ms (different machine for testing)
  * Decided to rewrite loop for 16 b: 3050 ms
+ * Small changes, limited heap:      2950 ms
  *
  * I have some instructions that could be removed, but faster with...
  *
@@ -201,6 +202,17 @@ private static byte[] fillEntry(final byte[] entry, final long fromAddress, fina
         return entry;
     }
 
+    private static byte[] fillEntry16(final byte[] entry, final int entryLength, final int temp, final long readBuffer1, final long readBuffer2) { // initialises entry from a first measurement whose name is held in the two read buffers
+        UNSAFE.putLong(entry, ENTRY_SUM, temp);
+        UNSAFE.putInt(entry, ENTRY_MIN, temp);
+        UNSAFE.putInt(entry, ENTRY_MAX, temp);
+        UNSAFE.putInt(entry, ENTRY_COUNT, 1);
+        UNSAFE.putByte(entry, ENTRY_LENGTH, (byte) entryLength);
+        UNSAFE.putLong(entry, ENTRY_NAME + entryLength - 16, readBuffer1); // with the entryLength of 16 passed by the caller in this patch: ENTRY_NAME + 0
+        UNSAFE.putLong(entry, ENTRY_NAME + entryLength - 8, readBuffer2); // and correspondingly ENTRY_NAME + 8
+        return entry;
+    }
+
     public static void updateEntry(final byte[] entry, final int temp) {
 
         int entryMin = UNSAFE.getInt(entry, ENTRY_MIN);
@@ -326,51 +338,53 @@ private boolean hasNext() {
 
         private boolean readNext() {
 
-            readBuffer1 = UNSAFE.getLong(ptr);
-            readBuffer2 = UNSAFE.getLong(ptr + 8);
+            long lastRead = UNSAFE.getLong(ptr);
 
             entryLength += 16;
 
             // Find delimiter and create mask for long1
-            long comparisonResult1 = (readBuffer1 ^ DELIMITER_MASK);
+            long comparisonResult1 = (lastRead ^ DELIMITER_MASK);
             long highBitMask1 = (comparisonResult1 - 0x0101010101010101L) & (~comparisonResult1 & 0x8080808080808080L);
 
             boolean noContent1 = highBitMask1 == 0;
             long mask1 = noContent1 ? 0 : ~((highBitMask1 >>> 7) - 1);
-            int position1 = noContent1 ? -1 : Long.numberOfTrailingZeros(highBitMask1) >> 3;
+            int position1 = noContent1 ? 0 : 1 + (Long.numberOfTrailingZeros(highBitMask1) >> 3);
 
-            readBuffer1 &= ~mask1;
+            readBuffer1 = lastRead & ~mask1;
             hash ^= readBuffer1;
 
-            if (position1 != -1) {
+            int delimiter1 = position1 == 0 ? 0 : position1; // not necessary, but faster?
+
+            if (delimiter1 != 0) {
                 hash ^= hash >> 32;
                 readBuffer2 = 0;
-                ptr += position1 + 1;
+                ptr += delimiter1;
                 return false;
             }
 
+            lastRead = UNSAFE.getLong(ptr + 8);
+
             // Repeat for long2
-            long comparisonResult2 = (readBuffer2 ^ DELIMITER_MASK);
+            long comparisonResult2 = (lastRead ^ DELIMITER_MASK);
             long highBitMask2 = (comparisonResult2 - 0x0101010101010101L) & (~comparisonResult2 & 0x8080808080808080L);
             boolean noContent2 = highBitMask2 == 0;
-            long mask2 = noContent2 ? -1 : ((highBitMask2 >>> 7) - 1);
-            int position2 = noContent2 ? -1 : Long.numberOfTrailingZeros(highBitMask2) >> 3;
+            long mask2 = noContent2 ? 0 : ~((highBitMask2 >>> 7) - 1);
+            int position2 = noContent2 ? 0 : 1 + (Long.numberOfTrailingZeros(highBitMask2) >> 3);
 
-            mask2 = ~mask2; // also not necessary, but faster with?
             // Apply masks
-            readBuffer2 &= ~mask2;
+            readBuffer2 = lastRead & ~mask2;
             hash ^= readBuffer2;
 
-            int delimiter = position2 == -1 ? -1 : position2 + 8; // not nnecessary, but faster?
+            int delimiter2 = position2 == 0 ? 0 : position2 + 8; // not necessary, but faster?
 
             hash ^= hash >> 32;
 
-            if (delimiter == -1) {
-                ptr += 16;
-                return true;
+            if (delimiter2 != 0) {
+                ptr += delimiter2;
+                return false;
             }
-            ptr += delimiter + 1;
-            return false;
+            ptr += 16;
+            return true;
         }
 
         private int processEndAndGetTemperature() {
@@ -388,20 +402,21 @@ private void finalizeHash() {
         // Awesome idea of merykitty:
         private int readTemperature() {
             // This is the number part: X.X, -X.X, XX.x or -XX.X
-            long numberBytes = UNSAFE.getLong(ptr);
-            long invNumberBytes = ~numberBytes;
-
-            int dotPosition = Long.numberOfTrailingZeros(invNumberBytes & DOT_BITS);
+            final long numberBytes = UNSAFE.getLong(ptr);
+            final long invNumberBytes = ~numberBytes;
 
-            // Update the pointer here, bit awkward, but we have all the data
-            ptr += (dotPosition >> 3) + 3;
+            final int dotPosition = Long.numberOfTrailingZeros(invNumberBytes & DOT_BITS);
 
-            int min28 = (28 - dotPosition);
             // Calculates the sign
             final long signed = (invNumberBytes << 59) >> 63;
+            final int min28 = (dotPosition ^ 0b11100);
             final long minusFilter = ~(signed & 0xFF);
             // Use the pre-calculated decimal position to adjust the values
-            long digits = ((numberBytes & minusFilter) << min28) & 0x0F000F0F00L;
+            final long digits = ((numberBytes & minusFilter) << min28) & 0x0F000F0F00L;
+
+            // Update the pointer here, bit awkward, but we have all the data
+            ptr += (dotPosition >> 3) + 3;
+
             // Multiply by a magic (100 * 0x1000000 + 10 * 0x10000 + 1), to get the result
             final long absValue = ((digits * MAGIC_MULTIPLIER) >>> 32) & 0x3FF;
             // And perform abs()
@@ -415,10 +430,6 @@ private boolean matches(final byte[] entry) {
                     return false;
                 }
                 step += 8;
-                if (compare(null, entryStart + step, entry, ENTRY_NAME + step)) {
-                    return false;
-                }
-                step += 8;
             }
             if (compare(readBuffer1, entry, ENTRY_NAME + step)) {
                 return false;
@@ -431,8 +442,13 @@ private boolean matches(final byte[] entry) {
         }
 
         private boolean matches16(final byte[] entry) {
-            return !compare(readBuffer1, entry, ENTRY_NAME) &&
-                    !compare(readBuffer2, entry, ENTRY_NAME + 8);
+            if (compare(readBuffer1, entry, ENTRY_NAME)) {
+                return false;
+            }
+            if (compare(readBuffer2, entry, ENTRY_NAME + 8)) {
+                return false;
+            }
+            return true;
         }
     }
 
@@ -463,7 +479,7 @@ private static byte[][] processMemoryArea(final long startAddress, final long en
                     if (entry == null) {
                         byte[] entryBytes = (entryCount < PREMADE_ENTRIES) ? preConstructedEntries[entryCount++]
                                 : new byte[ENTRY_BASESIZE_WHITESPACE + 16]; // with enough room
-                        table[index] = fillEntry(entryBytes, reader.entryStart, 16, temperature, reader.readBuffer1, reader.readBuffer2);
+                        table[index] = fillEntry16(entryBytes, 16, temperature, reader.readBuffer1, reader.readBuffer2);
                         break;
                     }
                     else if (reader.matches16(entry)) {

From 5c47ce1cbd3350635ef5b63b0745337f79481d79 Mon Sep 17 00:00:00 2001
From: Manish Garg <manishgarg08500@gmail.com>
Date: Sat, 27 Jan 2024 20:22:11 +0530
Subject: [PATCH 146/268] Reading 1B row file using Java NIO lib. (#601)

---
 calculate_average_manishgarg90.sh             |  19 ++
 prepare_manishgarg90.sh                       |  20 +++
 .../onebrc/CalculateAverage_manishgarg90.java | 169 ++++++++++++++++++
 3 files changed, 208 insertions(+)
 create mode 100755 calculate_average_manishgarg90.sh
 create mode 100755 prepare_manishgarg90.sh
 create mode 100644 src/main/java/dev/morling/onebrc/CalculateAverage_manishgarg90.java

diff --git a/calculate_average_manishgarg90.sh b/calculate_average_manishgarg90.sh
new file mode 100755
index 000000000..93c6a3794
--- /dev/null
+++ b/calculate_average_manishgarg90.sh
@@ -0,0 +1,19 @@
+#!/bin/sh
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+JAVA_OPTS=""
+java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_manishgarg90
diff --git a/prepare_manishgarg90.sh b/prepare_manishgarg90.sh
new file mode 100755
index 000000000..4cda7b411
--- /dev/null
+++ b/prepare_manishgarg90.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+# Uncomment below to use sdk
+# source "$HOME/.sdkman/bin/sdkman-init.sh"
+# sdk use java 21.0.1-graal 1>&2
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_manishgarg90.java b/src/main/java/dev/morling/onebrc/CalculateAverage_manishgarg90.java
new file mode 100644
index 000000000..11cad07ff
--- /dev/null
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_manishgarg90.java
@@ -0,0 +1,169 @@
+/*
+ *  Copyright 2023 The original authors
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package dev.morling.onebrc;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.MappedByteBuffer;
+import java.nio.channels.FileChannel;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Paths;
+import java.nio.file.StandardOpenOption;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.TreeMap;
+
+public class CalculateAverage_manishgarg90 {
+
+    private static final String FILE = "./measurements.txt";
+    private static final int nProcessors = Runtime.getRuntime().availableProcessors();
+    // FileChannel.map rejects regions above Integer.MAX_VALUE bytes; the slack leaves room to extend a chunk to its line end
+    private static final long MAX_CHUNK_SIZE = Integer.MAX_VALUE - 1024;
+
+    /** Splits the file into newline-aligned chunks, aggregates them in parallel and prints the result sorted by station. */
+    public static void main(String[] args) throws IOException {
+        try (FileChannel channel = FileChannel.open(Paths.get(FILE), StandardOpenOption.READ)) {
+            long fileSize = channel.size();
+            long chunkSize = Math.min((fileSize + nProcessors - 1) / nProcessors, MAX_CHUNK_SIZE);
+            long pos = 0;
+
+            List<MappedByteBuffer> buffers = new ArrayList<>(nProcessors);
+            // Keep cutting until the file is covered: a capped chunk size just yields more chunks, and no empty ones are mapped
+            while (pos < fileSize) {
+                long endPosition = getEndPosition(channel, pos + chunkSize);
+                buffers.add(channel.map(FileChannel.MapMode.READ_ONLY, pos, endPosition - pos));
+                pos = endPosition;
+            }
+
+            // An IOException now propagates (non-zero exit status) instead of being printed and swallowed
+            System.out.println(new TreeMap<>(readBufferAndCalculateMeasurements(buffers)));
+        }
+    }
+
+    /** Parses each chunk into its own station-to-statistics map, then merges the per-chunk maps into one. */
+    private static Map<String, Stat> readBufferAndCalculateMeasurements(List<MappedByteBuffer> chunks) {
+        return chunks.parallelStream().map(buffer -> {
+            Map<String, Stat> map = new HashMap<>(10_000, 1);
+            int lineStart = 0;
+            int doubleStart = 0;
+            int length = buffer.limit();
+            String station = null;
+            for (int i = 0; i < length; ++i) {
+                byte b = buffer.get(i);
+                if (b == ';') {
+                    byte[] stationBuffer = new byte[i - lineStart];
+                    buffer.position(lineStart);
+                    buffer.get(stationBuffer);
+                    station = new String(stationBuffer, StandardCharsets.UTF_8);
+                    doubleStart = i + 1;
+                }
+                else if (b == '\n') {
+                    byte[] doubleBuffer = new byte[i - doubleStart];
+                    buffer.position(doubleStart);
+                    buffer.get(doubleBuffer);
+                    // Explicit charset instead of the platform default; a primitive double avoids boxing on every row
+                    double temperature = Double.parseDouble(new String(doubleBuffer, StandardCharsets.UTF_8));
+                    lineStart = i + 1;
+
+                    // I have station name and temp
+                    Stat s = map.get(station);
+                    if (s == null) {
+                        map.put(station, new Stat(temperature));
+                    }
+                    else {
+                        s.update(temperature);
+                    }
+                }
+            }
+            return map;
+        }).reduce(new HashMap<>(), (map1, map2) -> {
+            Stat s = new Stat();
+            s.merge(map1);
+            s.merge(map2);
+            return s.getResultMap();
+        });
+    }
+
+    /** Returns the offset just past the first '\n' at or after {@code position}, or the file size if there is none. */
+    private static long getEndPosition(FileChannel channel, long position) throws IOException {
+        final long size = channel.size(); // queried once instead of on every iteration
+        ByteBuffer buffer = ByteBuffer.allocate(1);
+        while (position < size) {
+            // Only trust the buffer when a byte was actually read, never a leftover from the previous iteration
+            if (channel.read(buffer, position) == 1 && buffer.get(0) == '\n') {
+                return position + 1;
+            }
+            position++;
+            buffer.clear();
+        }
+        return size;
+    }
+
+    /** Running min/max/sum/count of one station; the no-arg instance only serves as the holder of a merged result map. */
+    private static final class Stat {
+
+        private double min = Double.POSITIVE_INFINITY;
+        private double max = Double.NEGATIVE_INFINITY;
+        private double sum = 0d;
+        private long count = 0L;
+
+        private Map<String, Stat> resultMap = null;
+
+        public Stat() {
+            this.resultMap = new HashMap<>(10_000, 1);
+        }
+
+        public Stat(double value) {
+            this.min = value;
+            this.max = value;
+            this.sum += value;
+            this.count++;
+        }
+
+        private void update(double value) {
+            this.min = Math.min(this.min, value);
+            this.max = Math.max(this.max, value);
+            this.sum = round(this.sum + value);
+            this.count++;
+        }
+
+        // Folds a partial result into resultMap; a Stat that is already present is updated in place
+        private void merge(Map<String, Stat> result) {
+            result.forEach((city, resultRow) -> resultMap.merge(city, resultRow, (existing, incoming) -> {
+                existing.min = Math.min(existing.min, incoming.min);
+                existing.max = Math.max(existing.max, incoming.max);
+                existing.sum += incoming.sum;
+                existing.count += incoming.count;
+                return existing;
+            }));
+        }
+
+        public Map<String, Stat> getResultMap() {
+            return resultMap;
+        }
+
+        private double round(double value) {
+            return Math.round(value * 10.0) / 10.0;
+        }
+
+        @Override
+        public String toString() {
+            return round(min) + "/" + round(sum / count) + "/" + round(max);
+        }
+    }
+}

From f1209f2ba8e286474f08762f9e4f161981e39cee Mon Sep 17 00:00:00 2001
From: Gunnar Morling <gunnar.morling@googlemail.com>
Date: Sat, 27 Jan 2024 16:10:47 +0100
Subject: [PATCH 147/268] Leaderboard update

---
 README.md | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index e43beb96c..1fe35e151 100644
--- a/README.md
+++ b/README.md
@@ -42,16 +42,16 @@ These are the results from running all entries into the challenge on eight cores
 | # | Result (m:s.ms) | Implementation     | JDK | Submitter     | Notes     |
 |---|-----------------|--------------------|-----|---------------|-----------|
 | 1 | 00:02.019 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java)| 21.0.2-graal | [Artsiom Korzun](https://github.com/artsiomkorzun) | GraalVM native binary, uses Unsafe |
-|   | 00:02.169 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java)| 21.0.2-graal | [Van Phu DO](https://github.com/abeobk) | GraalVM native binary, uses Unsafe |
-| 3* | 00:02.195 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_thomaswue.java)| 21.0.2-graal | [Thomas Wuerthinger](https://github.com/thomaswue), [Quan Anh Mai](https://github.com/merykitty), [Alfonso² Peterssen](https://github.com/mukel) | GraalVM native binary, uses Unsafe |
-| 3* | 00:02.196 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_royvanrijn.java)| 21.0.2-graal | [Roy van Rijn](https://github.com/royvanrijn) | GraalVM native binary, uses Unsafe |
+| 2* | 00:02.146 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java)| 21.0.2-graal | [Van Phu DO](https://github.com/abeobk) | GraalVM native binary, uses Unsafe |
+| 2* | 00:02.157 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_royvanrijn.java)| 21.0.2-graal | [Roy van Rijn](https://github.com/royvanrijn) | GraalVM native binary, uses Unsafe |
+| 3 | 00:02.195 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_thomaswue.java)| 21.0.2-graal | [Thomas Wuerthinger](https://github.com/thomaswue), [Quan Anh Mai](https://github.com/merykitty), [Alfonso² Peterssen](https://github.com/mukel) | GraalVM native binary, uses Unsafe |
 |   | 00:02.374 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jerrinot.java)| 21.0.1-graal | [Jaromir Hamala](https://github.com/jerrinot) | uses Unsafe |
 |   | 00:02.575 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_merykittyunsafe.java)| 21.0.1-open | [Quan Anh Mai](https://github.com/merykitty) | uses Unsafe |
 |   | 00:02.984 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_yourwass.java)| 21.0.1-open | [yourwass](https://github.com/yourwass) | uses Unsafe |
 |   | 00:03.258 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_merykitty.java)| 21.0.1-open | [Quan Anh Mai](https://github.com/merykitty) |  |
 |   | 00:03.298 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_vemana.java)| 21.0.1-graal | [Subrahmanyam (non-idiomatic)](https://github.com/vemana) | uses Unsafe |
 |   | 00:03.376 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_mtopolnik.java)| 21.0.1-graal | [Marko Topolnik](https://github.com/mtopolnik) | uses Unsafe |
-|   | 00:03.510 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_roman_r_m.java)| 21.0.1-graal | [Roman Musin](https://github.com/roman-r-m) | GraalVM native binary |
+|   | 00:03.431 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_roman_r_m.java)| 21.0.1-graal | [Roman Musin](https://github.com/roman-r-m) | GraalVM native binary |
 |   | 00:03.518 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_JamalMulla.java)| 21.0.1-graal | [Jamal Mulla](https://github.com/JamalMulla) | GraalVM native binary, uses Unsafe |
 |   | 00:03.594 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_yavuztas.java)| 21.0.2-graal | [Yavuz Tas](https://github.com/yavuztas) | GraalVM native binary, uses Unsafe |
 |   | 00:03.698 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_hundredwatt.java)| 21.0.1-graal | [Jason Nochlin](https://github.com/hundredwatt) |  |
@@ -62,6 +62,7 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:03.966 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jincongho.java)| 21.0.1-open | [Jin Cong Ho](https://github.com/jincongho) | uses Unsafe |
 |   | 00:04.066 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_JesseVanRooy.java)| 21.0.1-open | [JesseVanRooy](https://github.com/JesseVanRooy) | uses Unsafe |
 |   | 00:04.154 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_iziamos.java)| 21.0.1-open | [John Ziamos](https://github.com/iziamos) | uses Unsafe |
+|   | 00:04.684 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_gigiblender.java)| 21.0.1-open | [Florin Blanaru](https://github.com/gigiblender) | uses Unsafe |
 |   | 00:04.741 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_cliffclick.java)| 21.0.1-open | [Cliff Click](https://github.com/cliffclick) | uses Unsafe |
 |   | 00:04.800 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_parkertimmins.java)| 21.0.1-open | [Parker Timmins](https://github.com/parkertimmins) |  |
 |   | 00:04.920 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_vemana.java)| 21.0.1-graal | [Subrahmanyam](https://github.com/vemana) |  |
@@ -81,6 +82,7 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:06.635 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_justplainlaake.java)| 21.0.1-graal | [Laake Scates-Gervasi](https://github.com/justplainlaake) | GraalVM native binary, uses Unsafe |
 |   | 00:06.654 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jbachorik.java)| 21.0.1-graal | [Jaroslav Bachorik](https://github.com/jbachorik) |  |
 |   | 00:06.715 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_algirdasrascius.java)| 21.0.1-open | [Algirdas Raščius](https://github.com/algirdasrascius) |  |
+|   | 00:06.884 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_rcasteltrione.java)| 21.0.1-graal | [rcasteltrione](https://github.com/rcasteltrione) |  |
 |   | 00:07.240 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_giovannicuccu.java)| java | [giovannicuccu](https://github.com/giovannicuccu) |  |
 |   | 00:07.563 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_3j5a.java)| 21.0.1-graal | [3j5a](https://github.com/3j5a) |  |
 |   | 00:07.680 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_C5H12O5.java)| 21.0.1-graal | [Xylitol](https://github.com/C5H12O5) | uses Unsafe |
@@ -154,12 +156,14 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:34.388 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_twobiers.java)| 21.0.1-tem | [Tobi](https://github.com/twobiers) |  |
 |   | 00:35.875 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_MahmoudFawzyKhalil.java)| 21.0.1-open | [MahmoudFawzyKhalil](https://github.com/MahmoudFawzyKhalil) |  |
 |   | 00:36.180 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_hchiorean.java)| 21.0.1-open | [Horia Chiorean](https://github.com/hchiorean) |  |
+|   | 00:36.424 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_manishgarg90.java)| java | [Manish Garg](https://github.com/manishgarg90) |  |
 |   | 00:38.340 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_AbstractKamen.java)| 21.0.1-open | [AbstractKamen](https://github.com/AbstractKamen) |  |
 |   | 00:41.982 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_criccomini.java)| 21.0.1-open | [Chris Riccomini](https://github.com/criccomini) |  |
 |   | 00:42.893 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_javamak.java)| 21.0.1-open | [javamak](https://github.com/javamak) |  |
 |   | 00:46.597 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_maeda6uiui.java)| 21.0.1-open | [Maeda-san](https://github.com/maeda6uiui) |  |
 |   | 00:58.811 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_Ujjwalbharti.java)| 21.0.1-open | [Ujjwal Bharti](https://github.com/Ujjwalbharti) |  |
 |   | 01:05.094 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_mudit-saxena.java)| 21.0.1-open | [Mudit Saxena](https://github.com/mudit-saxena) |  |
+|   | 01:05.979 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_dqhieuu.java)| 21.0.1-graal | [Hieu Dao Quang](https://github.com/dqhieuu) |  |
 |   | 01:06.790 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_khmarbaise.java)| 21.0.1-open | [Karl Heinz Marbaise](https://github.com/khmarbaise) |  |
 |   | 01:06.944 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_santanu.java)| 21.0.1-open | [santanu](https://github.com/santanu) |  |
 |   | 01:07.014 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_pedestrianlove.java)| 21.0.1-open | [pedestrianlove](https://github.com/pedestrianlove) |  |

From 6b5b68c77216b9744c37afbe3c2c092023c17f56 Mon Sep 17 00:00:00 2001
From: Gunnar Morling <gunnar.morling@googlemail.com>
Date: Sat, 27 Jan 2024 18:20:13 +0100
Subject: [PATCH 148/268] Leaderboard update

---
 README.md | 37 ++++++++++++++++++++++---------------
 1 file changed, 22 insertions(+), 15 deletions(-)

diff --git a/README.md b/README.md
index 1fe35e151..34f7a6cff 100644
--- a/README.md
+++ b/README.md
@@ -229,27 +229,34 @@ Here are the results from running the top 15 entries (as of commit [2c26b511](ht
 #### 10K Key Set
 
 The 1BRC challenge data set contains 413 distinct weather stations, whereas the rules allow for 10,000 different station names to occur.
-Here are the results from running the top 15 entries (as of commit [2c26b511](https://github.com/gunnarmorling/1brc/commit/2c26b511e741f4d96a51dda831001946ea27a591)) against 1,000,000,000 measurement values across 10K stations (created via _./create_measurements3.sh 1000000000_),
+Here are the results from running the top 15 entries (as of commit [f1209f2b](https://github.com/gunnarmorling/1brc/commit/f1209f2ba8e286474f08762f9e4f161981e39cee), Jan 27) against 1,000,000,000 measurement values across 10K stations (created via _./create_measurements3.sh 1000000000_),
 using eight cores on the evaluation machine:
 
 | # | Result (m:s.ms) | Implementation     | JDK | Submitter     | Notes     |
 |---|-----------------|--------------------|-----|---------------|-----------|
-| 1 | 00:04.589 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java)| 21.0.1-graal | [Artsiom Korzun](https://github.com/artsiomkorzun) |  |
-| 2 | 00:05.296 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_royvanrijn.java)| 21.0.1-graal | [Roy van Rijn](https://github.com/royvanrijn) | GraalVM native binary |
-| 3 | 00:05.308 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_thomaswue.java)| 21.0.1-graal | [Thomas Wuerthinger](https://github.com/thomaswue) | GraalVM native binary |
-|   | 00:05.881 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_mtopolnik.java)| 21.0.1-graal | [Marko Topolnik](https://github.com/mtopolnik) |  |
-|   | 00:07.120 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_JamalMulla.java)| 21.0.1-graal | [Jamal Mulla](https://github.com/JamalMulla) |  |
-|   | 00:07.915 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_cliffclick.java)| 21.0.1-open | [Cliff Click](https://github.com/cliffclick) |  |
-|   | 00:08.979 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_yavuztas.java)| 21.0.1-graal | [Yavuz Tas](https://github.com/yavuztas) |  |
-|   | 00:10.052 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_merykittyunsafe.java)| 21.0.1-open | [merykittyunsafe](https://github.com/merykittyunsafe) |  |
-|   | 00:10.134 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_vaidhy.java)| 21.0.1-graal | [Vaidhy Mayilrangam](https://github.com/vaidhy) |  |
-|   | 00:10.599 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ebarlas.java)| 21.0.1-graal | [Elliot Barlas](https://github.com/ebarlas) |  |
-|   | 00:12.750 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_merykitty.java)| 21.0.1-open | [Quan Anh Mai](https://github.com/merykitty) |  |
+| 1 | 00:02.741 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java)| 21.0.2-graal | [Artsiom Korzun](https://github.com/artsiomkorzun) | GraalVM native binary, uses Unsafe |
+| 2 | 00:04.001 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_royvanrijn.java)| 21.0.2-graal | [Roy van Rijn](https://github.com/royvanrijn) | GraalVM native binary, uses Unsafe |
+| 3 | 00:04.516 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_thomaswue.java)| 21.0.2-graal | [Thomas Wuerthinger](https://github.com/thomaswue), [Quan Anh Mai](https://github.com/merykitty), [Alfonso² Peterssen](https://github.com/mukel) | GraalVM native binary, uses Unsafe |
+|   | 00:04.848 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jerrinot.java)| 21.0.1-graal | [Jaromir Hamala](https://github.com/jerrinot) | uses Unsafe |
+|   | 00:05.127 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_mtopolnik.java)| 21.0.1-graal | [Marko Topolnik](https://github.com/mtopolnik) | uses Unsafe |
+|   | 00:05.614 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_gonix.java)| 21.0.1-open | [gonix](https://github.com/gonix) |  |
+|   | 00:05.670 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java)| 21.0.2-graal | [Van Phu DO](https://github.com/abeobk) | GraalVM native binary, uses Unsafe |
+|   | 00:06.111 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ebarlas.java)| 21.0.1-graal | [Elliot Barlas](https://github.com/ebarlas) | uses Unsafe |
+|   | 00:06.929 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_JamalMulla.java)| 21.0.1-graal | [Jamal Mulla](https://github.com/JamalMulla) | uses Unsafe |
+|   | 00:09.018 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_yavuztas.java)| 21.0.1-graal | [Yavuz Tas](https://github.com/yavuztas) | uses Unsafe |
+|   | 00:10.038 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_merykittyunsafe.java)| 21.0.1-open | [Quan Anh Mai](https://github.com/merykitty) | uses Unsafe |
+|   | 00:10.197 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_yourwass.java)| 21.0.1-open | [yourwass](https://github.com/yourwass) | uses Unsafe |
+|   | 00:12.567 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jparera.java)| 21.0.1-open | [Juan Parera](https://github.com/jparera) |  |
+|   | 00:12.602 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_merykitty.java)| 21.0.1-open | [Quan Anh Mai](https://github.com/merykitty) |  |
+|   | 00:15.896 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jincongho.java)| 21.0.1-open | [Jin Cong Ho](https://github.com/jincongho) | uses Unsafe |
+|   | 00:18.064 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_hundredwatt.java)| 21.0.1-graal | [Jason Nochlin](https://github.com/hundredwatt) |  |
+|   | 00:20.374 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_roman_r_m.java)| 21.0.1-graal | [Roman Musin](https://github.com/roman-r-m) | GraalVM native binary |
+|   | 04:11.062 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_vemana.java)| 21.0.1-graal | [Subrahmanyam](https://github.com/vemana) |  |
 |   | ---       | | | | |
-|   | DNF | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_hundredwatt.java)| 21.0.1-graal | [Jason Nochlin](https://github.com/hundredwatt) | Didn't complete in 60 sec |
-|   | DNF | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java)| 21.0.1-open | [Van Phu DO](https://github.com/abeobk) | Didn't complete in 60 sec |
+|   | DNF | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_JesseVanRooy.java)| 21.0.1-open | [JesseVanRooy](https://github.com/JesseVanRooy) | Incorrect output |
 |   | DNF | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_iziamos.java)| 21.0.1-open | [John Ziamos](https://github.com/iziamos) | Didn't complete in 60 sec |
-|   | DNF | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_obourgain.java)| 21.0.1-open | [Olivier Bourgain](https://github.com/obourgain) | Failed with java.lang.OutOfMemoryError: Java heap space |
+|   | DNF | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_zerninv.java)| 21.0.1-graal | [zerninv](https://github.com/zerninv) | Seg fault |
+
 
 ## Prerequisites
 

From eea9c33858a668ffd05ec27a4565b9e1afdb5604 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jairo=20Grater=C3=B3n?=
 <58091322+jgrateron@users.noreply.github.com>
Date: Sat, 27 Jan 2024 14:32:15 -0400
Subject: [PATCH 149/268] Fix hash code collisions (#605)

* fix test rounding, pass 10K station names

* improved integer conversion, delayed string creation.

* new algorithm hash, use ConcurrentHashMap

* fix rounding test

* added the length of the string in the hash initialization.

* fix hash code collisions
---
 .../onebrc/CalculateAverage_jgrateron.java    | 156 +++++++++---------
 1 file changed, 77 insertions(+), 79 deletions(-)

diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_jgrateron.java b/src/main/java/dev/morling/onebrc/CalculateAverage_jgrateron.java
index fa93167c2..f79fe7a30 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_jgrateron.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_jgrateron.java
@@ -20,11 +20,12 @@
 import java.io.IOException;
 import java.io.RandomAccessFile;
 import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Comparator;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Locale;
 import java.util.Map;
-import java.util.TreeMap;
 import java.util.stream.Collectors;
 
 public class CalculateAverage_jgrateron {
@@ -32,6 +33,8 @@ public class CalculateAverage_jgrateron {
     private static final int MAX_LENGTH_LINE = 255;
     private static final int MAX_BUFFER = 1024 * 8;
     private static boolean DEBUG = false;
+    public static int DECENAS[] = { 0, 10, 20, 30, 40, 50, 60, 70, 80, 90 };
+    public static int CENTENAS[] = { 0, 100, 200, 300, 400, 500, 600, 700, 800, 900 };
 
     public record Particion(long offset, long size) {
     }
@@ -92,21 +95,25 @@ public static void main(String[] args) throws InterruptedException, IOException
         Locale.setDefault(Locale.US);
         var startTime = System.nanoTime();
         var archivo = new File(FILE);
-        var totalMediciones = new TreeMap<String, Medicion>();
         var tareas = new ArrayList<Thread>();
+        var totalMediciones = new HashMap<Index, Medicion>();
         var particiones = dividirArchivo(archivo);
 
         for (var p : particiones) {
             var hilo = Thread.ofVirtual().start(() -> {
                 try (var miTarea = new MiTarea(archivo, p)) {
                     var mediciones = miTarea.calcularMediciones();
-                    synchronized (totalMediciones) {
-                        for (var entry : mediciones.entrySet()) {
-                            var medicion = totalMediciones.get(entry.getKey());
+                    for (var entry : mediciones.entrySet()) {
+                        Medicion medicion;
+                        synchronized (totalMediciones) {
+                            medicion = totalMediciones.get(entry.getKey());
                             if (medicion == null) {
                                 totalMediciones.put(entry.getKey(), entry.getValue());
+                                medicion = entry.getValue();
                             }
-                            else {
+                        }
+                        synchronized (medicion) {
+                            if (!medicion.equals(entry.getValue())) {
                                 var otraMed = entry.getValue();
                                 medicion.update(otraMed.count, otraMed.tempMin, otraMed.tempMax, otraMed.tempSum);
                             }
@@ -119,12 +126,18 @@ public static void main(String[] args) throws InterruptedException, IOException
             });
             tareas.add(hilo);
         }
+
+        Comparator<Map.Entry<Index, Medicion>> comparar = (a, b) -> {
+            return a.getValue().getNombreEstacion().compareTo(b.getValue().getNombreEstacion());
+        };
+
         for (var hilo : tareas) {
             hilo.join();
         }
 
         var result = totalMediciones.entrySet().stream()//
-                .map(e -> e.getKey() + "=" + e.getValue().toString())//
+                .sorted(comparar)
+                .map(e -> e.getValue().toString())//
                 .collect(Collectors.joining(", "));
 
         System.out.println("{" + result + "}");
@@ -138,17 +151,38 @@ public static void main(String[] args) throws InterruptedException, IOException
      */
     static class Index {
         private int hash;
+        private byte[] data;
+        private int fromIndex;
+        private int length;
 
         public Index() {
             this.hash = 0;
         }
 
-        public Index(int hash) {
-            this.hash = hash;
+        public Index(byte data[], int fromIndex, int length) {
+            this.data = data;
+            this.fromIndex = fromIndex;
+            this.length = length;
+            this.hash = calcHashCode(length, data, fromIndex, length);
+        }
+
+        public void setData(byte data[], int fromIndex, int length) {
+            this.data = data;
+            this.fromIndex = fromIndex;
+            this.length = length;
+            this.hash = calcHashCode(length, data, fromIndex, length);
         }
 
-        public void setHash(int hash) {
-            this.hash = hash;
+        /*
+         * Calcula el hash de cada estacion,
+         * variation of Daniel J Bernstein's algorithm
+         */
+        private int calcHashCode(int result, byte[] a, int fromIndex, int length) {
+            int end = fromIndex + length;
+            for (int i = fromIndex; i < end; i++) {
+                result = ((result << 5) + result) ^ a[i];
+            }
+            return result;
         }
 
         @Override
@@ -162,7 +196,8 @@ public boolean equals(Object obj) {
                 return true;
             }
             var otro = (Index) obj;
-            return this.hash == otro.hash;
+            return Arrays.equals(this.data, this.fromIndex, this.fromIndex + this.length, otro.data, otro.fromIndex,
+                    otro.fromIndex + otro.length);
         }
     }
 
@@ -171,14 +206,12 @@ public boolean equals(Object obj) {
      * RandomAccessFile permite dezplazar el puntero de lectura del archivo
      * Tenemos un Map para guardar las estadisticas y un map para guardar los
      * nombres de las estaciones
-     * 
      */
     static class MiTarea implements AutoCloseable {
         private final RandomAccessFile rFile;
         private long maxRead;
         private Index index = new Index();
         private Map<Index, Medicion> mediciones = new HashMap<>();
-        private Map<Index, String> estaciones = new HashMap<>();
 
         public MiTarea(File file, Particion particion) throws IOException {
             rFile = new RandomAccessFile(file, "r");
@@ -197,7 +230,7 @@ public void close() throws IOException {
          * obtiene la posicion de separacion ";" de la estacion y su temperatura
          * calcula el hash, convierte a double y actualiza las estadisticas
          */
-        public Map<String, Medicion> calcularMediciones() throws IOException {
+        public Map<Index, Medicion> calcularMediciones() throws IOException {
             var buffer = new byte[MAX_BUFFER];// buffer para lectura en el archivo
             var rest = new byte[MAX_LENGTH_LINE];// Resto que sobra en cada lectura del buffer
             var lenRest = 0;// Longitud que sobró en cada lectura del buffer
@@ -211,17 +244,15 @@ public Map<String, Medicion> calcularMediciones() throws IOException {
                 if (numBytes == -1) {
                     break;
                 }
-                var totalLeidos = totalRead + numBytes;
-                if (totalLeidos > maxRead) {
-                    numBytes = maxRead - totalRead;
-                }
+                numBytes = totalRead + numBytes > maxRead ? maxRead - totalRead : numBytes;
                 totalRead += numBytes;
                 int pos = 0;
                 int len = 0;
                 int idx = 0;
                 int semicolon = 0;
                 while (pos < numBytes) {
-                    if (buffer[pos] == '\n' || buffer[pos] == '\r') {
+                    var b = buffer[pos];
+                    if (b == '\n' || b == '\r') {
                         if (lenRest > 0) {
                             // concatenamos el sobrante anterior con la nueva linea
                             System.arraycopy(buffer, idx, rest, lenRest, len);
@@ -238,7 +269,7 @@ public Map<String, Medicion> calcularMediciones() throws IOException {
                         semicolon = 0;
                     }
                     else {
-                        if (buffer[pos] == ';') {
+                        if (b == ';') {
                             semicolon = len;
                         }
                         len++;
@@ -250,7 +281,7 @@ public Map<String, Medicion> calcularMediciones() throws IOException {
                     lenRest = len;
                 }
             }
-            return transformMediciones();
+            return mediciones;
         }
 
         /*
@@ -270,19 +301,14 @@ public int buscarSemicolon(byte data[], int len) {
          * Busca una medicion por su hash y crea o actualiza la temperatura
          */
         public void updateMediciones(byte data[], int pos, int semicolon) {
-            var hashEstacion = calcHashCode(0, data, pos, semicolon);
             var temp = strToInt(data, pos, semicolon);
-            index.setHash(hashEstacion);
-            var estacion = estaciones.get(index);
-            if (estacion == null) {
-                estacion = new String(data, pos, semicolon);
-                estaciones.put(new Index(hashEstacion), estacion);
-            }
-            index.setHash(hashEstacion);
+            index.setData(data, pos, semicolon);
             var medicion = mediciones.get(index);
             if (medicion == null) {
-                medicion = new Medicion(1, temp, temp, temp);
-                mediciones.put(new Index(hashEstacion), medicion);
+                var estacion = new byte[semicolon];
+                System.arraycopy(data, pos, estacion, 0, semicolon);
+                medicion = new Medicion(estacion, 1, temp, temp, temp);
+                mediciones.put(new Index(estacion, 0, semicolon), medicion);
             }
             else {
                 medicion.update(1, temp, temp, temp);
@@ -290,50 +316,15 @@ public void updateMediciones(byte data[], int pos, int semicolon) {
         }
 
         /*
-         * Convierte las estaciones de hash a string
-         */
-        private Map<String, Medicion> transformMediciones() {
-            var newMediciones = new HashMap<String, Medicion>();
-            for (var e : mediciones.entrySet()) {
-                var estacion = estaciones.get(e.getKey());
-                var medicion = e.getValue();
-                newMediciones.put(estacion, medicion);
-            }
-            return newMediciones;
-        }
-
-        /*
-         * Calcula el hash de cada estacion, esto es una copia de java.internal.hashcode
+         * convierte de un arreglo de bytes a integer
          */
-        private int calcHashCode(int result, byte[] a, int fromIndex, int length) {
-            int end = fromIndex + length;
-            for (int i = fromIndex; i < end; i++) {
-                result = 31 * result + a[i];
-            }
-            return result;
-        }
 
-        /*
-         * convierte de un arreglo de bytes a double
-         */
         public int strToInt(byte linea[], int idx, int posSeparator) {
-            int number = 0;
             int pos = idx + posSeparator + 1;
             boolean esNegativo = linea[pos] == '-';
-            if (esNegativo) {
-                pos++;
-            }
-            int digit1 = linea[pos] - 48;
-            pos++;
-            if (linea[pos] == '.') {
-                pos++;
-                number = (digit1 * 10) + (linea[pos] - 48);
-            }
-            else {
-                int digit2 = linea[pos] - 48;
-                pos += 2;
-                number = (digit1 * 100) + (digit2 * 10) + (linea[pos] - 48);
-            }
+            pos = esNegativo ? pos + 1 : pos;
+            int number = linea[pos + 1] == '.' ? DECENAS[(linea[pos] - 48)] + linea[pos + 2] - 48
+                    : CENTENAS[(linea[pos] - 48)] + DECENAS[(linea[pos + 1] - 48)] + (linea[pos + 3] - 48);
             return esNegativo ? -number : number;
         }
     }
@@ -346,9 +337,12 @@ static class Medicion {
         private int tempMin;
         private int tempMax;
         private int tempSum;
+        private byte estacion[];
+        private String nombreEstacion;
 
-        public Medicion(int count, int tempMin, int tempMax, int tempSum) {
+        public Medicion(byte estacion[], int count, int tempMin, int tempMax, int tempSum) {
             super();
+            this.estacion = estacion;
             this.count = count;
             this.tempMin = tempMin;
             this.tempMax = tempMax;
@@ -357,12 +351,8 @@ public Medicion(int count, int tempMin, int tempMax, int tempSum) {
 
         public void update(int count, int tempMin, int tempMax, int tempSum) {
             this.count += count;
-            if (tempMin < this.tempMin) {
-                this.tempMin = tempMin;
-            }
-            if (tempMax > this.tempMax) {
-                this.tempMax = tempMax;
-            }
+            this.tempMin = Math.min(tempMin, this.tempMin);
+            this.tempMax = Math.max(tempMax, this.tempMax);
             this.tempSum += tempSum;
         }
 
@@ -370,12 +360,20 @@ public double round(double number) {
             return Math.round(number) / 10.0;
         }
 
+        public String getNombreEstacion() {
+            if (nombreEstacion == null) {
+                nombreEstacion = new String(estacion);
+            }
+            return nombreEstacion;
+        }
+
         @Override
         public String toString() {
             var min = round(tempMin);
             var mid = round(1.0 * tempSum / count);
             var max = round(tempMax);
-            return "%.1f/%.1f/%.1f".formatted(min, mid, max);
+            var nombre = getNombreEstacion();
+            return "%s=%.1f/%.1f/%.1f".formatted(nombre, min, mid, max);
         }
     }
 }

From a304f80710940fca3b283e9816b92c46986871d0 Mon Sep 17 00:00:00 2001
From: Yevhenii Melnyk <melnyk.yevhenii@gmail.com>
Date: Sat, 27 Jan 2024 19:37:19 +0100
Subject: [PATCH 150/268] (new submission) melgenek: ~top 15 on 10k. Buffered
 IO, VarHandles, vectors, custom hashtable (#600)

* melgenek: ~top 15 on 10k. Buffered IO, VarHandles, vectors, custom hashtable

* Calculate the required heap size dynamically
---
 calculate_average_melgenek.sh                 |  37 ++
 prepare_melgenek.sh                           |  19 +
 .../onebrc/CalculateAverage_melgenek.java     | 549 ++++++++++++++++++
 3 files changed, 605 insertions(+)
 create mode 100755 calculate_average_melgenek.sh
 create mode 100755 prepare_melgenek.sh
 create mode 100644 src/main/java/dev/morling/onebrc/CalculateAverage_melgenek.java

diff --git a/calculate_average_melgenek.sh b/calculate_average_melgenek.sh
new file mode 100755
index 000000000..e0a88a352
--- /dev/null
+++ b/calculate_average_melgenek.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+JAVA_OPTS="--enable-preview --add-modules jdk.incubator.vector -Djdk.incubator.vector.VECTOR_ACCESS_OOB_CHECK=0"
+JAVA_OPTS="$JAVA_OPTS -XX:+UnlockExperimentalVMOptions -XX:+UseEpsilonGC -XX:+AlwaysPreTouch"
+# These flags are mostly copied from shipilev's branch. They don't really give a predictable benefit, but they don't hurt either.
+JAVA_OPTS="$JAVA_OPTS -XX:-TieredCompilation -XX:CICompilerCount=1 -XX:CompileThreshold=2048 -XX:-UseCountedLoopSafepoints -XX:+TrustFinalNonStaticFields"
+
+if [[ "$(uname -s)" == "Linux" ]]; then
+    JAVA_OPTS="$JAVA_OPTS -XX:+UseTransparentHugePages"
+fi
+
+# https://stackoverflow.com/a/23378780/7221823
+logicalCpuCount=$([ $(uname) = 'Darwin' ] &&
+                       sysctl -n hw.logicalcpu_max ||
+                       lscpu -p | egrep -v '^#' | wc -l)
+# The required heap is proportional to the number of cores.
+# There's roughly 3.5MB heap per thread required for the 10k problem.
+requiredMemory=$(echo "(l(15 + 3.5 * $logicalCpuCount)/l(2))" | bc -l)
+heapSize=$(echo "scale=0; 2^(($requiredMemory+1)/1)" | bc)
+
+JAVA_OPTS="$JAVA_OPTS -Xms${heapSize}m -Xmx${heapSize}m"
+java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_melgenek
diff --git a/prepare_melgenek.sh b/prepare_melgenek.sh
new file mode 100755
index 000000000..09c53f634
--- /dev/null
+++ b/prepare_melgenek.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+source "$HOME/.sdkman/bin/sdkman-init.sh"
+sdk use java 21.0.2-open 1>&2
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_melgenek.java b/src/main/java/dev/morling/onebrc/CalculateAverage_melgenek.java
new file mode 100644
index 000000000..133573186
--- /dev/null
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_melgenek.java
@@ -0,0 +1,549 @@
+/*
+ *  Copyright 2023 The original authors
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package dev.morling.onebrc;
+
+import jdk.incubator.vector.ByteVector;
+import jdk.incubator.vector.Vector;
+import jdk.incubator.vector.VectorOperators;
+import jdk.incubator.vector.VectorSpecies;
+
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.RandomAccessFile;
+import java.lang.invoke.MethodHandles;
+import java.lang.invoke.VarHandle;
+import java.nio.ByteOrder;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.TreeMap;
+import java.util.concurrent.Executors;
+
+/**
+ * The implementation:
+ * - reads a file with buffered IO
+ * - uses VarHandles to get longs/ints from a byte array
+ * - delimiter search is vectorized
+ * - there is a custom hash function, that provides a low collision rate and short probe distances in hash tables
+ * - has 2 custom open addressing hash tables: one for strings <=8 bytes in length, and one more for strings of any length
+ */
+public class CalculateAverage_melgenek {
+
+    private static final VarHandle LONG_VIEW = MethodHandles.byteArrayViewVarHandle(long[].class, ByteOrder.nativeOrder()).withInvokeExactBehavior();
+    private static final VarHandle INT_VIEW = MethodHandles.byteArrayViewVarHandle(int[].class, ByteOrder.nativeOrder()).withInvokeExactBehavior();
+    private static final int CORES_COUNT = Runtime.getRuntime().availableProcessors();
+
+    private static final String FILE = "./measurements.txt";
+
+    /**
+     * This is a prime number that gives pretty
+     * <a href="https://vanilla-java.github.io/2018/08/15/Looking-at-randomness-and-performance-for-hash-codes.html">good hash distributions</a>
+     * on the data in this challenge.
+     */
+    private static final long RANDOM_PRIME = 0x7A646E4D;
+    private static final int ZERO_CHAR_3_SUM = 100 * '0' + 10 * '0' + '0';
+    private static final int ZERO_CHAR_2_SUM = 10 * '0' + '0';
+    private static final byte NEWLINE = '\n';
+    private static final byte SEMICOLON = ';';
+    private static final VectorSpecies<Byte> BYTE_SPECIES = ByteVector.SPECIES_PREFERRED;
+    private static final int BYTE_SPECIES_BYTE_SIZE = BYTE_SPECIES.vectorByteSize();
+    private static final Vector<Byte> NEWLINE_VECTOR = BYTE_SPECIES.broadcast(NEWLINE);
+    private static final Vector<Byte> SEMICOLON_VECTOR = BYTE_SPECIES.broadcast(SEMICOLON);
+    private static final int MAX_LINE_LENGTH = 107; // 100 + len(";-11.1\n") = 100+7
+    private static final TreeMap<String, ResultRow> RESULT = new TreeMap<>();
+
+    public static void main(String[] args) throws Throwable {
+        long totalSize = Files.size(Path.of(FILE));
+        // Keep at least one pool thread: newFixedThreadPool rejects 0, which CORES_COUNT - 1 yields on a single core.
+        try (var executor = Executors.newFixedThreadPool(Math.max(1, CORES_COUNT - 1))) {
+            long chunkSize = Math.max(2, totalSize / CORES_COUNT); // >= 2, so only the first range starts at offset 0
+            long offset = 0;
+            for (int i = 0; offset < totalSize && i < CORES_COUNT - 1; i++) {
+                long currentOffset = offset;
+                long maxOffset = Math.min((i + 1) * chunkSize, totalSize);
+                executor.submit(() -> processRange(currentOffset, maxOffset));
+                offset = (i + 1) * chunkSize - 1;
+            }
+            if (offset < totalSize) {
+                processRange(offset, totalSize);
+            }
+        }
+        System.out.println(RESULT);
+    }
+
+    private static void processRange(long startOffset, long maxOffset) {
+        final var table = new CompositeTable();
+        try (var file = new BufferedFile(startOffset, maxOffset)) {
+            processChunk(file, table);
+        }
+        catch (Exception e) {
+            throw new RuntimeException(e);
+        }
+        synchronized (RESULT) {
+            table.addRows(RESULT);
+        }
+    }
+
+    private static void processChunk(BufferedFile file, CompositeTable table) {
+        if (file.offset != 0) {
+            file.refillBuffer();
+            int newlinePosition = findDelimiter(file, 0, NEWLINE_VECTOR, NEWLINE);
+            file.bufferPosition = newlinePosition + 1;
+            file.offset += file.bufferPosition;
+        }
+        while (file.offset < file.maxOffset) {
+            file.refillBuffer();
+            int bytesProcessed = processOneRow(file, table);
+            file.offset += bytesProcessed;
+        }
+    }
+
+    private static int processOneRow(BufferedFile file, CompositeTable table) {
+        int stringStart = file.bufferPosition;
+        int stringEnd = findDelimiter(file, stringStart, SEMICOLON_VECTOR, SEMICOLON);
+
+        file.bufferPosition = stringEnd + 1;
+        short value = parseValue(file);
+
+        table.add(file.buffer, stringStart, stringEnd, value);
+
+        return file.bufferPosition - stringStart;
+    }
+
+    private static short parseValue(BufferedFile file) {
+        byte firstDigit = file.buffer[file.bufferPosition];
+        int sign = 1;
+        if (firstDigit == '-') {
+            sign = -1;
+            file.bufferPosition++;
+            firstDigit = file.buffer[file.bufferPosition];
+        }
+
+        byte secondDigit = file.buffer[file.bufferPosition + 1];
+        int result;
+        if (secondDigit == '.') {
+            result = firstDigit * 10 + file.buffer[file.bufferPosition + 2] - ZERO_CHAR_2_SUM;
+            file.bufferPosition += 4;
+        }
+        else {
+            result = firstDigit * 100 + secondDigit * 10 + file.buffer[file.bufferPosition + 3] - ZERO_CHAR_3_SUM;
+            file.bufferPosition += 5;
+        }
+        return (short) (result * sign);
+    }
+
+    /**
+     * <a href="https://gms.tf/stdfind-and-memchr-optimizations.html#do-more-faster">Finds a delimiter in a byte array using vectorized comparisons.</a>
+     */
+    private static int findDelimiter(BufferedFile file, int startPosition, Vector<Byte> repeatedDelimiter, byte delimiter) {
+        int position = startPosition;
+        int vectorLoopBound = startPosition + BYTE_SPECIES.loopBound(file.bufferLimit - startPosition);
+        for (; position < vectorLoopBound; position += BYTE_SPECIES_BYTE_SIZE) {
+            var vector = ByteVector.fromArray(BYTE_SPECIES, file.buffer, position);
+            var comparisonResult = vector.compare(VectorOperators.EQ, repeatedDelimiter);
+            if (comparisonResult.anyTrue()) {
+                return position + comparisonResult.firstTrue();
+            }
+        }
+
+        while (file.buffer[position] != delimiter) {
+            position++;
+        }
+
+        return position;
+    }
+
+    private static long keepLastBytes(long value, int numBytesToKeep) {
+        // Number of bits to shift, so that the mask covers only the `numBytesToKeep` least significant bytes
+        int bitShift = (Long.BYTES - numBytesToKeep) * Byte.SIZE;
+        // Mask in which all bits of the specified number of least significant bytes are set to 1
+        long mask = -1L >>> bitShift;
+        return value & mask;
+    }
+
+    /**
+     * The function transforms a string with the length <=8 bytes to a java String.
+     * The function assumes that the string is 0 terminated.
+     */
+    private static String longToString(long value) {
+        int strLength = Long.BYTES - Long.numberOfLeadingZeros(value) / Byte.SIZE;
+        var bytes = new byte[strLength];
+        for (int i = 0; i < strLength; i++) {
+            bytes[i] = (byte) (value >> (i * Byte.SIZE));
+        }
+        return new String(bytes, StandardCharsets.UTF_8);
+    }
+
+    /**
+     * Store measurements based on string lengths.
+     * Stores strings of length <= 8 and other strings separately.
+     * This table is a simplified implementation of the string hash table in <a href="https://github.com/ClickHouse/ClickHouse/blob/master/src/Common/HashTable/StringHashMap.h">ClickHouse</a>.
+     * The original paper that describes the benefits of the approach is <a href="https://www.mdpi.com/2076-3417/10/6/1915">SAHA: A String Adaptive Hash Table for Analytical Databases</a>.
+     */
+    private static final class CompositeTable {
+        private final LongTable longTable = new LongTable();
+        private final RegularTable regularTable = new RegularTable();
+
+        private void add(byte[] buffer, int stringStart, int stringEnd, short value) {
+            int stringLength = stringEnd - stringStart;
+            if (stringLength <= Long.BYTES) {
+                long str = keepLastBytes((long) LONG_VIEW.get(buffer, stringStart), stringLength);
+                this.longTable.add(str, value);
+            }
+            else {
+                int hash = calculateHash(buffer, stringStart, stringEnd);
+                this.regularTable.add(buffer, stringStart, stringLength, hash, value);
+            }
+        }
+
+        public void addRows(TreeMap<String, ResultRow> result) {
+            this.longTable.addRows(result);
+            this.regularTable.addRows(result);
+        }
+    }
+
+    /**
+     * The hash calculation is inspired by
+     * <a href="https://questdb.io/blog/building-faster-hash-table-high-performance-sql-joins/#fastmap-internals">QuestDB FastMap</a>
+     */
+    private static int calculateHash(byte[] buffer, int startPosition, int endPosition) {
+        long hash = 0;
+
+        int position = startPosition;
+        for (; position + Long.BYTES <= endPosition; position += Long.BYTES) {
+            long value = (long) LONG_VIEW.get(buffer, position);
+            hash = hash * RANDOM_PRIME + value;
+        }
+
+        if (position + Integer.BYTES <= endPosition) {
+            int value = (int) INT_VIEW.get(buffer, position);
+            hash = hash * RANDOM_PRIME + value;
+            position += Integer.BYTES;
+        }
+
+        for (; position <= endPosition; position++) {
+            hash = hash * RANDOM_PRIME + buffer[position];
+        }
+        hash = hash * RANDOM_PRIME;
+        return (int) hash ^ (int) (hash >>> 32);
+    }
+
+    private static int calculateLongHash(long str) {
+        long hash = str * RANDOM_PRIME;
+        return (int) hash ^ (int) (hash >>> 32);
+    }
+
+    /**
+     * This table stores strings of length <= 8 bytes.
+     * Does not store hashes.
+     */
+    private static final class LongTable {
+        private static final int TABLE_CAPACITY = 32768;
+        private static final int TABLE_CAPACITY_MASK = TABLE_CAPACITY - 1;
+        /**
+         * The buckets use 3 longs to store strings and measurements:
+         * long 1) station name
+         * long 2) sum of measurements
+         * long 3) count (int) | min (short) | max (short) <-- packed into one long
+         */
+        private final long[] buckets = new long[TABLE_CAPACITY * 3];
+
+        int keysCount = 0;
+
+        public void add(long str, short value) {
+            int hash = calculateLongHash(str);
+            int bucketIdx = hash & TABLE_CAPACITY_MASK;
+
+            long bucketStr = buckets[bucketIdx * 3];
+            if (bucketStr == str) {
+                updateBucket(bucketIdx, value);
+            }
+            else if (bucketStr == 0L) {
+                createBucket(bucketIdx, str, value);
+                keysCount++;
+            }
+            else {
+                addWithProbing(str, value, (bucketIdx + 1) & TABLE_CAPACITY_MASK);
+            }
+        }
+
+        private void addWithProbing(long str, short value, int bucketIdx) {
+            int distance = 1;
+            while (true) {
+                long bucketStr = buckets[bucketIdx * 3];
+                if (bucketStr == str) {
+                    updateBucket(bucketIdx, value);
+                    break;
+                }
+                else if (bucketStr == 0L) {
+                    createBucket(bucketIdx, str, value);
+                    keysCount++;
+                    break;
+                }
+                else {
+                    distance++;
+                    // A new bucket index is calculated based on quadratic probing https://thenumb.at/Hashtables/#quadratic-probing
+                    // Quadratic probing decreases the number of collisions and max probing distance.
+                    // Linear:
+                    // - capacity 16k, 28.6M collisions, 14-17 max distance
+                    // - capacity 32k, 9.5M collisions, 5-7 max distance
+                    // Quadratic:
+                    // - capacity 16k 25M collisions, 8-10 max distance
+                    // - capacity 32k, 9.3M collisions, 4-7 max distance
+                    bucketIdx = (bucketIdx + distance) & TABLE_CAPACITY_MASK;
+                }
+            }
+        }
+
+        public void addRows(TreeMap<String, ResultRow> result) {
+            for (int bucketIdx = 0; bucketIdx < TABLE_CAPACITY; bucketIdx++) {
+                int bucketOffset = bucketIdx * 3;
+                long str = buckets[bucketOffset];
+                if (str != 0L) {
+                    long sum = buckets[bucketOffset + 1];
+                    long countMinMax = buckets[bucketOffset + 2];
+                    int count = (int) ((countMinMax >> 32));
+                    short min = (short) ((countMinMax >> 16) & 0xFFFF);
+                    short max = (short) (countMinMax & 0xFFFF);
+
+                    result.compute(longToString(str), (k, resultRow) -> {
+                        if (resultRow == null) {
+                            return new ResultRow(sum, count, min, max);
+                        }
+                        else {
+                            resultRow.add(sum, count, min, max);
+                            return resultRow;
+                        }
+                    });
+                }
+            }
+        }
+
+        private void createBucket(int bucketIdx, long str, short value) {
+            int offset = bucketIdx * 3;
+            buckets[offset] = str;
+            buckets[offset + 1] = value;
+            buckets[offset + 2] = (1L << 32) | ((long) (value & 0xFFFF) << 16) | (long) (value & 0xFFFF);
+        }
+
+        private void updateBucket(int bucketIdx, short value) {
+            int offset = bucketIdx * 3;
+            long sum = buckets[offset + 1];
+            buckets[offset + 1] = sum + value;
+
+            long countMinMax = buckets[offset + 2];
+            int count = (int) ((countMinMax >> 32));
+            short min = (short) ((countMinMax >> 16) & 0xFFFF);
+            short max = (short) (countMinMax & 0xFFFF);
+            if (value < min) {
+                min = value;
+            }
+            if (value > max) {
+                max = value;
+            }
+            buckets[offset + 2] = ((long) (count + 1) << 32) | ((long) (min & 0xFFFF) << 16) | (long) (max & 0xFFFF);
+        }
+    }
+
+    /**
+     * An open addressing hash table that stores strings as byte arrays.
+     * Stores hashes.
+     */
+    private static final class RegularTable {
+        private static final int TABLE_CAPACITY = 16384;
+        private static final int TABLE_CAPACITY_MASK = TABLE_CAPACITY - 1;
+        private final Bucket[] buckets = new Bucket[TABLE_CAPACITY];
+
+        int keysCount = 0;
+
+        public void add(byte[] data, int start, int stringLength, int hash, short value) {
+            int bucketIdx = hash & TABLE_CAPACITY_MASK;
+
+            var bucket = buckets[bucketIdx];
+            if (bucket == null) {
+                buckets[bucketIdx] = new Bucket(data, start, stringLength, hash, value);
+                keysCount++;
+            }
+            else if (hash == bucket.hash && bucket.isEqual(data, start, stringLength)) {
+                bucket.update(value);
+            }
+            else {
+                addWithProbing(data, start, stringLength, hash, value, (bucketIdx + 1) & TABLE_CAPACITY_MASK);
+            }
+        }
+
+        private void addWithProbing(byte[] data, int start, int stringLength, int hash, short value, int bucketIdx) {
+            int distance = 1;
+            while (true) {
+                var bucket = buckets[bucketIdx];
+                if (bucket == null) {
+                    buckets[bucketIdx] = new Bucket(data, start, stringLength, hash, value);
+                    keysCount++;
+                    break;
+                }
+                else if (hash == bucket.hash && bucket.isEqual(data, start, stringLength)) {
+                    bucket.update(value);
+                    break;
+                }
+                else {
+                    distance++;
+                    bucketIdx = (bucketIdx + distance) & TABLE_CAPACITY_MASK;
+                }
+            }
+        }
+
+        public void addRows(TreeMap<String, ResultRow> result) {
+            for (var bucket : buckets) {
+                if (bucket != null) {
+                    result.compute(new String(bucket.str, StandardCharsets.UTF_8), (k, resultRow) -> {
+                        if (resultRow == null) {
+                            return new ResultRow(bucket.sum, bucket.count, bucket.min, bucket.max);
+                        }
+                        else {
+                            resultRow.add(bucket.sum, bucket.count, bucket.min, bucket.max);
+                            return resultRow;
+                        }
+                    });
+                }
+            }
+        }
+
+        private static final class Bucket {
+            int hash;
+            byte[] str;
+            long sum;
+            int count;
+            short max = Short.MIN_VALUE;
+            short min = Short.MAX_VALUE;
+
+            Bucket(byte[] data, int start, int stringLength, int hash, short value) {
+                this.str = new byte[stringLength];
+                System.arraycopy(data, start, this.str, 0, stringLength);
+                this.hash = hash;
+                update(value);
+            }
+
+            public void update(short value) {
+                this.sum += value;
+                this.count++;
+                if (max < value)
+                    max = value;
+                if (min > value)
+                    min = value;
+            }
+
+            public boolean isEqual(byte[] data, int start, int length) {
+                if (str.length != length)
+                    return false;
+                int i = 0;
+                for (; i + Long.BYTES < str.length; i += Long.BYTES) {
+                    long value1 = (long) LONG_VIEW.get(str, i);
+                    long value2 = (long) LONG_VIEW.get(data, start + i);
+                    if (value1 != value2)
+                        return false;
+                }
+                if (i + Integer.BYTES < str.length) {
+                    int value1 = (int) INT_VIEW.get(str, i);
+                    int value2 = (int) INT_VIEW.get(data, start + i);
+                    if (value1 != value2)
+                        return false;
+                    i += Integer.BYTES;
+                }
+                for (; i < str.length; i++) {
+                    if (data[start + i] != str[i])
+                        return false;
+                }
+                return true;
+            }
+        }
+    }
+
+    private static class ResultRow {
+        long sum;
+        int count;
+        short min;
+        short max;
+
+        public ResultRow(long sum, int count, short min, short max) {
+            this.sum = sum;
+            this.count = count;
+            this.min = min;
+            this.max = max;
+        }
+
+        public void add(long anotherSum, int anotherCount, short anotherMin, short anotherMax) {
+            sum += anotherSum;
+            count += anotherCount;
+            if (max < anotherMax)
+                max = anotherMax;
+            if (min > anotherMin)
+                min = anotherMin;
+        }
+
+        public String toString() {
+            return Math.round((double) min) / 10.0 + "/"
+                    + Math.round((double) sum / count) / 10.0 + "/"
+                    + Math.round((double) max) / 10.0;
+        }
+    }
+
+    /**
+     * A utility class that uses the RandomAccessFile to read at offset.
+     * Keeps the in-memory buffer, as well as current offsets in the buffer and the file.
+     */
+    private static final class BufferedFile implements AutoCloseable {
+        private static final int BUFFER_SIZE = 512 * 1024;
+        private final byte[] buffer = new byte[BUFFER_SIZE];
+        private int bufferLimit = 0;
+        private int bufferPosition = 0;
+        private final long maxOffset;
+        private final RandomAccessFile file;
+        private long offset;
+
+        private BufferedFile(long startOffset, long maxOffset) throws FileNotFoundException {
+            this.offset = startOffset;
+            this.maxOffset = maxOffset;
+            this.file = new RandomAccessFile(FILE, "r");
+        }
+
+        /**
+         * Reloads the buffer from the file position {@code offset} once fewer than
+         * {@code MAX_LINE_LENGTH} unread bytes remain; otherwise leaves it untouched.
+         */
+        private void refillBuffer() {
+            if (bufferLimit - bufferPosition >= MAX_LINE_LENGTH) {
+                return;
+            }
+            bufferPosition = 0;
+            int bytesRead;
+            try {
+                file.seek(offset);
+                bytesRead = file.read(buffer, 0, BUFFER_SIZE);
+            }
+            catch (IOException e) {
+                throw new RuntimeException(e);
+            }
+            // A non-positive read count leaves the buffer empty.
+            bufferLimit = Math.max(bytesRead, 0);
+        }
+
+        @Override
+        public void close() throws Exception {
+            file.close();
+        }
+    }
+
+}

From d9604d9258ce29d955d89ce3f6d72dfd5119a42a Mon Sep 17 00:00:00 2001
From: tivrfoa <lescoutinhovr@gmail.com>
Date: Sat, 27 Jan 2024 15:41:00 -0300
Subject: [PATCH 151/268] Use LinkedBlockingQueue to process results - based on
 thomaswue (#603)

/**
 * Solution based on thomaswue solution, commit:
 * commit d0a28599c293d3afe3291fc3cf169a7b25ae9ae6
 * Author: Thomas Wuerthinger <thomas.wuerthinger@oracle.com>
 * Date:   Sun Jan 21 20:13:48 2024 +0100
 *
 * Changes:
 *   1) Use LinkedBlockingQueue to store partial results, that
 *   will then be merged into the final map later.
 *   As different chunks finish at different times, this allows
 *   them to be processed as they finish, instead of joining the
 *   threads sequentially.
 *     This change seems more useful for the 10k dataset, as the
 *   runtime difference of each chunk is greater.
 *   2) Use only 4 threads if the file is >= 14GB.
 *   This showed much better results on my local test, but I only
 *   ran it with 200 million rows (because of limited RAM), and I have
 *   no idea how it will perform on the 1brc HW.
 */
---
 calculate_average_tivrfoa.sh                  |  24 +
 prepare_tivrfoa.sh                            |  26 ++
 .../onebrc/CalculateAverage_tivrfoa.java      | 431 ++++++++++++++++++
 3 files changed, 481 insertions(+)
 create mode 100755 calculate_average_tivrfoa.sh
 create mode 100755 prepare_tivrfoa.sh
 create mode 100644 src/main/java/dev/morling/onebrc/CalculateAverage_tivrfoa.java

diff --git a/calculate_average_tivrfoa.sh b/calculate_average_tivrfoa.sh
new file mode 100755
index 000000000..cec66fdd8
--- /dev/null
+++ b/calculate_average_tivrfoa.sh
@@ -0,0 +1,24 @@
+#!/bin/sh
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+if [ -f target/CalculateAverage_tivrfoa_image ]; then
+    target/CalculateAverage_tivrfoa_image
+else
+    JAVA_OPTS="--enable-preview"
+    java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_tivrfoa
+fi
+
diff --git a/prepare_tivrfoa.sh b/prepare_tivrfoa.sh
new file mode 100755
index 000000000..7cbf309e5
--- /dev/null
+++ b/prepare_tivrfoa.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+source "$HOME/.sdkman/bin/sdkman-init.sh"
+sdk use java 21.0.2-graal 1>&2
+
+# ./mvnw clean verify removes target/ and will re-trigger native image creation.
+if [ ! -f target/CalculateAverage_tivrfoa_image ]; then
+    NATIVE_IMAGE_OPTS="--gc=epsilon -O3 -march=native --enable-preview -H:InlineAllBonus=10 -H:-ParseRuntimeOptions --initialize-at-build-time=dev.morling.onebrc.CalculateAverage_tivrfoa\$Scanner"
+    # Use -H:MethodFilter=CalculateAverage_tivrfoa.* -H:Dump=:2 -H:PrintGraph=Network for IdealGraphVisualizer graph dumping.
+    native-image $NATIVE_IMAGE_OPTS -cp target/average-1.0.0-SNAPSHOT.jar -o target/CalculateAverage_tivrfoa_image dev.morling.onebrc.CalculateAverage_tivrfoa
+fi
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_tivrfoa.java b/src/main/java/dev/morling/onebrc/CalculateAverage_tivrfoa.java
new file mode 100644
index 000000000..a1b48441f
--- /dev/null
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_tivrfoa.java
@@ -0,0 +1,431 @@
+/*
+ *  Copyright 2023 The original authors
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package dev.morling.onebrc;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.nio.channels.FileChannel;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Path;
+import java.nio.file.StandardOpenOption;
+import java.util.*;
+import java.util.concurrent.LinkedBlockingQueue;
+
+/**
+ * Solution based on thomaswue solution, commit:
+ * commit d0a28599c293d3afe3291fc3cf169a7b25ae9ae6
+ * Author: Thomas Wuerthinger <thomas.wuerthinger@oracle.com>
+ * Date:   Sun Jan 21 20:13:48 2024 +0100
+ *
+ * Changes:
+ *   1) Use LinkedBlockingQueue to store partial results, that
+ *   will then be merged into the final map later.
+ *   As different chunks finish at different times, this allows
+ *   to process them as they finish, instead of joining the
+ *   threads sequentially.
+ *     This change seems more useful for the 10k dataset, as the
+ *   runtime difference of each chunk is greater.
+ *   2) Use only half of the available cores (4 threads on an 8-core
+ *   machine) if the file is >= 14GB. This showed much better results in
+ *   my local test, but I only ran it with 200 million rows (because of
+ *   limited RAM), and I have no idea how it will perform on the 1brc HW.
+ */
+public class CalculateAverage_tivrfoa {
+    private static final String FILE = "./measurements.txt";
+    private static LinkedBlockingQueue<List<Result>> partialResultQueue;
+    private static int C = 10_000;
+    private static final int MIN_TEMP = -999;
+    private static final int MAX_TEMP = 999;
+
+    // Per-station aggregate (min/max/sum/count, printed divided by 10) plus the name words used for lookups.
+    private static class Result {
+        long lastNameLong, secondLastNameLong;
+        long[] name;
+        int count;
+        short min, max;
+        long sum;
+
+        private Result() {
+            min = MAX_TEMP; // bounds start inverted so the first recorded value replaces both
+            max = MIN_TEMP;
+        }
+
+        public String toString() {
+            double lowest = round(min / 10.0);
+            double mean = round((sum / 10.0) / count);
+            double highest = round(max / 10.0);
+            return lowest + "/" + mean + "/" + highest;
+        }
+
+        private static double round(double value) {
+            return Math.round(value * 10.0) / 10.0;
+        }
+
+        // Fold another partial aggregate for the same station into this one.
+        private void add(Result other) {
+            min = other.min < min ? other.min : min;
+            max = other.max > max ? other.max : max;
+            sum += other.sum;
+            count += other.count;
+        }
+
+        public String calcName() {
+            ByteBuffer buffer = ByteBuffer.allocate(name.length * Long.BYTES).order(ByteOrder.nativeOrder());
+            buffer.asLongBuffer().put(name);
+            byte[] bytes = buffer.array();
+            int end = 0; // becomes the index of the ';' that terminates the stored name
+            while (bytes[end] != ';') {
+                end++;
+            }
+            return new String(bytes, 0, end, StandardCharsets.UTF_8);
+        }
+    }
+
+    private static final class SolveChunk extends Thread {
+        private final long chunkStart, chunkEnd;
+
+        public SolveChunk(long chunkStart, long chunkEnd) {
+            this.chunkStart = chunkStart;
+            this.chunkEnd = chunkEnd;
+        }
+
+        @Override
+        public void run() {
+            try {
+                partialResultQueue.put(parseLoop(this.chunkStart, this.chunkEnd)); // publish this chunk's partial result
+            }
+            catch (Exception failure) {
+                failure.printStackTrace();
+                System.exit(1);
+            }
+        }
+    }
+
+    public static void main(String[] args) throws Exception {
+        // Without "--worker" this process only re-launches itself as a worker and relays that child's output.
+        boolean isWorker = false;
+        for (String arg : args) {
+            if (arg.equals("--worker")) {
+                isWorker = true;
+                break;
+            }
+        }
+        if (!isWorker) {
+            spawnWorker();
+            return;
+        }
+        final long[] chunks = getSegments(Runtime.getRuntime().availableProcessors());
+        final int workers = chunks.length - 1;
+        partialResultQueue = new LinkedBlockingQueue<>(workers);
+        for (int i = 0; i < workers; i++) {
+            new SolveChunk(chunks[i], chunks[i + 1]).start();
+        }
+
+        // Merge partial results in the order the chunks finish, not in the order they were started.
+        final TreeMap<String, Result> merged = new TreeMap<>();
+        for (int remaining = workers; remaining > 0; remaining--) {
+            accumulateResults(merged, partialResultQueue.take());
+        }
+        System.out.println(merged);
+        System.out.close();
+    }
+
+    private static void spawnWorker() throws IOException {
+        ProcessHandle.Info info = ProcessHandle.current().info();
+        ArrayList<String> workerCommand = new ArrayList<>();
+        info.command().ifPresent(workerCommand::add);
+        info.arguments().ifPresent(args -> workerCommand.addAll(Arrays.asList(args)));
+        workerCommand.add("--worker");
+        new ProcessBuilder()
+                .command(workerCommand)
+                .inheritIO()
+                .redirectOutput(ProcessBuilder.Redirect.PIPE)
+                .start()
+                .getInputStream()
+                .transferTo(System.out);
+    }
+
+    private static void accumulateResults(TreeMap<String, Result> result, List<Result> newResult) {
+        for (Result partial : newResult) {
+            Result existing = result.putIfAbsent(partial.calcName(), partial);
+            if (existing == null) {
+                continue; // first time this station is seen: the partial itself is now the map entry
+            }
+            existing.add(partial);
+        }
+    }
+
+    // Main parse loop: reads rows from chunkStart up to chunkEnd and returns one Result per distinct station name seen.
+    private static ArrayList<Result> parseLoop(long chunkStart, long chunkEnd) {
+        ArrayList<Result> ret = new ArrayList<>(C); // entries in first-seen order; C is only the initial capacity
+        Result[] results = new Result[1 << 17]; // open-addressing table; power-of-two size so indices can be masked
+        Scanner scanner = new Scanner(chunkStart, chunkEnd);
+        long word = scanner.getLong();
+        long pos = findDelimiter(word);
+        while (scanner.hasNext()) {
+            long nameAddress = scanner.pos();
+            long hash = 0;
+
+            // Search for ';', one long at a time. The hash is the XOR of all (masked) name words.
+            if (pos != 0) { // ';' lies within the first 8 bytes: the whole name fits in one word
+                pos = Long.numberOfTrailingZeros(pos) >>> 3; // flag bit index -> byte offset of ';'
+                scanner.add(pos);
+                word = mask(word, pos); // shift out the bytes that follow ';'
+                hash = word;
+
+                int number = scanNumber(scanner); // parsed speculatively, before the table lookup below
+                long nextWord = scanner.getLong(); // first word of the next row, read ahead of the lookup
+                long nextPos = findDelimiter(nextWord);
+
+                Result existingResult = results[hashToIndex(hash, results)];
+                if (existingResult != null && existingResult.lastNameLong == word) { // fast path for one-word names
+                    word = nextWord;
+                    pos = nextPos;
+                    record(existingResult, number);
+                    continue;
+                }
+
+                scanner.setPos(nameAddress + pos); // miss: rewind to the ';' so the general path re-parses the number
+            }
+            else { // no ';' in the first word: the name is longer than 8 bytes
+                scanner.add(8);
+                hash = word;
+                long prevWord = word;
+                word = scanner.getLong();
+                pos = findDelimiter(word);
+                if (pos != 0) { // ';' found in the second word
+                    pos = Long.numberOfTrailingZeros(pos) >>> 3;
+                    scanner.add(pos);
+                    word = mask(word, pos);
+                    hash ^= word;
+
+                    Result existingResult = results[hashToIndex(hash, results)];
+                    if (existingResult != null && existingResult.lastNameLong == word && existingResult.secondLastNameLong == prevWord) { // fast path for two-word names
+                        int number = scanNumber(scanner);
+                        word = scanner.getLong();
+                        pos = findDelimiter(word);
+                        record(existingResult, number);
+                        continue;
+                    }
+                }
+                else { // three or more words: keep folding words into the hash until one contains ';'
+                    scanner.add(8);
+                    hash ^= word;
+                    while (true) {
+                        word = scanner.getLong();
+                        pos = findDelimiter(word);
+                        if (pos != 0) {
+                            pos = Long.numberOfTrailingZeros(pos) >>> 3;
+                            scanner.add(pos);
+                            word = mask(word, pos);
+                            hash ^= word;
+                            break;
+                        }
+                        else {
+                            scanner.add(8);
+                            hash ^= word;
+                        }
+                    }
+                }
+            }
+
+            // General path: the scanner is positioned on the ';'. Save length of name for later.
+            int nameLength = (int) (scanner.pos() - nameAddress);
+            int number = scanNumber(scanner);
+
+            // Final calculation for index into hash table.
+            int tableIndex = hashToIndex(hash, results);
+            outer: while (true) { // probe until the matching entry is found or an empty slot is claimed
+                Result existingResult = results[tableIndex];
+                if (existingResult == null) { // empty slot: create the entry, then fall through to the comparison below
+                    existingResult = newEntry(results, nameAddress, tableIndex, nameLength, scanner);
+                    ret.add(existingResult);
+                }
+                // Check for collision: compare all full 8-byte words of the name first.
+                int i = 0;
+                int namePos = 0;
+                for (; i < nameLength + 1 - 8; i += 8) {
+                    if (namePos >= existingResult.name.length || existingResult.name[namePos++] != scanner.getLongAt(nameAddress + i)) {
+                        tableIndex = (tableIndex + 31) & (results.length - 1); // different name: probe 31 slots further, wrapping around
+                        continue outer;
+                    }
+                }
+
+                int remainingShift = (64 - (nameLength + 1 - i) << 3); // parses as (64 - n) << 3; long shifts use only the low 6 bits, so effectively (64 - 8 * n) mod 64
+                if (((existingResult.lastNameLong ^ (scanner.getLongAt(nameAddress + i) << remainingShift)) == 0)) { // last partial word (including ';') matches too
+                    record(existingResult, number);
+                    break;
+                }
+                else {
+                    // Collision error, try next.
+                    tableIndex = (tableIndex + 31) & (results.length - 1);
+                }
+            }
+
+            word = scanner.getLong(); // prime the first word of the next row for the next iteration
+            pos = findDelimiter(word);
+        }
+        return ret;
+    }
+
+    private static int scanNumber(Scanner scanPtr) {
+        scanPtr.add(1);
+        long numberWord = scanPtr.getLong();
+        int decimalSepPos = Long.numberOfTrailingZeros(~numberWord & 0x10101000);
+        int number = convertIntoNumber(decimalSepPos, numberWord);
+        scanPtr.add((decimalSepPos >>> 3) + 3);
+        return number;
+    }
+
+    private static void record(Result existingResult, int number) {
+        existingResult.count++;
+        existingResult.sum += number;
+        if (existingResult.min > number) {
+            existingResult.min = (short) number;
+        }
+        if (existingResult.max < number) {
+            existingResult.max = (short) number;
+        }
+    }
+
+    private static int hashToIndex(long hash, Result[] results) {
+        int folded = (int) (hash ^ (hash >>> 28)); // mix the upper bits into the low 32 before truncating
+        folded ^= folded >>> 17;
+        return folded & (results.length - 1); // relies on results.length being a power of two
+    }
+
+    private static long mask(long word, long pos) {
+        return word << (56 - (pos << 3)); // 56 - 8 * pos == (7 - pos) * 8: keeps bytes 0..pos, moved to the top of the word
+    }
+
+    // Branch-free conversion of the ASCII number held in numberWord into an int (technique credited to Quan Anh Mai).
+    private static int convertIntoNumber(int decimalSepPos, long numberWord) {
+        int shift = 28 - decimalSepPos; // aligns the digits to the fixed layout described below
+        // signed is -1 if negative, 0 otherwise
+        long signed = (~numberWord << 59) >> 63; // derived from bit 4 of the first byte, which is clear for '-' and set for digits
+        long designMask = ~(signed & 0xFF); // clears the lowest byte (the sign character) when the number is negative
+        // Align the number to a specific position and transform the ascii to digit value
+        long digits = ((numberWord & designMask) << shift) & 0x0F000F0F00L;
+        // Now digits is in the form 0xUU00TTHH00 (UU: units digit, TT: tens digit, HH: hundreds digit)
+        // 0xUU00TTHH00 * (100 * 0x1000000 + 10 * 0x10000 + 1) =
+        // 0x000000UU00TTHH00 + 0x00UU00TTHH000000 * 10 + 0xUU00TTHH00000000 * 100
+        long absValue = ((digits * 0x640a0001) >>> 32) & 0x3FF; // the weighted digit sum sits in the low 10 bits of the upper half
+        long value = (absValue ^ signed) - signed; // negates when signed == -1, leaves the value unchanged when signed == 0
+        return (int) value;
+    }
+
+    private static long findDelimiter(long word) {
+        long input = word ^ 0x3B3B3B3B3B3B3B3BL;
+        long tmp = (input - 0x0101010101010101L) & ~input & 0x8080808080808080L;
+        return tmp;
+    }
+
+    private static Result newEntry(Result[] results, long nameAddress, int hash, int nameLength, Scanner scanner) {
+        Result r = new Result();
+        results[hash] = r;
+        long[] name = new long[(nameLength / Long.BYTES) + 1];
+        int pos = 0;
+        int i = 0;
+        for (; i < nameLength + 1 - Long.BYTES; i += Long.BYTES) {
+            name[pos++] = scanner.getLongAt(nameAddress + i);
+        }
+
+        if (pos > 0) {
+            r.secondLastNameLong = name[pos - 1];
+        }
+
+        int remainingShift = (64 - (nameLength + 1 - i) << 3);
+        long lastWord = (scanner.getLongAt(nameAddress + i) << remainingShift);
+        r.lastNameLong = lastWord;
+        name[pos] = lastWord >> remainingShift;
+        r.name = name;
+        return r;
+    }
+
+    // Maps the input file and splits it into newline-aligned address ranges; returns numberOfChunks + 1 boundaries.
+    private static long[] getSegments(int cpus) throws IOException {
+        try (var fileChannel = FileChannel.open(Path.of(FILE), StandardOpenOption.READ)) {
+            long fileSize = fileChannel.size();
+            boolean below14GB = fileSize < 14_000_000_000L; // long literal: (int) 14e9 saturates to Integer.MAX_VALUE (~2 GiB)
+            if (below14GB) {
+                C = 500;
+            }
+            int numberOfChunks = below14GB ? cpus : Math.max(1, cpus / 2); // at least one chunk, so cpus == 1 cannot divide by zero
+            long segmentSize = (fileSize + numberOfChunks - 1) / numberOfChunks;
+            long[] chunks = new long[numberOfChunks + 1];
+            long mappedAddress = fileChannel.map(FileChannel.MapMode.READ_ONLY, 0, fileSize, java.lang.foreign.Arena.global()).address();
+            chunks[0] = mappedAddress;
+            long endAddress = mappedAddress + fileSize;
+            Scanner s = new Scanner(mappedAddress, endAddress);
+            for (int i = 1; i < numberOfChunks; ++i) {
+                long chunkAddress = mappedAddress + i * segmentSize; // nominal split point; moved below to just past the next '\n'
+                while (chunkAddress < endAddress && (s.getLongAt(chunkAddress++) & 0xFF) != '\n')
+                    ;
+                chunks[i] = Math.min(chunkAddress, endAddress);
+            }
+            chunks[numberOfChunks] = endAddress;
+            return chunks;
+        }
+    }
+
+    private static class Scanner {
+        private static final sun.misc.Unsafe UNSAFE = initUnsafe();
+
+        // Fetches the Unsafe instance reflectively from the private static "theUnsafe" field.
+        private static sun.misc.Unsafe initUnsafe() {
+            try {
+                java.lang.reflect.Field singleton = sun.misc.Unsafe.class.getDeclaredField("theUnsafe");
+                singleton.setAccessible(true);
+                return (sun.misc.Unsafe) singleton.get(sun.misc.Unsafe.class);
+            }
+            catch (NoSuchFieldException | IllegalAccessException e) {
+                throw new RuntimeException(e);
+            }
+        }
+
+        long pos, end; // raw addresses: current read position and exclusive end of the range
+
+        public Scanner(long start, long end) {
+            this.end = end;
+            this.pos = start;
+        }
+
+        long pos() {
+            return this.pos;
+        }
+
+        void setPos(long l) {
+            pos = l;
+        }
+
+        void add(long delta) {
+            pos = pos + delta;
+        }
+
+        boolean hasNext() {
+            return end > pos;
+        }
+
+        long getLong() {
+            return UNSAFE.getLong(this.pos); // 8 bytes at the current position; no bounds check against end
+        }
+
+        long getLongAt(long address) {
+            return UNSAFE.getLong(address); // 8 bytes at an arbitrary address; the cursor is not moved
+        }
+    }
+}
\ No newline at end of file

From 8279aa7560833ef61653a175a8864fc985b5defb Mon Sep 17 00:00:00 2001
From: Dr Ian Preston <157221403+ianopolousfast@users.noreply.github.com>
Date: Sat, 27 Jan 2024 18:43:41 +0000
Subject: [PATCH 152/268] Simplify dedupeStation() (#589)

13.8s locally now.

Co-authored-by: Ian Preston <ianopolous@protonmail.com>
---
 .../CalculateAverage_ianopolousfast.java      | 26 +++++++------------
 1 file changed, 10 insertions(+), 16 deletions(-)

diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_ianopolousfast.java b/src/main/java/dev/morling/onebrc/CalculateAverage_ianopolousfast.java
index ab960dfec..28f62a4dd 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_ianopolousfast.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_ianopolousfast.java
@@ -19,6 +19,7 @@
 import jdk.incubator.vector.VectorOperators;
 import jdk.incubator.vector.VectorSpecies;
 
+import java.io.IOException;
 import java.lang.foreign.Arena;
 import java.lang.foreign.MemorySegment;
 import java.nio.ByteOrder;
@@ -41,7 +42,7 @@
  *
  * Timings on 4 core i7-7500U CPU @ 2.70GHz:
  * average_baseline: 4m48s
- * ianopolous:         14s
+ * ianopolous:         13.8s
 */
 public class CalculateAverage_ianopolousfast {
 
@@ -107,22 +108,15 @@ public static Stat createStation(long start, long end, MemorySegment buffer) {
     public static Stat dedupeStation(long start, long end, long hash, MemorySegment buffer, Stat[] stations) {
         int index = hashToIndex(hash, MAX_STATIONS);
         Stat match = stations[index];
-        if (match == null) {
-            Stat res = createStation(start, end, buffer);
-            stations[index] = res;
-            return res;
-        }
-        else {
-            while (match != null) {
-                if (matchingStationBytes(start, end, buffer, match))
-                    return match;
-                index = (index + 1) % stations.length;
-                match = stations[index];
-            }
-            Stat res = createStation(start, end, buffer);
-            stations[index] = res;
-            return res;
+        while (match != null) {
+            if (matchingStationBytes(start, end, buffer, match))
+                return match;
+            index = (index + 1) % stations.length;
+            match = stations[index];
         }
+        Stat res = createStation(start, end, buffer);
+        stations[index] = res;
+        return res;
     }
 
     static long maskHighBytes(long d, int nbytes) {

From fddf5326cff85cacd9462ab2f0a3158ce2d65308 Mon Sep 17 00:00:00 2001
From: Gunnar Morling <gunnar.morling@googlemail.com>
Date: Sun, 28 Jan 2024 09:29:15 +0100
Subject: [PATCH 153/268] Leaderboard update

---
 README.md | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 34f7a6cff..30b19b9d8 100644
--- a/README.md
+++ b/README.md
@@ -62,6 +62,7 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:03.966 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jincongho.java)| 21.0.1-open | [Jin Cong Ho](https://github.com/jincongho) | uses Unsafe |
 |   | 00:04.066 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_JesseVanRooy.java)| 21.0.1-open | [JesseVanRooy](https://github.com/JesseVanRooy) | uses Unsafe |
 |   | 00:04.154 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_iziamos.java)| 21.0.1-open | [John Ziamos](https://github.com/iziamos) | uses Unsafe |
+|   | 00:04.255 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_tivrfoa.java)| 21.0.2-graal | [tivrfoa](https://github.com/tivrfoa) | GraalVM native binary, uses Unsafe |
 |   | 00:04.684 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_gigiblender.java)| 21.0.1-open | [Florin Blanaru](https://github.com/gigiblender) | uses Unsafe |
 |   | 00:04.741 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_cliffclick.java)| 21.0.1-open | [Cliff Click](https://github.com/cliffclick) | uses Unsafe |
 |   | 00:04.800 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_parkertimmins.java)| 21.0.1-open | [Parker Timmins](https://github.com/parkertimmins) |  |
@@ -69,12 +70,13 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:05.142 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_arjenw.java)| 21.0.1-open | [Arjen Wisse](https://github.com/arjenw) |  |
 |   | 00:05.235 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_unbounded.java)| 21.0.1-open | [unbounded](https://github.com/unbounded) |  |
 |   | 00:05.336 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_plevart.java)| 21.0.1-tem | [Peter Levart](https://github.com/plevart) |  |
-|   | 00:05.400 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ianopolousfast.java)| 21.0.1-open | [Dr Ian Preston](https://github.com/ianopolousfast) |  |
+|   | 00:05.387 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ianopolousfast.java)| 21.0.1-open | [Dr Ian Preston](https://github.com/ianopolousfast) |  |
 |   | 00:05.478 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_obourgain.java)| 21.0.1-open | [Olivier Bourgain](https://github.com/obourgain) | uses Unsafe |
 |   | 00:05.705 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_gamlerhart.java)| 21.0.1-open | [Roman Stoffel](https://github.com/gamlerhart) |  |
 |   | 00:05.709 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_armandino.java)| 21.0.2-graal | [Arman Sharif](https://github.com/armandino) | GraalVM native binary, uses Unsafe |
 |   | 00:05.887 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_charlibot.java)| 21.0.1-graal | [Charlie Evans](https://github.com/charlibot) | uses Unsafe |
 |   | 00:05.960 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_vaidhy.java)| 21.0.1-graal | [Vaidhy Mayilrangam](https://github.com/vaidhy) | uses Unsafe |
+|   | 00:05.971 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_melgenek.java)| 21.0.2-open | [Yevhenii Melnyk](https://github.com/melgenek) |  |
 |   | 00:05.979 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_spullara.java)| 21.0.1-graal | [Sam Pullara](https://github.com/spullara) |  |
 |   | 00:06.166 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_isolgpus.java)| 21.0.1-open | [Jamie Stansfield](https://github.com/isolgpus) |  |
 |   | 00:06.257 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_flippingbits.java)| 21.0.1-graal | [Stefan Sprenger](https://github.com/flippingbits) | uses Unsafe |
@@ -131,7 +133,6 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:15.662 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_semotpan.java)| 21.0.1-open | [Serghei Motpan](https://github.com/semotpan) |  |
 |   | 00:16.063 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_makohn.java)| 21.0.1-open | [Marek Kohn](https://github.com/makohn) |  |
 |   | 00:16.953 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_gauravdeshmukh.java)| 21.0.1-open | [Gaurav Anantrao Deshmukh](https://github.com/gauravdeshmukh) |  |
-|   | 00:17.179 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jgrateron.java)| 21.0.1-open | [Jairo Graterón](https://github.com/jgrateron) |  |
 |   | 00:17.490 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_kgeri.java)| 21.0.1-open | [Gergely Kiss](https://github.com/kgeri) |  |
 |   | 00:17.255 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_tkosachev.java)| 21.0.1-open | [tkosachev](https://github.com/tkosachev) |  |
 |   | 00:17.520 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_faridtmammadov.java)| 21.0.1-open | [Farid](https://github.com/faridtmammadov) |  |
@@ -145,6 +146,7 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:19.357 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_truelive.java)| 21.0.1-graalce | [Roman Schweitzer](https://github.com/truelive) |  |
 |   | 00:20.691 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_Kidlike.java)| 21.0.1-graal | [Kidlike](https://github.com/Kidlike) | GraalVM native binary |
 |   | 00:21.989 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_couragelee.java)| 21.0.1-open | [couragelee](https://github.com/couragelee) |  |
+|   | 00:22.188 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jgrateron.java)| 21.0.1-open | [Jairo Graterón](https://github.com/jgrateron) |  |
 |   | 00:22.457 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_rby.java)| 21.0.1-open | [Ramzi Ben Yahya](https://github.com/rby) |  |
 |   | 00:22.471 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_0xshivamagarwal.java)| 21.0.1-open | [Shivam Agarwal](https://github.com/0xshivamagarwal) |  |
 |   | 00:24.986 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_kumarsaurav123.java)| 21.0.1-open | [kumarsaurav123](https://github.com/kumarsaurav123) |  |

From 9dde50872f6a2c3ca33c5d3b088b963e14c99798 Mon Sep 17 00:00:00 2001
From: Gunnar Morling <gunnar.morling@googlemail.com>
Date: Sun, 28 Jan 2024 09:46:25 +0100
Subject: [PATCH 154/268] Leaderboard update;

- Had used wrong link for Subrahmanyam non-idiomatic at first
- Adding 10K key set eval using Subrahmanyam non-idiomatic
---
 README.md | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 30b19b9d8..ccae6c3ce 100644
--- a/README.md
+++ b/README.md
@@ -49,7 +49,7 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:02.575 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_merykittyunsafe.java)| 21.0.1-open | [Quan Anh Mai](https://github.com/merykitty) | uses Unsafe |
 |   | 00:02.984 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_yourwass.java)| 21.0.1-open | [yourwass](https://github.com/yourwass) | uses Unsafe |
 |   | 00:03.258 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_merykitty.java)| 21.0.1-open | [Quan Anh Mai](https://github.com/merykitty) |  |
-|   | 00:03.298 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_vemana.java)| 21.0.1-graal | [Subrahmanyam (non-idiomatic)](https://github.com/vemana) | uses Unsafe |
+|   | 00:03.298 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_vemanaNonIdiomatic.java)| 21.0.1-graal | [Subrahmanyam (non-idiomatic)](https://github.com/vemana) | uses Unsafe |
 |   | 00:03.376 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_mtopolnik.java)| 21.0.1-graal | [Marko Topolnik](https://github.com/mtopolnik) | uses Unsafe |
 |   | 00:03.431 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_roman_r_m.java)| 21.0.1-graal | [Roman Musin](https://github.com/roman-r-m) | GraalVM native binary |
 |   | 00:03.518 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_JamalMulla.java)| 21.0.1-graal | [Jamal Mulla](https://github.com/JamalMulla) | GraalVM native binary, uses Unsafe |
@@ -239,6 +239,7 @@ using eight cores on the evaluation machine:
 | 1 | 00:02.741 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java)| 21.0.2-graal | [Artsiom Korzun](https://github.com/artsiomkorzun) | GraalVM native binary, uses Unsafe |
 | 2 | 00:04.001 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_royvanrijn.java)| 21.0.2-graal | [Roy van Rijn](https://github.com/royvanrijn) | GraalVM native binary, uses Unsafe |
 | 3 | 00:04.516 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_thomaswue.java)| 21.0.2-graal | [Thomas Wuerthinger](https://github.com/thomaswue), [Quan Anh Mai](https://github.com/merykitty), [Alfonso² Peterssen](https://github.com/mukel) | GraalVM native binary, uses Unsafe |
+|   | 00:04.816 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_vemanaNonIdiomatic.java)| 21.0.1-graal | [Subrahmanyam](https://github.com/vemana) | uses Unsafe |
 |   | 00:04.848 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jerrinot.java)| 21.0.1-graal | [Jaromir Hamala](https://github.com/jerrinot) | uses Unsafe |
 |   | 00:05.127 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_mtopolnik.java)| 21.0.1-graal | [Marko Topolnik](https://github.com/mtopolnik) | uses Unsafe |
 |   | 00:05.614 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_gonix.java)| 21.0.1-open | [gonix](https://github.com/gonix) |  |
@@ -253,13 +254,11 @@ using eight cores on the evaluation machine:
 |   | 00:15.896 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jincongho.java)| 21.0.1-open | [Jin Cong Ho](https://github.com/jincongho) | uses Unsafe |
 |   | 00:18.064 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_hundredwatt.java)| 21.0.1-graal | [Jason Nochlin](https://github.com/hundredwatt) |  |
 |   | 00:20.374 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_roman-r-m.java)| 21.0.1-graal | [Roman Musin](https://github.com/roman-r-m) | GraalVM native binary |
-|   | 04:11.062 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_vemana.java)| 21.0.1-graal | [Subrahmanyam](https://github.com/vemana) |  |
 |   | ---       | | | | |
 |   | DNF | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_JesseVanRooy.java)| 21.0.1-open | [JesseVanRooy](https://github.com/JesseVanRooy) | Incorrect output |
 |   | DNF | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_iziamos.java)| 21.0.1-open | [John Ziamos](https://github.com/iziamos) | Didn't complete in 60 sec |
 |   | DNF | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_zerninv.java)| 21.0.1-graal | [zerninv](https://github.com/zerninv) | Seg fault |
 
-
 ## Prerequisites
 
 [Java 21](https://openjdk.org/projects/jdk/21/) must be installed on your system.

From 3e208be74192c5005282a0eb7ad3adeaed3b17cc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Serkan=20=C3=96ZAL?= <serkanozal86@gmail.com>
Date: Sun, 28 Jan 2024 11:53:09 +0300
Subject: [PATCH 155/268] serkan-ozal: Initial impl (#553)

* Initial impl

* Fix bad file descriptor error in the `calculate_average_serkan-ozal.sh`

* Disable Epsilon GC and rely on default GC. Because apparently, JIT and Epsilon GC don't play well together in the eval machine for short lived Vector API's `ByteVector` objects

* Take care of byte order before processing key length with bit shift operators

* Fix key equality check for long keys
---
 calculate_average_serkan-ozal.sh              |  30 +
 .../onebrc/CalculateAverage_serkan_ozal.java  | 674 ++++++++++++++++++
 2 files changed, 704 insertions(+)
 create mode 100755 calculate_average_serkan-ozal.sh
 create mode 100644 src/main/java/dev/morling/onebrc/CalculateAverage_serkan_ozal.java

diff --git a/calculate_average_serkan-ozal.sh b/calculate_average_serkan-ozal.sh
new file mode 100755
index 000000000..a903c1d39
--- /dev/null
+++ b/calculate_average_serkan-ozal.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+JAVA_OPTS="--enable-preview --enable-native-access=ALL-UNNAMED --add-modules=jdk.incubator.vector"
+JAVA_OPTS="$JAVA_OPTS -XX:-TieredCompilation -XX:MaxInlineSize=10000 -XX:InlineSmallCode=10000 -XX:FreqInlineSize=10000"
+JAVA_OPTS="$JAVA_OPTS -Djdk.incubator.vector.VECTOR_ACCESS_OOB_CHECK=0"
+#JAVA_OPTS="$JAVA_OPTS -XX:+UnlockExperimentalVMOptions -XX:+UseEpsilonGC -Xms256m -Xmx256m -XX:+AlwaysPreTouch"
+if [[ ! "$(uname -s)" = "Darwin" ]]; then
+  JAVA_OPTS="$JAVA_OPTS -XX:+UseTransparentHugePages"
+fi
+
+#echo "Process started at $(date +%s%N | cut -b1-13)"
+eval "exec 3< <({ CLOSE_STDOUT_ON_RESULT=true USE_SHARED_ARENA=true java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_serkan_ozal; })"
+read <&3 result
+echo -e "$result"
+#echo "Process finished at $(date +%s%N | cut -b1-13)"
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_serkan_ozal.java b/src/main/java/dev/morling/onebrc/CalculateAverage_serkan_ozal.java
new file mode 100644
index 000000000..b02538358
--- /dev/null
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_serkan_ozal.java
@@ -0,0 +1,674 @@
+/*
+ *  Copyright 2023 The original authors
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package dev.morling.onebrc;
+
+import jdk.incubator.vector.ByteVector;
+import jdk.incubator.vector.VectorOperators;
+import jdk.incubator.vector.VectorSpecies;
+import sun.misc.Unsafe;
+
+import java.io.IOException;
+import java.io.RandomAccessFile;
+import java.lang.foreign.Arena;
+import java.lang.foreign.MemorySegment;
+import java.lang.reflect.Field;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.nio.channels.FileChannel;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.TreeMap;
+import java.util.concurrent.Callable;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+import java.util.concurrent.ThreadFactory;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.locks.Lock;
+import java.util.concurrent.locks.ReentrantLock;
+
+/**
+ * @author serkan-ozal
+ */
+public class CalculateAverage_serkan_ozal {
+
+    private static final String FILE = "./measurements.txt";
+
+    private static final VectorSpecies<Byte> BYTE_SPECIES = ByteVector.SPECIES_PREFERRED.length() >= 16
+            // Since majority (99%) of the city names <= 16 bytes, according to my experiments,
+            // 128 bit (16 byte) vectors perform better than 256 bit (32 byte) or 512 bit (64 byte) vectors
+            // even though supported by platform.
+            ? ByteVector.SPECIES_128
+            : ByteVector.SPECIES_64;
+    private static final int BYTE_SPECIES_SIZE = BYTE_SPECIES.vectorByteSize();
+
+    private static final ByteOrder NATIVE_BYTE_ORDER = ByteOrder.nativeOrder();
+    private static final char NEW_LINE_SEPARATOR = '\n';
+    private static final char KEY_VALUE_SEPARATOR = ';';
+    private static final int MAX_LINE_LENGTH = 128;
+
+    // Get configurations
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    private static final boolean VERBOSE = getBooleanConfig("VERBOSE", false);
+    private static final int THREAD_COUNT = getIntegerConfig("THREAD_COUNT", Runtime.getRuntime().availableProcessors());
+    private static final boolean USE_VTHREADS = getBooleanConfig("USE_VTHREADS", false);
+    private static final int VTHREAD_COUNT = getIntegerConfig("VTHREAD_COUNT", 1024);
+    private static final int REGION_COUNT = getIntegerConfig("REGION_COUNT", -1);
+    private static final boolean USE_SHARED_ARENA = getBooleanConfig("USE_SHARED_ARENA", false);
+    private static final int MAP_CAPACITY = getIntegerConfig("MAP_CAPACITY", 1 << 17);
+    private static final boolean CLOSE_STDOUT_ON_RESULT = getBooleanConfig("CLOSE_STDOUT_ON_RESULT", false);
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+    // My dear old friend Unsafe
+    private static final Unsafe U;
+
+    static {
+        try {
+            Field f = Unsafe.class.getDeclaredField("theUnsafe");
+            f.setAccessible(true);
+            U = (Unsafe) f.get(null);
+        }
+        catch (Exception e) {
+            throw new IllegalStateException(e);
+        }
+    }
+
+    public static void main(String[] args) throws Exception {
+        long start = System.currentTimeMillis();
+        if (VERBOSE) {
+            System.out.println("Processing started at " + start);
+            System.out.println("Vector byte size: " + BYTE_SPECIES.vectorByteSize());
+            System.out.println("Use shared memory arena: " + USE_SHARED_ARENA);
+            if (USE_VTHREADS) {
+                System.out.println("Virtual thread count: " + VTHREAD_COUNT);
+            }
+            else {
+                System.out.println("Thread count: " + THREAD_COUNT);
+            }
+            System.out.println("Map capacity: " + MAP_CAPACITY);
+        }
+
+        int concurrency = USE_VTHREADS ? VTHREAD_COUNT : THREAD_COUNT;
+        int regionCount = REGION_COUNT > 0 ? REGION_COUNT : concurrency;
+        ByteBuffer lineBuffer = getByteBuffer(MAX_LINE_LENGTH);
+        Result result = new Result();
+
+        RandomAccessFile file = new RandomAccessFile(FILE, "r");
+        FileChannel fc = file.getChannel();
+        Arena arena = USE_SHARED_ARENA ? Arena.ofShared() : null;
+        try {
+            long fileSize = fc.size();
+            long regionSize = fileSize / regionCount;
+            long startPos = 0;
+            ExecutorService executor = USE_VTHREADS
+                    ? Executors.newVirtualThreadPerTaskExecutor()
+                    : Executors.newFixedThreadPool(concurrency, new RegionProcessorThreadFactory());
+
+            // Split whole file into regions and start region processors to handle those regions
+            List<Future<Response>> futures = new ArrayList<>(regionCount);
+            for (int i = 0; i < regionCount; i++) {
+                long endPos = Math.min(fileSize, startPos + regionSize);
+                // Lines might split into different regions.
+                // If so, move back to the line starting at the end of previous region
+                long closestLineEndPos = (i < regionCount - 1)
+                        ? findClosestLineEnd(fc, endPos, lineBuffer)
+                        : fileSize;
+                Request request = new Request(fc, arena, startPos, closestLineEndPos, result);
+                RegionProcessor regionProcessor = createRegionProcessor(request);
+                Future<Response> future = executor.submit(regionProcessor);
+                futures.add(future);
+                startPos = closestLineEndPos;
+            }
+
+            // Wait processors to complete
+            for (Future<Response> future : futures) {
+                future.get();
+            }
+
+            long finish = System.currentTimeMillis();
+            if (VERBOSE) {
+                System.out.println("Processing completed at " + finish);
+                System.out.println("Processing completed in " + (finish - start) + " milliseconds");
+            }
+
+            // Print result to stdout
+            result.print();
+
+            if (CLOSE_STDOUT_ON_RESULT) {
+                // After printing result, close stdout.
+                // So parent process can complete without waiting this process completed.
+                // Saves a few hundred milliseconds caused by unmap.
+                System.out.close();
+            }
+        }
+        finally {
+            // Close memory arena if it is managed globally here (shared arena)
+            if (arena != null) {
+                arena.close();
+            }
+            fc.close();
+            if (VERBOSE) {
+                long finish = System.currentTimeMillis();
+                System.out.println("All completed at " + finish);
+                System.out.println("All Completed in " + ((finish - start)) + " milliseconds");
+            }
+        }
+    }
+
+    private static boolean getBooleanConfig(String envVarName, boolean defaultValue) {
+        String envVarValue = System.getenv(envVarName);
+        if (envVarValue == null) {
+            return defaultValue;
+        }
+        else {
+            return Boolean.parseBoolean(envVarValue);
+        }
+    }
+
+    private static int getIntegerConfig(String envVarName, int defaultValue) {
+        String envVarValue = System.getenv(envVarName);
+        if (envVarValue == null) {
+            return defaultValue;
+        }
+        else {
+            return Integer.parseInt(envVarValue);
+        }
+    }
+
+    private static ByteBuffer getByteBuffer(int size) {
+        ByteBuffer bb = ByteBuffer.allocateDirect(size);
+        bb.order(NATIVE_BYTE_ORDER);
+        return bb;
+    }
+
+    private static long findClosestLineEnd(FileChannel fc, long endPos, ByteBuffer lineBuffer) throws IOException {
+        long lineCheckStartPos = Math.max(0, endPos - MAX_LINE_LENGTH);
+        lineBuffer.rewind();
+        fc.read(lineBuffer, lineCheckStartPos);
+        int i = MAX_LINE_LENGTH;
+        while (lineBuffer.get(i - 1) != NEW_LINE_SEPARATOR) {
+            i--;
+        }
+        return lineCheckStartPos + i;
+    }
+
+    private static RegionProcessor createRegionProcessor(Request request) {
+        return new RegionProcessor(request);
+    }
+
+    private static class RegionProcessorThreadFactory implements ThreadFactory {
+
+        @Override
+        public Thread newThread(Runnable r) {
+            Thread t = new Thread(r);
+            t.setDaemon(true);
+            t.setPriority(Thread.MAX_PRIORITY);
+            return t;
+        }
+
+    }
+
+    /**
+     * Region processor
+     */
+    private static class RegionProcessor implements Callable<Response> {
+
+        private final FileChannel fc;
+        private final Arena arena;
+        private final long start;
+        private final long end;
+        private final long size;
+        private final OpenMap map;
+        private final Result result;
+
+        private RegionProcessor(Request request) {
+            this.fc = request.fileChannel;
+            this.arena = request.arena;
+            this.start = request.start;
+            this.end = request.end;
+            this.size = end - start;
+            this.map = new OpenMap();
+            this.result = request.result;
+        }
+
+        @Override
+        public Response call() throws Exception {
+            if (VERBOSE) {
+                System.out.println("[Processor-" + Thread.currentThread().getName() + "] Processing started at " + System.currentTimeMillis());
+            }
+            try {
+                processRegion();
+                return new Response(map);
+            }
+            finally {
+                if (VERBOSE) {
+                    System.out.println("[Processor-" + Thread.currentThread().getName() + "] Processing finished at " + System.currentTimeMillis());
+                }
+            }
+        }
+
+        private void processRegion() throws Exception {
+            boolean arenaGiven = arena != null;
+            // Maps this processor's region, processes it and merges the local map into the shared result. If no shared global memory arena is given, a confined local arena is created and used
+            Arena a = arenaGiven ? arena : Arena.ofConfined();
+            try {
+                MemorySegment region = fc.map(FileChannel.MapMode.READ_ONLY, start, size, a);
+
+                doProcessRegion(region);
+                if (VERBOSE) {
+                    System.out.println("[Processor-" + Thread.currentThread().getName() + "] Region processed at " + System.currentTimeMillis());
+                }
+
+                // Some threads/processors might finish slightly before others.
+                // So, instead of leaving their cores idle, each one merges its own results here.
+
+                // If no other processor is merging its results right now, merge immediately.
+                // Otherwise (another thread/processor already holds the merge lock),
+                // close the current processor's own local memory arena first (if no shared global arena is used)
+                // and then block until the lock is free to merge its own results.
+
+                boolean merged = result.tryMergeInto(map);
+                if (VERBOSE && merged) {
+                    System.out.println("[Processor-" + Thread.currentThread().getName() + "] Result merged at " + System.currentTimeMillis());
+                }
+                if (!merged) {
+                    if (!arenaGiven) {
+                        a.close();
+                        a = null; // marks the local arena as already closed for the "finally" block below
+                        if (VERBOSE) {
+                            System.out.println("[Processor-" + Thread.currentThread().getName() + "] Arena closed at " + System.currentTimeMillis());
+                        }
+                    }
+                    result.mergeInto(map);
+                    if (VERBOSE) {
+                        System.out.println("[Processor-" + Thread.currentThread().getName() + "] Result merged at " + System.currentTimeMillis());
+                    }
+                }
+            }
+            finally {
+                // If the local memory arena is managed here and has not been closed yet, close it now
+                if (!arenaGiven && a != null) {
+                    a.close();
+                    if (VERBOSE) {
+                        System.out.println("[Processor-" + Thread.currentThread().getName() + "] Arena closed at " + System.currentTimeMillis());
+                    }
+                }
+            }
+        }
+
+        private void doProcessRegion(MemorySegment region) {
+            final long regionAddress = region.address();
+            final long regionSize = region.byteSize();
+            final int vectorSize = BYTE_SPECIES.vectorByteSize();
+            final long regionMainLimit = regionSize - MAX_LINE_LENGTH;
+
+            int regionPtr;
+
+            // Read and process region - main
+            for (regionPtr = 0; regionPtr < regionMainLimit;) {
+                regionPtr = doProcessLine(region, regionAddress, vectorSize, regionPtr);
+            }
+
+            // Read and process region - tail
+            for (int i = regionPtr, j = regionPtr; i < regionSize;) {
+                byte b = U.getByte(regionAddress + i);
+                if (b == KEY_VALUE_SEPARATOR) {
+                    long baseOffset = map.putKey(null, regionAddress, j, i - j);
+                    i = extractValue(regionAddress, i + 1, map, baseOffset);
+                    j = i;
+                }
+                else {
+                    i++;
+                }
+            }
+        }
+
+        private int doProcessLine(MemorySegment region, long regionAddress, int vectorSize, int i) {
+            // Processes the single line starting at region index "i". First step: find the key/value separator
+            ////////////////////////////////////////////////////////////////////////////////////////////////////////
+            int keyStartIdx = i;
+
+            // Vectorized search for key/value separator
+            ByteVector keyVector = ByteVector.fromMemorySegment(BYTE_SPECIES, region, i, NATIVE_BYTE_ORDER);
+            int keyValueSepOffset = keyVector.compare(VectorOperators.EQ, KEY_VALUE_SEPARATOR).firstTrue();
+            // "firstTrue()" yields the lane count when nothing matched, so equality means the separator is NOT in the first vector
+            if (keyValueSepOffset == vectorSize) {
+                i += vectorSize;
+                keyValueSepOffset = 0; // "i" itself is advanced onto the separator by the scan below
+                for (; U.getByte(regionAddress + i) != KEY_VALUE_SEPARATOR; i++)
+                    ;
+                // I have tried a vectorized search for the key/value separator in the remaining part,
+                // but since the majority (99%) of the city names are <= 16 bytes
+                // and the few longer city names (length > 16 and <= 32) are not close to 32 bytes,
+                // a byte-by-byte search is better in terms of performance (according to my experiments) and simplicity.
+            }
+            i += keyValueSepOffset;
+            int keyLength = i - keyStartIdx;
+            i++; // skip the separator, "i" now points at the first byte of the value
+            ////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+            // Put key and get map offset to put value
+            long baseOffset = map.putKey(keyVector, regionAddress, keyStartIdx, keyLength);
+
+            // Extract value, put it into map and return next position in the region to continue processing from there
+            return extractValue(regionAddress, i, map, baseOffset);
+        }
+
+    }
+
+    // Branch-free parse of the value at "idx"; stores it into the map entry at "baseOffset" and returns the index after the line. Credits: merykitty
+    private static int extractValue(long regionAddress, int idx, OpenMap map, long baseOffset) {
+        long word = U.getLong(regionAddress + idx); // 8 raw bytes starting at the first character of the value
+        if (NATIVE_BYTE_ORDER == ByteOrder.BIG_ENDIAN) {
+            word = Long.reverseBytes(word); // make the first character the lowest byte, as on little-endian
+        }
+
+        // Parse and extract value (presumably relies on the value format -?d?d.d, i.e. '.' in byte 1, 2 or 3 - confirm)
+        int decimalSepPos = Long.numberOfTrailingZeros(~word & 0x10101000); // bit 4 is set in ASCII digits, clear in '.': yields 12, 20 or 28
+        int shift = 28 - decimalSepPos; // left shift that moves the '.' into byte 3
+        long signed = (~word << 59) >> 63; // -1 if bit 4 of the first byte is clear (as in '-'), otherwise 0
+        long designMask = ~(signed & 0xFF); // clears the first byte (the '-') for negative values
+        long digits = ((word & designMask) << shift) & 0x0F000F0F00L; // low nibbles of bytes 1, 2 and 4 = the decimal digits
+        long absValue = ((digits * 0x640a0001) >>> 32) & 0x3FF; // weights 100 / 10 / 1 add up at bit 32: value in tenths
+        int value = (int) ((absValue ^ signed) - signed); // negate when "signed" is -1, no-op when it is 0
+
+        // Put extracted value into map
+        map.putValue(baseOffset, value);
+
+        // Return new position: byte index of '.' plus 3 skips '.', the fraction digit and the new line
+        return idx + (decimalSepPos >>> 3) + 3;
+    }
+
+    /**
+     * Region processor request
+     */
+    private static final class Request {
+
+        private final FileChannel fileChannel;
+        private final Arena arena;
+        private final long start;
+        private final long end;
+        private final Result result;
+
+        private Request(FileChannel fileChannel, Arena arena, long start, long end, Result result) {
+            this.fileChannel = fileChannel;
+            this.arena = arena;
+            this.start = start;
+            this.end = end;
+            this.result = result;
+        }
+
+    }
+
+    /**
+     * Region processor response
+     */
+    private static final class Response {
+
+        private final OpenMap map;
+
+        private Response(OpenMap map) {
+            this.map = map;
+        }
+
+    }
+
+    /**
+     * Aggregated result of one key (city); min/max/sum are integers in tenths, rendered as "min/mean/max" by toString()
+     */
+    private static final class KeyResult {
+
+        private int count;
+        private int minValue;
+        private int maxValue;
+        private long sum;
+
+        private KeyResult(int count, int minValue, int maxValue, long sum) {
+            this.count = count;
+            this.minValue = minValue;
+            this.maxValue = maxValue;
+            this.sum = sum;
+        }
+
+        private void merge(KeyResult result) { // folds another partial result for the same key into this one
+            count += result.count;
+            minValue = Math.min(minValue, result.minValue);
+            maxValue = Math.max(maxValue, result.maxValue);
+            sum += result.sum;
+        }
+
+        @Override
+        public String toString() { // mean uses "count * 10.0": the int product "count * 10" overflows beyond ~214M rows per key
+            return (minValue / 10.0) + "/" + round(sum / (count * 10.0)) + "/" + (maxValue / 10.0);
+        }
+
+        private double round(double value) { // rounds to one decimal place
+            return Math.round(value * 10.0) / 10.0;
+        }
+
+    }
+
+    /**
+     * Global result
+     */
+    private static final class Result {
+
+        private final Lock lock = new ReentrantLock();
+        private final Map<String, KeyResult> resultMap;
+
+        private Result() {
+            this.resultMap = new TreeMap<>();
+        }
+
+        private boolean tryMergeInto(OpenMap map) {
+            // Use lock (not "synchronized" block) to be virtual threads friendly
+            if (!lock.tryLock()) {
+                return false;
+            }
+            try {
+                map.merge(this.resultMap);
+                return true;
+            }
+            finally {
+                lock.unlock();
+            }
+        }
+
+        private void mergeInto(OpenMap map) {
+            // Use lock (not "synchronized" block) to be virtual threads friendly
+            lock.lock();
+            try {
+                map.merge(this.resultMap);
+            }
+            finally {
+                lock.unlock();
+            }
+        }
+
+        private void print() {
+            System.out.println(resultMap);
+        }
+
+    }
+
+    private static final class OpenMap {
+
+        // Layout
+        // ================================
+        // 0 : 4 bytes - count
+        // 4 : 2 bytes - min value
+        // 6 : 2 bytes - max value
+        // 8 : 8 bytes - value sum
+        // 16 : 4 bytes - key size
+        // 20 : 4 bytes - padding
+        // 24 : 100 bytes - key
+        // 124 : 4 bytes - padding
+        // ================================
+        // 128 bytes - total
+
+        private static final int ENTRY_SIZE = 128;
+        private static final int COUNT_OFFSET = 0;
+        private static final int MIN_VALUE_OFFSET = 4;
+        private static final int MAX_VALUE_OFFSET = 6;
+        private static final int VALUE_SUM_OFFSET = 8;
+        private static final int KEY_SIZE_OFFSET = 16;
+        private static final int KEY_OFFSET = 24;
+
+        private static final int ENTRY_HASH_MASK = MAP_CAPACITY - 1;
+        private static final int MAP_SIZE = ENTRY_SIZE * MAP_CAPACITY;
+        private static final int ENTRY_MASK = MAP_SIZE - 1;
+
+        private final byte[] data;
+
+        private OpenMap() {
+            this.data = new byte[MAP_SIZE];
+        }
+
+        // Credits: merykitty
+        private static int calculateKeyHash(long address, int keyLength) {
+            int seed = 0x9E3779B9;
+            int rotate = 5;
+            int x, y;
+            if (keyLength >= Integer.BYTES) {
+                x = U.getInt(address);
+                y = U.getInt(address + keyLength - Integer.BYTES);
+            }
+            else {
+                x = U.getByte(address);
+                y = U.getByte(address + keyLength - Byte.BYTES);
+            }
+            return (Integer.rotateLeft(x * seed, rotate) ^ y) * seed;
+        }
+
+        private long putKey(ByteVector keyVector, long regionAddress, long keyStartIdx, int keyLength) {
+            long keyStartAddress = regionAddress + keyStartIdx;
+            // Finds or creates the entry of the key and returns its Unsafe offset into "data". First: calculate hash of key
+            int keyHash = calculateKeyHash(keyStartAddress, keyLength);
+            // and get the position of the entry in the linear map based on calculated hash
+            int idx = keyHash & ENTRY_HASH_MASK;
+
+            // Start at the calculated position and probe linearly (wrapping around via ENTRY_MASK) until the key
+            // or an empty slot is found. NOTE(review): both masks assume MAP_CAPACITY is a power of two - confirm
+            // TODO Prevent infinite loop if all the slots are in use for other keys
+            for (long baseOffset = Unsafe.ARRAY_BYTE_BASE_OFFSET + (idx * ENTRY_SIZE);; baseOffset = (baseOffset + ENTRY_SIZE) & ENTRY_MASK) {
+                int keyStartOffset = (int) baseOffset + KEY_OFFSET;
+                int keySize = U.getInt(data, baseOffset + KEY_SIZE_OFFSET);
+                // Check whether the current slot is empty (a stored key size of 0 means no key has been inserted yet)
+                if (keySize == 0) {
+                    // Initialize entry slot for new key
+                    U.putShort(data, baseOffset + MIN_VALUE_OFFSET, Short.MAX_VALUE);
+                    U.putShort(data, baseOffset + MAX_VALUE_OFFSET, Short.MIN_VALUE);
+                    U.putInt(data, baseOffset + KEY_SIZE_OFFSET, keyLength);
+                    U.copyMemory(null, keyStartAddress, data, keyStartOffset, keyLength);
+                    return baseOffset;
+                }
+                // The slot is occupied, so check whether it belongs to this key.
+                // It is a match only if both the key length and the key bytes are equal; then return this slot's offset.
+                // Otherwise (another key landed on this slot), continue probing with the next slot.
+                if (keySize == keyLength && keysEqual(keyVector, keyStartAddress, keyLength, keyStartOffset)) {
+                    return baseOffset;
+                }
+            }
+        }
+
+        private boolean keysEqual(ByteVector keyVector, long keyStartAddress, int keyLength, int keyStartOffset) {
+            int keyCheckIdx = 0; // byte index from which the word-wise comparison below starts
+            if (keyVector != null) {
+                // Use vectorized search for the comparison of keys.
+                // Since the majority of the city names are >= 8 bytes and <= 16 bytes,
+                // this way is more efficient (according to my experiments) than any other comparisons (byte by byte or 2 longs).
+                int keyCheckLength = Math.min(BYTE_SPECIES_SIZE, keyLength);
+                ByteVector entryKeyVector = ByteVector.fromArray(BYTE_SPECIES, data, keyStartOffset - Unsafe.ARRAY_BYTE_BASE_OFFSET);
+                long eqMask = keyVector.compare(VectorOperators.EQ, entryKeyVector).toLong();
+                int eqCount = Long.numberOfTrailingZeros(~eqMask); // number of leading lanes that are equal
+                if (eqCount < keyCheckLength) {
+                    return false;
+                }
+                if (keyCheckLength == keyLength) {
+                    return true;
+                }
+                keyCheckIdx = BYTE_SPECIES_SIZE;
+            }
+
+            // Compare remaining parts of the keys
+
+            // The key length is a plain int value (not raw bytes read from memory), so it must NOT be
+            // byte-swapped on big-endian platforms: swapping would turn e.g. 20 into 0x14000000
+            // and make the loop below compare (and read) far beyond the end of the key.
+            int normalizedKeyLength = keyLength;
+
+            int alignedKeyLength = normalizedKeyLength & 0xFFFFFFF8;
+            int i;
+            for (i = keyCheckIdx; i < alignedKeyLength; i += Long.BYTES) {
+                if (U.getLong(keyStartAddress + i) != U.getLong(data, keyStartOffset + i)) {
+                    return false;
+                }
+            }
+
+            long wordA = U.getLong(keyStartAddress + i);
+            long wordB = U.getLong(data, keyStartOffset + i);
+            if (NATIVE_BYTE_ORDER == ByteOrder.BIG_ENDIAN) {
+                wordA = Long.reverseBytes(wordA);
+                wordB = Long.reverseBytes(wordB);
+            }
+            int halfShift = (Long.BYTES - (normalizedKeyLength & 0x00000007)) << 2; // half of the bits to drop: a single shift by 64 would be a no-op
+            long mask = (0xFFFFFFFFFFFFFFFFL >>> halfShift) >> halfShift; // keeps only the low (keyLength % 8) bytes
+            wordA = wordA & mask;
+            // No need to mask "wordB" (word from key in the map), because it is already padded with 0s
+            return wordA == wordB;
+        }
+
+        private void putValue(long baseOffset, int value) {
+            U.putInt(data, baseOffset + COUNT_OFFSET,
+                    U.getInt(data, baseOffset + COUNT_OFFSET) + 1);
+            U.putShort(data, baseOffset + MIN_VALUE_OFFSET,
+                    (short) Math.min(value, U.getShort(data, baseOffset + MIN_VALUE_OFFSET)));
+            U.putShort(data, baseOffset + MAX_VALUE_OFFSET,
+                    (short) Math.max(value, U.getShort(data, baseOffset + MAX_VALUE_OFFSET)));
+            U.putLong(data, baseOffset + VALUE_SUM_OFFSET,
+                    value + U.getLong(data, baseOffset + VALUE_SUM_OFFSET));
+        }
+
+        private void merge(Map<String, KeyResult> resultMap) {
+            // Merge this local map into global result map
+            for (int i = 0; i < MAP_SIZE; i += ENTRY_SIZE) {
+                int baseOffset = Unsafe.ARRAY_BYTE_BASE_OFFSET + i;
+                int keyLength = U.getInt(data, baseOffset + KEY_SIZE_OFFSET);
+                if (keyLength == 0) {
+                    // No entry is available for this index, so continue iterating
+                    continue;
+                }
+                String key = new String(data, i + KEY_OFFSET, keyLength, StandardCharsets.UTF_8);
+                int count = U.getInt(data, baseOffset + COUNT_OFFSET);
+                short minValue = U.getShort(data, baseOffset + MIN_VALUE_OFFSET);
+                short maxValue = U.getShort(data, baseOffset + MAX_VALUE_OFFSET);
+                long sum = U.getLong(data, baseOffset + VALUE_SUM_OFFSET);
+                KeyResult result = new KeyResult(count, minValue, maxValue, sum);
+                KeyResult existingResult = resultMap.get(key);
+                if (existingResult == null) {
+                    resultMap.put(key, result);
+                }
+                else {
+                    existingResult.merge(result);
+                }
+            }
+        }
+
+    }
+
+}

From 936fc1da5493849d2aaf7f71f00f7f81067b6129 Mon Sep 17 00:00:00 2001
From: Alberto Venturini <aventurini@gmail.com>
Date: Sun, 28 Jan 2024 11:02:42 +0200
Subject: [PATCH 156/268] Second version by albertoventurini (#609)

* Contribution by albertoventurini

* Use byte arrays of size 2^20

---------

Co-authored-by: Alberto Venturini <alberto.venturini@accso.de>
---
 calculate_average_albertoventurini.sh         |  2 +-
 .../CalculateAverage_albertoventurini.java    | 89 ++++++++++++-------
 2 files changed, 60 insertions(+), 31 deletions(-)

diff --git a/calculate_average_albertoventurini.sh b/calculate_average_albertoventurini.sh
index d997264b0..6263e14a0 100755
--- a/calculate_average_albertoventurini.sh
+++ b/calculate_average_albertoventurini.sh
@@ -15,5 +15,5 @@
 #  limitations under the License.
 #
 
-JAVA_OPTS="-server -Xnoclassgc"
+JAVA_OPTS="-Xnoclassgc"
 java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_albertoventurini
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_albertoventurini.java b/src/main/java/dev/morling/onebrc/CalculateAverage_albertoventurini.java
index 406c75985..91e00e332 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_albertoventurini.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_albertoventurini.java
@@ -58,31 +58,31 @@ private static final class TrieNode {
 
     // Process a chunk and write results in a Trie rooted at 'root'.
     private static void processChunk(final TrieNode root, final ChunkReader cr) {
-        while (cr.hasNext()) {
+        while (cr.ensureHasMoreRows()) {
             TrieNode node = root;
 
             // Process the location name navigating through the trie
-            int b = cr.getNext() & 0xFF;
-            while (b != ';') {
+            int b = cr.getNext();
+            do {
+                b &= 0xFF;
                 if (node.children[b] == null) {
                     node.children[b] = new TrieNode();
                 }
                 node = node.children[b];
-                b = cr.getNext() & 0xFF;
-            }
+                b = cr.getNext();
+            } while (b != ';');
 
             // Process the reading value (temperature)
-            int reading;
+            final int reading;
 
-            byte b1 = cr.getNext();
-            byte b2 = cr.getNext();
-            byte b3 = cr.getNext();
-            byte b4 = cr.getNext();
+            final byte b1 = cr.getNext();
+            final byte b2 = cr.getNext();
             if (b2 == '.') { // value is n.n
-                reading = (b1 * 10 + b3 - TWO_BYTE_TO_INT);
-                // b4 == \n
+                reading = (b1 * 10 + cr.getNext() - TWO_BYTE_TO_INT);
             }
             else {
+                final byte b3 = cr.getNext();
+                final byte b4 = cr.getNext();
                 if (b4 == '.') { // value is -nn.n
                     reading = -(b2 * 100 + b3 * 10 + cr.getNext() - THREE_BYTE_TO_INT);
                 }
@@ -92,11 +92,15 @@ else if (b1 == '-') { // value is -n.n
                 else { // value is nn.n
                     reading = (b1 * 100 + b2 * 10 + b4 - THREE_BYTE_TO_INT);
                 }
-                cr.getNext(); // new line
             }
+            cr.cursor++; // new line
 
-            node.min = Math.min(node.min, reading);
-            node.max = Math.max(node.max, reading);
+            if (reading < node.min) {
+                node.min = reading;
+            }
+            if (reading > node.max) {
+                node.max = reading;
+            }
             node.sum += reading;
             node.count++;
         }
@@ -165,26 +169,40 @@ private void printResultsRec(final TrieNode[] nodes, final byte[] bytes, final i
                     bytes[index] = (byte) i;
                     printResultsRec(childNodes, bytes, index + 1);
                 }
-
             }
         }
     }
 
     private static final String FILE = "./measurements.txt";
 
+    /**
+     * Read a chunk of a {@link RandomAccessFile} file.
+     * Internally, the chunk is further subdivided into "sub-chunks" (byte arrays).
+     */
     private static final class ChunkReader {
-        // Byte arrays of size 2^22 seem to have the best performance on my machine.
-        private static final int BYTE_ARRAY_SIZE = 1 << 22;
+        // Byte arrays of size 2^20 seem to have the best performance on my machine.
+        private static final int BYTE_ARRAY_SIZE = 1 << 20;
         private final byte[] bytes;
 
         private final RandomAccessFile file;
+
+        // The initial position of this chunk.
         private final long chunkBegin;
+
+        // The length of this chunk.
         private final long chunkLength;
 
-        private int readBytes = 0;
+        // The beginning of the current "sub-chunk", relative to the initial position of the chunk.
+        private long offset = 0;
+
+        // The size of the current "sub-chunk".
+        private int subChunkSize = 0;
 
+        // The current position within the current "sub-chunk".
         private int cursor = 0;
-        private long offset = 0;
+
+        // The maximum size of a row
+        private static final int MAX_ROW_SIZE_BYTES = 107;
 
         ChunkReader(
                     final RandomAccessFile file,
@@ -197,32 +215,43 @@ private static final class ChunkReader {
             int byteArraySize = chunkLength < BYTE_ARRAY_SIZE ? (int) chunkLength : BYTE_ARRAY_SIZE;
             this.bytes = new byte[byteArraySize];
 
-            readNextBytes();
+            readSubChunk();
         }
 
-        boolean hasNext() {
-            return (offset + cursor) < chunkLength;
+        // Return true if this ChunkReader has more bytes available, false otherwise.
+        // If this ChunkReader needs to read a new "sub-chunk", it does so in this method.
+        boolean ensureHasMoreRows() {
+            if (cursor >= subChunkSize) {
+                offset += cursor;
+                if (offset >= chunkLength) {
+                    return false;
+                }
+                readSubChunk();
+            }
+
+            return true;
         }
 
         byte getNext() {
-            if (cursor >= readBytes) {
-                readNextBytes();
-            }
             return bytes[cursor++];
         }
 
-        private void readNextBytes() {
+        private void readSubChunk() {
             try {
-                offset += readBytes;
                 synchronized (file) {
                     file.seek(chunkBegin + offset);
-                    readBytes = file.read(bytes);
+                    subChunkSize = file.read(bytes);
                 }
-                cursor = 0;
             }
             catch (IOException e) {
                 throw new RuntimeException(e);
             }
+
+            // Always "pretend" that we've read a few bytes less,
+            // so that we don't stop in the middle of reading a row
+            subChunkSize -= MAX_ROW_SIZE_BYTES;
+
+            cursor = 0;
         }
     }
 

From a6cd83fc9817de787591d27b1fe5d6527bb3aebd Mon Sep 17 00:00:00 2001
From: PanosDR <PanagiotisDrakatos@users.noreply.github.com>
Date: Sun, 28 Jan 2024 11:25:53 +0200
Subject: [PATCH 157/268] CalculateAverage_pdrakatos (#515)

* CalculateAverage_pdrakatos

* Rename to be valid with rules

* CalculateAverage_pdrakatos

* Rename to be valid with rules

* Changes on scripts execution

* Fixing bugs causing scripts not to be executed

* Changes on prepare make it compatible

* Fixing passing all tests

* Increase direct memory allocation buffer

* Fixing memory problem causes heap space exception
---
 calculate_average_PanagiotisDrakatos.sh       |  36 +++
 prepare_PanagiotisDrakatos.sh                 |  23 ++
 .../CalculateAverage_PanagiotisDrakatos.java  | 244 ++++++++++++++++++
 3 files changed, 303 insertions(+)
 create mode 100755 calculate_average_PanagiotisDrakatos.sh
 create mode 100755 prepare_PanagiotisDrakatos.sh
 create mode 100644 src/main/java/dev/morling/onebrc/CalculateAverage_PanagiotisDrakatos.java

diff --git a/calculate_average_PanagiotisDrakatos.sh b/calculate_average_PanagiotisDrakatos.sh
new file mode 100755
index 000000000..e6c936578
--- /dev/null
+++ b/calculate_average_PanagiotisDrakatos.sh
@@ -0,0 +1,36 @@
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+source "$HOME/.sdkman/bin/sdkman-init.sh"
+sdk use java 21.0.1-graal 1>&2 # 1>&2 sends sdk's stdout to stderr
+JAVA_OPTS="--enable-preview -Xmx128m -XX:+UseSerialGC -XX:-TieredCompilation -XX:+UnlockExperimentalVMOptions -XX:+TrustFinalNonStaticFields -dsa -XX:+UseNUMA"
+java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_PanagiotisDrakatos # $JAVA_OPTS is unquoted, so the shell word-splits it into separate options
diff --git a/prepare_PanagiotisDrakatos.sh b/prepare_PanagiotisDrakatos.sh
new file mode 100755
index 000000000..c322486c9
--- /dev/null
+++ b/prepare_PanagiotisDrakatos.sh
@@ -0,0 +1,23 @@
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+source "$HOME/.sdkman/bin/sdkman-init.sh"
+sdk use java 21.0.1-graal 1>&2 # 1>&2 sends sdk's stdout to stderr
+
+if [ ! -f target/CalculateAverage_PanagiotisDrakatos_image ]; then # build only when the image file does not exist yet
+    NATIVE_IMAGE_OPTS="--gc=epsilon -O3 -R:MaxHeapSize=64m --initialize-at-build-time=dev.morling.onebrc.CalculateAverage_PanagiotisDrakatos"
+    native-image $NATIVE_IMAGE_OPTS -cp target/average-1.0.0-SNAPSHOT.jar -o target/CalculateAverage_PanagiotisDrakatos_image dev.morling.onebrc.CalculateAverage_PanagiotisDrakatos
+fi
\ No newline at end of file
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_PanagiotisDrakatos.java b/src/main/java/dev/morling/onebrc/CalculateAverage_PanagiotisDrakatos.java
new file mode 100644
index 000000000..ecf2b700d
--- /dev/null
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_PanagiotisDrakatos.java
@@ -0,0 +1,244 @@
+/*
+ *  Copyright 2023 The original authors
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package dev.morling.onebrc;
+
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.MappedByteBuffer;
+import java.nio.channels.FileChannel;
+import java.nio.charset.StandardCharsets;
+import java.util.*;
+import java.util.stream.Stream;
+import java.util.stream.StreamSupport;
+
+public class CalculateAverage_PanagiotisDrakatos {
+    private static final String FILE = "./measurements.txt";
+    private static TreeMap<String, MeasurementObject> sortedCities;
+
+    public static void main(String[] args) throws IOException { // args is not read
+        SeekableByteRead(FILE); // assigns the static sortedCities map as a side effect
+        System.out.println(sortedCities); // prints the TreeMap's own toString(), entries in key order
+    }
+
+    // Maps the file at path chunk by chunk, aggregates the chunks in parallel and stores the merged result in sortedCities.
+    private static void SeekableByteRead(String path) throws IOException {
+        // try-with-resources closes stream and channel even if mapping or parsing throws; path is honoured instead of FILE
+        try (FileInputStream fileInputStream = new FileInputStream(path);
+                FileChannel fileChannel = fileInputStream.getChannel()) {
+            Optional<Map<String, MeasurementObject>> optimistic = SplitSeekableByteChannel(fileChannel).parallel()
+                    .map(CalculateAverage_PanagiotisDrakatos::MappingByteBufferToData)
+                    .reduce(CalculateAverage_PanagiotisDrakatos::combineMaps);
+            sortedCities = new TreeMap<>(optimistic.orElseThrow());
+        }
+    }
+
+    // Lazily splits the channel into read-only mapped windows of at most MAP_SIZE bytes, each cut at its last '\n'.
+    private static Stream<ByteBuffer> SplitSeekableByteChannel(FileChannel channel) throws IOException {
+        return StreamSupport.stream(Spliterators.spliteratorUnknownSize(new Iterator<ByteBuffer>() {
+            private static final long MAP_SIZE = 1024 * 1024 * 10L;
+            private long position = 0; // file offset of the next byte that has not been handed out yet
+            private final long length = channel.size();
+            @Override
+            public boolean hasNext() {
+                return position < length;
+            }
+
+            @Override
+            public ByteBuffer next() {
+                if (position >= length) {
+                    throw new NoSuchElementException();
+                }
+                try {
+                    MappedByteBuffer buffer = channel.map(FileChannel.MapMode.READ_ONLY, position, Math.min(MAP_SIZE, length - position));
+                    int end = buffer.limit() - 1;
+                    while (end >= 0 && buffer.get(end) != '\n') {
+                        end--;
+                    }
+                    boolean noNewline = end < 0; // e.g. an unterminated last line: hand out the whole window instead of reading index -1
+                    position += noNewline ? buffer.limit() : end + 1;
+                    return buffer.slice(0, noNewline ? buffer.limit() : end);
+                }
+                catch (IOException e) {
+                    throw new RuntimeException(e);
+                }
+            }
+        }, Spliterator.IMMUTABLE), false);
+    }
+
+    // Copies the remaining bytes of every buffer, in order, into one new heap buffer that is flipped and ready for reading.
+    public static ByteBuffer concat(ByteBuffer[] buffers) {
+        int overAllCapacity = 0;
+        for (ByteBuffer buffer : buffers) {
+            // remaining() == limit() - position(); every buffer is counted exactly once (buffers[0] used to be added twice)
+            overAllCapacity += buffer.remaining();
+        }
+        ByteBuffer all = ByteBuffer.allocate(overAllCapacity);
+        for (ByteBuffer buffer : buffers) {
+            all.put(buffer); // drains the source: its position advances to its limit
+        }
+        all.flip();
+        return all;
+    }
+
+    // Folds every entry of map2 into map1, which is mutated and returned; keys present in both are merged with combine().
+    private static Map<String, MeasurementObject> combineMaps(Map<String, MeasurementObject> map1, Map<String, MeasurementObject> map2) {
+        map2.forEach((station, partial) -> {
+            map1.merge(station, partial, MeasurementObject::combine);
+        });
+        return map1;
+    }
+
+    // Parses one chunk of "station;temperature" lines into per-station aggregates; temperatures are kept as int tenths.
+    private static Map<String, MeasurementObject> MappingByteBufferToData(ByteBuffer byteBuffer) {
+        Map<String, MeasurementObject> cities = new HashMap<>();
+        ByteBuffer bb = byteBuffer.duplicate();
+        final int limit = bb.limit();
+        int start = 0;
+        int end = 0;
+        while (start < limit) {
+            // Advance to the ';' that terminates the station name.
+            while (bb.get(end) != ';') {
+                end++;
+            }
+            // Advance to the '\n' that ends the line, or to the end of the chunk when the last line is unterminated.
+            // An explicit bounds check replaces the former catch of IndexOutOfBoundsException as the loop exit.
+            int temp_end = end;
+            while (temp_end < limit && bb.get(temp_end) != '\n') {
+                temp_end++;
+            }
+            int temp_counter = temp_end - end;
+            if (temp_end == limit) {
+                // Ran off the chunk: step back so the slice below stays inside the buffer and the outer loop ends.
+                temp_counter--;
+                temp_end--;
+            }
+            ByteBuffer city = bb.slice(start, end - start);
+            // temp may include the trailing '\n'; only its leading sign and digit bytes are read below.
+            ByteBuffer temp = bb.slice(end + 1, temp_counter);
+            int tempPointer = 0;
+            int abs = 1;
+            if (temp.get(0) == '-') {
+                abs = -1;
+                tempPointer++;
+            }
+            final int measuredValue;
+            if (temp.get(tempPointer + 1) == '.') {
+                // "d.d": 528 == '0' * 11 removes the ASCII offset of both digits
+                measuredValue = abs * ((temp.get(tempPointer)) * 10 + (temp.get(tempPointer + 2)) - 528);
+            }
+            else {
+                // "dd.d": 5328 == '0' * 111
+                measuredValue = abs * (temp.get(tempPointer) * 100 + temp.get(tempPointer + 1) * 10 + temp.get(tempPointer + 3) - 5328);
+            }
+
+            byte[] citybytes = new byte[city.limit()];
+            city.get(citybytes);
+            String cityName = new String(citybytes, StandardCharsets.UTF_8);
+
+            // update the map with the new measurement; the aggregate is mutated in place, so no re-put is needed
+            cities.computeIfAbsent(cityName, name -> new MeasurementObject(measuredValue, measuredValue, 0, 0))
+                    .updateWith(measuredValue);
+            start = temp_end + 1;
+            end = temp_end;
+        }
+        return cities;
+    }
+
+    // Mutable per-station aggregate; every temperature is stored as an int number of tenths of a degree.
+    private static final class MeasurementObject {
+        private int MAX;
+        private int MIN;
+        private long SUM; // running total of all measurements, in tenths
+        private int REPEAT; // number of measurements folded in so far
+
+        public MeasurementObject(int MAX, int MIN, long SUM, int REPEAT) {
+            this.MAX = MAX;
+            this.MIN = MIN;
+            this.SUM = SUM;
+            this.REPEAT = REPEAT;
+        }
+
+        public MeasurementObject() {
+        }
+
+        public MeasurementObject(int MAX, int MIN, long SUM) {
+            this.MAX = MAX;
+            this.MIN = MIN;
+            this.SUM = SUM;
+        }
+
+        public MeasurementObject(int MAX, int MIN) {
+            this.MAX = MAX;
+            this.MIN = MIN;
+        }
+
+        // Returns a new aggregate covering both inputs; neither argument is modified.
+        public static MeasurementObject combine(MeasurementObject m1, MeasurementObject m2) {
+            var mres = new MeasurementObject();
+            mres.MIN = MeasurementObject.min(m1.MIN, m2.MIN);
+            mres.MAX = MeasurementObject.max(m1.MAX, m2.MAX);
+            mres.SUM = m1.SUM + m2.SUM;
+            mres.REPEAT = m1.REPEAT + m2.REPEAT;
+            return mres;
+        }
+
+        // Folds one measurement into this aggregate and returns this so calls can be chained.
+        public MeasurementObject updateWith(int measurement) {
+            MIN = MeasurementObject.min(MIN, measurement);
+            MAX = MeasurementObject.max(MAX, measurement);
+            SUM += measurement;
+            REPEAT++;
+            return this;
+        }
+
+        // Math.max/min replace the former (a - b) >> 31 sign trick, which returns the wrong operand when a - b overflows int.
+        private static int max(final int a, final int b) {
+            return Math.max(a, b);
+        }
+
+        private static int min(final int a, final int b) {
+            return Math.min(a, b);
+        }
+
+        // value is in tenths: round to the nearest whole tenth, then divide by 10 to get degrees.
+        private double round(double value) {
+            return Math.round(value) / 10.0;
+        }
+
+        // equals/hashCode cover all four fields; SUM used to be left out, so aggregates with different totals compared equal.
+        @Override
+        public boolean equals(Object o) {
+            if (this == o)
+                return true;
+            if (o == null || getClass() != o.getClass())
+                return false;
+            MeasurementObject that = (MeasurementObject) o;
+            return MAX == that.MAX && MIN == that.MIN && SUM == that.SUM && REPEAT == that.REPEAT;
+        }
+
+        @Override
+        public int hashCode() {
+            return Objects.hash(MAX, MIN, SUM, REPEAT);
+        }
+
+        // Renders "min/mean/max", each converted from tenths to degrees by round().
+        @Override
+        public String toString() {
+            return round(MIN) + "/" + round((1.0 * SUM) / REPEAT) + "/" + round(MAX);
+        }
+    }
+}

From d9ab36a241e4f38e404b5fd5f92de86337dd459c Mon Sep 17 00:00:00 2001
From: Jaromir Hamala <jaromir.hamala@gmail.com>
Date: Sun, 28 Jan 2024 11:34:28 +0100
Subject: [PATCH 158/268] jerrinot's improvement (#607)

* some random changes with minimal, if any, effect

* use munmap() trick
credit: thomaswue

* some smaller tweaks

* use native image
---
 calculate_average_jerrinot.sh                 |  10 +-
 prepare_jerrinot.sh                           |   9 +-
 .../onebrc/CalculateAverage_jerrinot.java     | 278 +++++++++++-------
 3 files changed, 183 insertions(+), 114 deletions(-)

diff --git a/calculate_average_jerrinot.sh b/calculate_average_jerrinot.sh
index 8de06c3d6..731172373 100755
--- a/calculate_average_jerrinot.sh
+++ b/calculate_average_jerrinot.sh
@@ -17,5 +17,11 @@
 
 # -XX:+UnlockDiagnosticVMOptions -XX:PrintAssemblyOptions=intel -XX:CompileCommand=print,*.CalculateAverage_mtopolnik::recordMeasurementAndAdvanceCursor"
 # -XX:InlineSmallCode=10000 -XX:-TieredCompilation -XX:CICompilerCount=2 -XX:CompileThreshold=1000\
-java -XX:+UseParallelGC  --enable-preview \
-  --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_jerrinot
+if [ -f target/CalculateAverage_jerrinot_image ]; then
+    echo "Picking up existing native image 'target/CalculateAverage_jerrinot_image', delete the file to select JVM mode." 1>&2
+    target/CalculateAverage_jerrinot_image
+else
+    JAVA_OPTS="--enable-preview"
+    echo "Choosing to run the app in JVM mode as no native image was found, use prepare_jerrinot.sh to generate." 1>&2
+    java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_jerrinot
+fi
diff --git a/prepare_jerrinot.sh b/prepare_jerrinot.sh
index f83a3ff69..c36cae32e 100755
--- a/prepare_jerrinot.sh
+++ b/prepare_jerrinot.sh
@@ -16,4 +16,11 @@
 #
 
 source "$HOME/.sdkman/bin/sdkman-init.sh"
-sdk use java 21.0.1-graal 1>&2
+sdk use java 21.0.2-graal 1>&2
+
+# ./mvnw clean verify removes target/ and will re-trigger native image creation.
+if [ ! -f target/CalculateAverage_jerrinot_image ]; then
+    NATIVE_IMAGE_OPTS="--gc=epsilon -O3 -march=native --enable-preview -H:InlineAllBonus=10 -H:-ParseRuntimeOptions --initialize-at-build-time=dev.morling.onebrc.CalculateAverage_jerrinot"
+    # Use -H:MethodFilter=CalculateAverage_jerrinot.* -H:Dump=:2 -H:PrintGraph=Network for IdealGraphVisualizer graph dumping.
+    native-image $NATIVE_IMAGE_OPTS -cp target/average-1.0.0-SNAPSHOT.jar -o target/CalculateAverage_jerrinot_image dev.morling.onebrc.CalculateAverage_jerrinot
+fi
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_jerrinot.java b/src/main/java/dev/morling/onebrc/CalculateAverage_jerrinot.java
index 2492c0fbd..36e3182e2 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_jerrinot.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_jerrinot.java
@@ -18,6 +18,7 @@
 import sun.misc.Unsafe;
 
 import java.io.File;
+import java.io.IOException;
 import java.io.RandomAccessFile;
 import java.lang.foreign.Arena;
 import java.lang.reflect.Field;
@@ -54,9 +55,29 @@ private static Unsafe unsafe() {
     }
 
     public static void main(String[] args) throws Exception {
+        // credits for spawning new workers: thomaswue
+        if (args.length == 0 || !("--worker".equals(args[0]))) {
+            spawnWorker();
+            return;
+        }
         calculate();
     }
 
+    private static void spawnWorker() throws IOException { // re-runs the current process's command line with "--worker" appended
+        ProcessHandle.Info info = ProcessHandle.current().info();
+        ArrayList<String> workerCommand = new ArrayList<>();
+        info.command().ifPresent(workerCommand::add); // command() and arguments() are Optionals; nothing is added when they are empty
+        info.arguments().ifPresent(args -> workerCommand.addAll(Arrays.asList(args)));
+        workerCommand.add("--worker"); // main() tests args[0] for this flag
+        new ProcessBuilder()
+                .command(workerCommand)
+                .inheritIO()
+                .redirectOutput(ProcessBuilder.Redirect.PIPE) // set after inheritIO(), so only stdout is piped back to this process
+                .start()
+                .getInputStream()
+                .transferTo(System.out); // copies the child's stdout until end of stream; no waitFor() follows
+    }
+
     static void calculate() throws Exception {
         final File file = new File(MEASUREMENTS_TXT);
         final long length = file.length();
@@ -140,6 +161,7 @@ private static void printResults(TreeMap<String, Processor.StationStats> accumul
         }
         sb.append('}');
         System.out.println(sb);
+        System.out.close();
     }
 
     public static int ceilPow2(int i) {
@@ -187,7 +209,7 @@ private static class Processor implements Runnable {
         private static final int SLOW_MAP_SIZE_BYTES = MAPS_SLOT_COUNT * SLOW_MAP_ENTRY_SIZE_BYTES;
         private static final int FAST_MAP_SIZE_BYTES = MAPS_SLOT_COUNT * FAST_MAP_ENTRY_SIZE_BYTES;
         private static final int SLOW_MAP_MAP_NAMES_BYTES = MAX_UNIQUE_KEYS * STATION_MAX_NAME_BYTES;
-        private static final long MAP_MASK = MAPS_SLOT_COUNT - 1;
+        private static final int MAP_MASK = MAPS_SLOT_COUNT - 1;
 
         private long slowMap;
         private long slowMapNamesPtr;
@@ -281,9 +303,9 @@ private void doTail() {
             doOne(cursorC, endC);
 
             transferToHeap();
-            UNSAFE.freeMemory(fastMap);
-            UNSAFE.freeMemory(slowMap);
-            UNSAFE.freeMemory(slowMapNamesLo);
+            // UNSAFE.freeMemory(fastMap);
+            // UNSAFE.freeMemory(slowMap);
+            // UNSAFE.freeMemory(slowMapNamesLo);
         }
 
         private void transferToHeap() {
@@ -339,11 +361,11 @@ private void doOne(long cursor, long endA) {
                 long mask = getDelimiterMask(currentWord);
                 long firstWordMask = ((mask - 1) ^ mask) >>> 8;
                 final long isMaskZeroA = ((mask | -mask) >>> 63) ^ 1;
-                long ext = -isMaskZeroA & 0xFF00_0000_0000_0000L;
+                long ext = -isMaskZeroA;
                 firstWordMask |= ext;
 
                 long maskedFirstWord = currentWord & firstWordMask;
-                long hash = hash(maskedFirstWord);
+                int hash = hash(maskedFirstWord);
                 while (mask == 0) {
                     cursor += 8;
                     currentWord = UNSAFE.getLong(cursor);
@@ -353,22 +375,22 @@ private void doOne(long cursor, long endA) {
                 final long semicolon = cursor + (delimiterByte >> 3);
                 final long maskedWord = currentWord & ((mask - 1) ^ mask) >>> 8;
 
-                long len = semicolon - start;
-                long baseEntryPtr = getOrCreateEntryBaseOffsetSlow(len, start, (int) hash, maskedWord);
+                int len = (int) (semicolon - start);
+                long baseEntryPtr = getOrCreateEntryBaseOffsetSlow(len, start, hash, maskedWord);
                 long temperatureWord = UNSAFE.getLong(semicolon + 1);
                 cursor = parseAndStoreTemperature(semicolon + 1, baseEntryPtr, temperatureWord);
             }
         }
 
-        private static long hash(long word1) {
+        private static int hash(long word) {
             // credit: mtopolnik
             long seed = 0x51_7c_c1_b7_27_22_0a_95L;
             int rotDist = 17;
-
-            long hash = word1;
+            //
+            long hash = word;
             hash *= seed;
             hash = Long.rotateLeft(hash, rotDist);
-            return hash;
+            return (int) hash;
         }
 
         @Override
@@ -382,69 +404,87 @@ public void run() {
             UNSAFE.setMemory(slowMapNamesPtr, SLOW_MAP_MAP_NAMES_BYTES, (byte) 0);
 
             while (cursorA < endA && cursorB < endB && cursorC < endC) {
+                long currentWordA = UNSAFE.getLong(cursorA);
+                long currentWordB = UNSAFE.getLong(cursorB);
+                long currentWordC = UNSAFE.getLong(cursorC);
+
                 long startA = cursorA;
                 long startB = cursorB;
                 long startC = cursorC;
 
-                long currentWordA = UNSAFE.getLong(startA);
-                long currentWordB = UNSAFE.getLong(startB);
-                long currentWordC = UNSAFE.getLong(startC);
-
                 long maskA = getDelimiterMask(currentWordA);
                 long maskB = getDelimiterMask(currentWordB);
                 long maskC = getDelimiterMask(currentWordC);
 
-                long firstWordMaskA = (maskA ^ (maskA - 1)) >>> 8;
-                long firstWordMaskB = (maskB ^ (maskB - 1)) >>> 8;
-                long firstWordMaskC = (maskC ^ (maskC - 1)) >>> 8;
-
-                final long isMaskZeroA = ((maskA | -maskA) >>> 63) ^ 1;
-                final long isMaskZeroB = ((maskB | -maskB) >>> 63) ^ 1;
-                final long isMaskZeroC = ((maskC | -maskC) >>> 63) ^ 1;
-
-                long extA = -isMaskZeroA & 0xFF00_0000_0000_0000L;
-                long extB = -isMaskZeroB & 0xFF00_0000_0000_0000L;
-                long extC = -isMaskZeroC & 0xFF00_0000_0000_0000L;
-
-                firstWordMaskA |= extA;
-                firstWordMaskB |= extB;
-                firstWordMaskC |= extC;
-
-                long maskedFirstWordA = currentWordA & firstWordMaskA;
-                long maskedFirstWordB = currentWordB & firstWordMaskB;
-                long maskedFirstWordC = currentWordC & firstWordMaskC;
-
-                // assertMasks(isMaskZeroA, maskA);
-
-                long hashA = hash(maskedFirstWordA);
-                long hashB = hash(maskedFirstWordB);
-                long hashC = hash(maskedFirstWordC);
-
-                cursorA += isMaskZeroA * 8;
-                cursorB += isMaskZeroB * 8;
-                cursorC += isMaskZeroC * 8;
-
-                currentWordA = UNSAFE.getLong(cursorA);
-                currentWordB = UNSAFE.getLong(cursorB);
-                currentWordC = UNSAFE.getLong(cursorC);
+                long maskComplementA = -maskA;
+                long maskComplementB = -maskB;
+                long maskComplementC = -maskC;
+
+                long maskWithDelimiterA = (maskA ^ (maskA - 1));
+                long maskWithDelimiterB = (maskB ^ (maskB - 1));
+                long maskWithDelimiterC = (maskC ^ (maskC - 1));
+
+                long isMaskZeroA = (((maskA | maskComplementA) >>> 63) ^ 1);
+                long isMaskZeroB = (((maskB | maskComplementB) >>> 63) ^ 1);
+                long isMaskZeroC = (((maskC | maskComplementC) >>> 63) ^ 1);
+
+                cursorA += isMaskZeroA << 3;
+                cursorB += isMaskZeroB << 3;
+                cursorC += isMaskZeroC << 3;
+
+                long nextWordA = UNSAFE.getLong(cursorA);
+                long nextWordB = UNSAFE.getLong(cursorB);
+                long nextWordC = UNSAFE.getLong(cursorC);
+
+                long firstWordMaskA = maskWithDelimiterA >>> 8;
+                long firstWordMaskB = maskWithDelimiterB >>> 8;
+                long firstWordMaskC = maskWithDelimiterC >>> 8;
+
+                long nextMaskA = getDelimiterMask(nextWordA);
+                long nextMaskB = getDelimiterMask(nextWordB);
+                long nextMaskC = getDelimiterMask(nextWordC);
+
+                boolean slowA = nextMaskA == 0;
+                boolean slowB = nextMaskB == 0;
+                boolean slowC = nextMaskC == 0;
+                boolean slowSome = (slowA || slowB || slowC);
+
+                long extA = -isMaskZeroA;
+                long extB = -isMaskZeroB;
+                long extC = -isMaskZeroC;
+
+                long maskedFirstWordA = (extA | firstWordMaskA) & currentWordA;
+                long maskedFirstWordB = (extB | firstWordMaskB) & currentWordB;
+                long maskedFirstWordC = (extC | firstWordMaskC) & currentWordC;
+
+                int hashA = hash(maskedFirstWordA);
+                int hashB = hash(maskedFirstWordB);
+                int hashC = hash(maskedFirstWordC);
+
+                currentWordA = nextWordA;
+                currentWordB = nextWordB;
+                currentWordC = nextWordC;
+
+                maskA = nextMaskA;
+                maskB = nextMaskB;
+                maskC = nextMaskC;
+                if (slowSome) {
+                    while (maskA == 0) {
+                        cursorA += 8;
+                        currentWordA = UNSAFE.getLong(cursorA);
+                        maskA = getDelimiterMask(currentWordA);
+                    }
 
-                maskA = getDelimiterMask(currentWordA);
-                while (maskA == 0) {
-                    cursorA += 8;
-                    currentWordA = UNSAFE.getLong(cursorA);
-                    maskA = getDelimiterMask(currentWordA);
-                }
-                maskB = getDelimiterMask(currentWordB);
-                while (maskB == 0) {
-                    cursorB += 8;
-                    currentWordB = UNSAFE.getLong(cursorB);
-                    maskB = getDelimiterMask(currentWordB);
-                }
-                maskC = getDelimiterMask(currentWordC);
-                while (maskC == 0) {
-                    cursorC += 8;
-                    currentWordC = UNSAFE.getLong(cursorC);
-                    maskC = getDelimiterMask(currentWordC);
+                    while (maskB == 0) {
+                        cursorB += 8;
+                        currentWordB = UNSAFE.getLong(cursorB);
+                        maskB = getDelimiterMask(currentWordB);
+                    }
+                    while (maskC == 0) {
+                        cursorC += 8;
+                        currentWordC = UNSAFE.getLong(cursorC);
+                        maskC = getDelimiterMask(currentWordC);
+                    }
                 }
 
                 final int delimiterByteA = Long.numberOfTrailingZeros(maskA);
@@ -458,40 +498,57 @@ public void run() {
                 long digitStartA = semicolonA + 1;
                 long digitStartB = semicolonB + 1;
                 long digitStartC = semicolonC + 1;
+
                 long temperatureWordA = UNSAFE.getLong(digitStartA);
                 long temperatureWordB = UNSAFE.getLong(digitStartB);
                 long temperatureWordC = UNSAFE.getLong(digitStartC);
 
-                final long maskedWordA = currentWordA & ((maskA - 1) ^ maskA) >>> 8;
-                final long maskedWordB = currentWordB & ((maskB - 1) ^ maskB) >>> 8;
-                final long maskedWordC = currentWordC & ((maskC - 1) ^ maskC) >>> 8;
+                long lastWordMaskA = ((maskA - 1) ^ maskA) >>> 8;
+                long lastWordMaskB = ((maskB - 1) ^ maskB) >>> 8;
+                long lastWordMaskC = ((maskC - 1) ^ maskC) >>> 8;
 
-                long lenA = semicolonA - startA;
-                long lenB = semicolonB - startB;
-                long lenC = semicolonC - startC;
+                final long maskedLastWordA = currentWordA & lastWordMaskA;
+                final long maskedLastWordB = currentWordB & lastWordMaskB;
+                final long maskedLastWordC = currentWordC & lastWordMaskC;
 
-                long baseEntryPtrA;
-                if (lenA > 15) {
-                    baseEntryPtrA = getOrCreateEntryBaseOffsetSlow(lenA, startA, (int) hashA, maskedWordA);
-                }
-                else {
-                    baseEntryPtrA = getOrCreateEntryBaseOffsetFast(lenA, (int) hashA, maskedWordA, maskedFirstWordA);
-                }
+                int lenA = (int) (semicolonA - startA);
+                int lenB = (int) (semicolonB - startB);
+                int lenC = (int) (semicolonC - startC);
 
-                long baseEntryPtrB;
-                if (lenB > 15) {
-                    baseEntryPtrB = getOrCreateEntryBaseOffsetSlow(lenB, startB, (int) hashB, maskedWordB);
-                }
-                else {
-                    baseEntryPtrB = getOrCreateEntryBaseOffsetFast(lenB, (int) hashB, maskedWordB, maskedFirstWordB);
-                }
+                int mapIndexA = hashA & MAP_MASK;
+                int mapIndexB = hashB & MAP_MASK;
+                int mapIndexC = hashC & MAP_MASK;
 
+                long baseEntryPtrA;
+                long baseEntryPtrB;
                 long baseEntryPtrC;
-                if (lenC > 15) {
-                    baseEntryPtrC = getOrCreateEntryBaseOffsetSlow(lenC, startC, (int) hashC, maskedWordC);
+
+                if (slowSome) {
+                    if (slowA) {
+                        baseEntryPtrA = getOrCreateEntryBaseOffsetSlow(lenA, startA, hashA, maskedLastWordA);
+                    }
+                    else {
+                        baseEntryPtrA = getOrCreateEntryBaseOffsetFast(mapIndexA, lenA, maskedLastWordA, maskedFirstWordA);
+                    }
+
+                    if (slowB) {
+                        baseEntryPtrB = getOrCreateEntryBaseOffsetSlow(lenB, startB, hashB, maskedLastWordB);
+                    }
+                    else {
+                        baseEntryPtrB = getOrCreateEntryBaseOffsetFast(mapIndexB, lenB, maskedLastWordB, maskedFirstWordB);
+                    }
+
+                    if (slowC) {
+                        baseEntryPtrC = getOrCreateEntryBaseOffsetSlow(lenC, startC, hashC, maskedLastWordC);
+                    }
+                    else {
+                        baseEntryPtrC = getOrCreateEntryBaseOffsetFast(mapIndexC, lenC, maskedLastWordC, maskedFirstWordC);
+                    }
                 }
                 else {
-                    baseEntryPtrC = getOrCreateEntryBaseOffsetFast(lenC, (int) hashC, maskedWordC, maskedFirstWordC);
+                    baseEntryPtrA = getOrCreateEntryBaseOffsetFast(mapIndexA, lenA, maskedLastWordA, maskedFirstWordA);
+                    baseEntryPtrB = getOrCreateEntryBaseOffsetFast(mapIndexB, lenB, maskedLastWordB, maskedFirstWordB);
+                    baseEntryPtrC = getOrCreateEntryBaseOffsetFast(mapIndexC, lenC, maskedLastWordC, maskedFirstWordC);
                 }
 
                 cursorA = parseAndStoreTemperature(digitStartA, baseEntryPtrA, temperatureWordA);
@@ -502,36 +559,35 @@ public void run() {
             // System.out.println("Longest chain: " + longestChain);
         }
 
-        private long getOrCreateEntryBaseOffsetFast(long lenLong, int hash, long maskedLastWord, long maskedFirstWord) {
-            int lenA = (int) lenLong;
-            long mapIndexA = hash & MAP_MASK;
+        private long getOrCreateEntryBaseOffsetFast(int mapIndexA, int lenA, long maskedLastWord, long maskedFirstWord) { // Linear-probing lookup in the fast map starting at mapIndexA; returns the base pointer of the matching or newly created entry
             for (;;) {
                 long basePtr = mapIndexA * FAST_MAP_ENTRY_SIZE_BYTES + fastMap;
+                long namePart1 = UNSAFE.getLong(basePtr + FAST_MAP_NAME_PART1); // first stored 8-byte name word of this slot
+                long namePart2 = UNSAFE.getLong(basePtr + FAST_MAP_NAME_PART2); // second stored 8-byte name word of this slot
+                if (namePart1 == maskedFirstWord && namePart2 == maskedLastWord) { // hit is decided by the two name words alone; the stored length is not compared
+                    return basePtr;
+                }
                 long lenPtr = basePtr + MAP_LEN_OFFSET;
                 int len = UNSAFE.getInt(lenPtr);
-                if (len == lenA) {
-                    long namePart1 = UNSAFE.getLong(basePtr + FAST_MAP_NAME_PART1);
-                    long namePart2 = UNSAFE.getLong(basePtr + FAST_MAP_NAME_PART2);
-                    if (namePart1 == maskedFirstWord && namePart2 == maskedLastWord) {
-                        return basePtr;
-                    }
-                }
-                else if (len == 0) {
-                    UNSAFE.putInt(lenPtr, lenA);
-                    // todo: this could be a single putLong()
-                    UNSAFE.putInt(basePtr + MAP_MAX_OFFSET, Integer.MIN_VALUE);
-                    UNSAFE.putInt(basePtr + MAP_MIN_OFFSET, Integer.MAX_VALUE);
-                    UNSAFE.putLong(basePtr + FAST_MAP_NAME_PART1, maskedFirstWord);
-                    UNSAFE.putLong(basePtr + FAST_MAP_NAME_PART2, maskedLastWord);
-                    return basePtr;
+                if (len == 0) { // a stored length of 0 means the slot is still empty
+                    return newEntryFast(lenA, maskedLastWord, maskedFirstWord, lenPtr, basePtr); // claim the empty slot for this name
                 }
                 mapIndexA = ++mapIndexA & MAP_MASK;
             }
         }
 
-        private long getOrCreateEntryBaseOffsetSlow(long lenLong, long startPtr, int hash, long maskedLastWord) {
-            long fullLen = lenLong & ~7L;
-            int lenA = (int) lenLong;
+        private static long newEntryFast(int lenA, long maskedLastWord, long maskedFirstWord, long lenPtr, long basePtr) { // Initializes the fast-map slot at basePtr for a new name and returns basePtr
+            UNSAFE.putInt(lenPtr, lenA); // the lookup treats a stored length of 0 as "slot empty", so writing the length claims the slot
+            // todo: this could be a single putLong()
+            UNSAFE.putInt(basePtr + MAP_MAX_OFFSET, Integer.MIN_VALUE); // initial max is the lowest possible int
+            UNSAFE.putInt(basePtr + MAP_MIN_OFFSET, Integer.MAX_VALUE); // initial min is the highest possible int
+            UNSAFE.putLong(basePtr + FAST_MAP_NAME_PART1, maskedFirstWord); // the two masked name words are what later lookups compare against
+            UNSAFE.putLong(basePtr + FAST_MAP_NAME_PART2, maskedLastWord);
+            return basePtr;
+        }
+
+        private long getOrCreateEntryBaseOffsetSlow(int lenA, long startPtr, int hash, long maskedLastWord) {
+            long fullLen = lenA & ~7L;
             long mapIndexA = hash & MAP_MASK;
             for (;;) {
                 long basePtr = mapIndexA * SLOW_MAP_ENTRY_SIZE_BYTES + slowMap;
@@ -550,7 +606,7 @@ else if (len == 0) {
                     UNSAFE.putInt(basePtr + MAP_MAX_OFFSET, Integer.MIN_VALUE);
                     UNSAFE.putInt(basePtr + MAP_MIN_OFFSET, Integer.MAX_VALUE);
                     UNSAFE.copyMemory(startPtr, slowMapNamesPtr, lenA);
-                    long alignedLen = (lenLong & ~7L) + 8;
+                    long alignedLen = (lenA & ~7L) + 8;
                     slowMapNamesPtr += alignedLen;
                     return basePtr;
                 }

From 5bb6c5f3efa07f0142e168fe33e02adb4c7419ca Mon Sep 17 00:00:00 2001
From: Gunnar Morling <gunnar.morling@googlemail.com>
Date: Sun, 28 Jan 2024 11:35:19 +0100
Subject: [PATCH 159/268] Leaderboard update

---
 README.md | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index ccae6c3ce..13cb89138 100644
--- a/README.md
+++ b/README.md
@@ -43,10 +43,11 @@ These are the results from running all entries into the challenge on eight cores
 |---|-----------------|--------------------|-----|---------------|-----------|
 | 1 | 00:02.019 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java)| 21.0.2-graal | [Artsiom Korzun](https://github.com/artsiomkorzun) | GraalVM native binary, uses Unsafe |
 | 2* | 00:02.146 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java)| 21.0.2-graal | [Van Phu DO](https://github.com/abeobk) | GraalVM native binary, uses Unsafe |
+| 2* | 00:02.149 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jerrinot.java)| 21.0.2-graal | [Jaromir Hamala](https://github.com/jerrinot) | GraalVM native binary, uses Unsafe |
 | 2* | 00:02.157 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_royvanrijn.java)| 21.0.2-graal | [Roy van Rijn](https://github.com/royvanrijn) | GraalVM native binary, uses Unsafe |
 | 3 | 00:02.195 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_thomaswue.java)| 21.0.2-graal | [Thomas Wuerthinger](https://github.com/thomaswue), [Quan Anh Mai](https://github.com/merykitty), [Alfonso² Peterssen](https://github.com/mukel) | GraalVM native binary, uses Unsafe |
-|   | 00:02.374 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jerrinot.java)| 21.0.1-graal | [Jaromir Hamala](https://github.com/jerrinot) | uses Unsafe |
 |   | 00:02.575 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_merykittyunsafe.java)| 21.0.1-open | [Quan Anh Mai](https://github.com/merykitty) | uses Unsafe |
+|   | 00:02.933 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_serkan-ozal.java)| 21.0.1-open | [Serkan ÖZAL](https://github.com/serkan-ozal) |  |
 |   | 00:02.984 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_yourwass.java)| 21.0.1-open | [yourwass](https://github.com/yourwass) | uses Unsafe |
 |   | 00:03.258 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_merykitty.java)| 21.0.1-open | [Quan Anh Mai](https://github.com/merykitty) |  |
 |   | 00:03.298 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_vemanaNonIdiomatic.java)| 21.0.1-graal | [Subrahmanyam (non-idiomatic)](https://github.com/vemana) | uses Unsafe |
@@ -147,13 +148,14 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:20.691 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_Kidlike.java)| 21.0.1-graal | [Kidlike](https://github.com/Kidlike) | GraalVM native binary |
 |   | 00:21.989 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_couragelee.java)| 21.0.1-open | [couragelee](https://github.com/couragelee) |  |
 |   | 00:22.188 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jgrateron.java)| 21.0.1-open | [Jairo Graterón](https://github.com/jgrateron) |  |
+|   | 00:22.334 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_albertoventurini.java)| 21.0.1-open | [Alberto Venturini](https://github.com/albertoventurini) |  |
 |   | 00:22.457 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_rby.java)| 21.0.1-open | [Ramzi Ben Yahya](https://github.com/rby) |  |
 |   | 00:22.471 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_0xshivamagarwal.java)| 21.0.1-open | [Shivam Agarwal](https://github.com/0xshivamagarwal) |  |
+|   | 00:24.550 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_PanagiotisDrakatos.java)| 21.0.1-graal | [PanosDR](https://github.com/PanagiotisDrakatos) | GraalVM native binary |
 |   | 00:24.986 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_kumarsaurav123.java)| 21.0.1-open | [kumarsaurav123](https://github.com/kumarsaurav123) |  |
 |   | 00:26.500 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_felix19350.java)| 21.0.1-open | [Bruno Félix](https://github.com/felix19350) |  |
 |   | 00:28.381 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_bjhara.java)| 21.0.1-open | [Hampus](https://github.com/bjhara) |  |
 |   | 00:29.741 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_xpmatteo.java)| 21.0.1-open | [Matteo Vaccari](https://github.com/xpmatteo) |  |
-|   | 00:30.635 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_albertoventurini.java)| 21.0.1-open | [Alberto Venturini](https://github.com/albertoventurini) |  |
 |   | 00:32.018 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_padreati.java)| 21.0.1-open | [Aurelian Tutuianu](https://github.com/padreati) |  |
 |   | 00:34.388 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_twobiers.java)| 21.0.1-tem | [Tobi](https://github.com/twobiers) |  |
 |   | 00:35.875 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_MahmoudFawzyKhalil.java)| 21.0.1-open | [MahmoudFawzyKhalil](https://github.com/MahmoudFawzyKhalil) |  |

From 6bd2a21686718f1596c7ef01fe3313b4d419ec50 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Serkan=20=C3=96ZAL?= <sozal@catchpoint.com>
Date: Sun, 28 Jan 2024 13:56:30 +0300
Subject: [PATCH 160/268] serkan-ozal's 2nd submission with some minor
 improvements: (#612)

- use shared memory arena and region between worker threads
- reduce number of instructions slightly while processing file region
---
 calculate_average_serkan-ozal.sh              |   4 +-
 .../onebrc/CalculateAverage_serkan_ozal.java  | 127 ++++++++++--------
 2 files changed, 75 insertions(+), 56 deletions(-)

diff --git a/calculate_average_serkan-ozal.sh b/calculate_average_serkan-ozal.sh
index a903c1d39..857979b27 100755
--- a/calculate_average_serkan-ozal.sh
+++ b/calculate_average_serkan-ozal.sh
@@ -23,8 +23,10 @@ if [[ ! "$(uname -s)" = "Darwin" ]]; then
   JAVA_OPTS="$JAVA_OPTS -XX:+UseTransparentHugePages"
 fi
 
+CONFIGS="USE_SHARED_ARENA=true USE_SHARED_REGION=true CLOSE_STDOUT_ON_RESULT=true"
+
 #echo "Process started at $(date +%s%N | cut -b1-13)"
-eval "exec 3< <({ CLOSE_STDOUT_ON_RESULT=true USE_SHARED_ARENA=true java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_serkan_ozal; })"
+eval "exec 3< <({ $CONFIGS java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_serkan_ozal; })"
 read <&3 result
 echo -e "$result"
 #echo "Process finished at $(date +%s%N | cut -b1-13)"
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_serkan_ozal.java b/src/main/java/dev/morling/onebrc/CalculateAverage_serkan_ozal.java
index b02538358..8087919a6 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_serkan_ozal.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_serkan_ozal.java
@@ -69,9 +69,10 @@ public class CalculateAverage_serkan_ozal {
     private static final boolean USE_VTHREADS = getBooleanConfig("USE_VTHREADS", false);
     private static final int VTHREAD_COUNT = getIntegerConfig("VTHREAD_COUNT", 1024);
     private static final int REGION_COUNT = getIntegerConfig("REGION_COUNT", -1);
-    private static final boolean USE_SHARED_ARENA = getBooleanConfig("USE_SHARED_ARENA", false);
+    private static final boolean USE_SHARED_ARENA = getBooleanConfig("USE_SHARED_ARENA", true);
+    private static final boolean USE_SHARED_REGION = getBooleanConfig("USE_SHARED_REGION", true);
     private static final int MAP_CAPACITY = getIntegerConfig("MAP_CAPACITY", 1 << 17);
-    private static final boolean CLOSE_STDOUT_ON_RESULT = getBooleanConfig("CLOSE_STDOUT_ON_RESULT", false);
+    private static final boolean CLOSE_STDOUT_ON_RESULT = getBooleanConfig("CLOSE_STDOUT_ON_RESULT", true);
     ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
     // My dear old friend Unsafe
@@ -118,7 +119,11 @@ public static void main(String[] args) throws Exception {
             ExecutorService executor = USE_VTHREADS
                     ? Executors.newVirtualThreadPerTaskExecutor()
                     : Executors.newFixedThreadPool(concurrency, new RegionProcessorThreadFactory());
-
+            MemorySegment region = null;
+            if (USE_SHARED_REGION) {
+                arena = Arena.ofShared();
+                region = fc.map(FileChannel.MapMode.READ_ONLY, 0, fileSize, arena);
+            }
             // Split whole file into regions and start region processors to handle those regions
             List<Future<Response>> futures = new ArrayList<>(regionCount);
             for (int i = 0; i < regionCount; i++) {
@@ -128,7 +133,7 @@ public static void main(String[] args) throws Exception {
                 long closestLineEndPos = (i < regionCount - 1)
                         ? findClosestLineEnd(fc, endPos, lineBuffer)
                         : fileSize;
-                Request request = new Request(fc, arena, startPos, closestLineEndPos, result);
+                Request request = new Request(fc, arena, region, startPos, closestLineEndPos, result);
                 RegionProcessor regionProcessor = createRegionProcessor(request);
                 Future<Response> future = executor.submit(regionProcessor);
                 futures.add(future);
@@ -230,19 +235,20 @@ private static class RegionProcessor implements Callable<Response> {
 
         private final FileChannel fc;
         private final Arena arena;
+        private final MemorySegment region;
         private final long start;
         private final long end;
         private final long size;
-        private final OpenMap map;
         private final Result result;
+        private OpenMap map;
 
         private RegionProcessor(Request request) {
             this.fc = request.fileChannel;
             this.arena = request.arena;
+            this.region = request.region; // shared mapping of the file, or null when this processor has to map its own slice
             this.start = request.start;
             this.end = request.end;
             this.size = end - start;
-            this.map = new OpenMap();
             this.result = request.result;
         }
 
@@ -263,13 +269,21 @@ public Response call() throws Exception {
         }
 
         private void processRegion() throws Exception {
+            // Create the map here rather than in the constructor, so it is allocated by the thread that processes this region
+            this.map = new OpenMap();
+
             boolean arenaGiven = arena != null;
             // If no shared global memory arena is used, create and use its own local memory arena
             Arena a = arenaGiven ? arena : Arena.ofConfined();
             try {
-                MemorySegment region = fc.map(FileChannel.MapMode.READ_ONLY, start, size, a);
-
-                doProcessRegion(region);
+                boolean regionGiven = region != null;
+                MemorySegment r = regionGiven
+                        ? region
+                        : fc.map(FileChannel.MapMode.READ_ONLY, start, size, a);
+                long regionStart = regionGiven ? (r.address() + start) : r.address();
+                long regionEnd = regionStart + size;
+
+                doProcessRegion(r, r.address(), regionStart, regionEnd);
                 if (VERBOSE) {
                     System.out.println("[Processor-" + Thread.currentThread().getName() + "] Region processed at " + System.currentTimeMillis());
                 }
@@ -311,25 +325,23 @@ private void processRegion() throws Exception {
             }
         }
 
-        private void doProcessRegion(MemorySegment region) {
-            final long regionAddress = region.address();
-            final long regionSize = region.byteSize();
+        private void doProcessRegion(MemorySegment region, long regionAddress, long regionStart, long regionEnd) {
             final int vectorSize = BYTE_SPECIES.vectorByteSize();
-            final long regionMainLimit = regionSize - MAX_LINE_LENGTH;
+            final long regionMainLimit = regionEnd - MAX_LINE_LENGTH;
 
-            int regionPtr;
+            long regionPtr;
 
             // Read and process region - main
-            for (regionPtr = 0; regionPtr < regionMainLimit;) {
-                regionPtr = doProcessLine(region, regionAddress, vectorSize, regionPtr);
+            for (regionPtr = regionStart; regionPtr < regionMainLimit;) {
+                regionPtr = doProcessLine(region, regionAddress, regionPtr, vectorSize);
             }
 
             // Read and process region - tail
-            for (int i = regionPtr, j = regionPtr; i < regionSize;) {
-                byte b = U.getByte(regionAddress + i);
+            for (long i = regionPtr, j = regionPtr; i < regionEnd;) {
+                byte b = U.getByte(i);
                 if (b == KEY_VALUE_SEPARATOR) {
-                    long baseOffset = map.putKey(null, regionAddress, j, i - j);
-                    i = extractValue(regionAddress, i + 1, map, baseOffset);
+                    long baseOffset = map.putKey(null, j, (int) (i - j));
+                    i = extractValue(i + 1, map, baseOffset);
                     j = i;
                 }
                 else {
@@ -338,42 +350,41 @@ private void doProcessRegion(MemorySegment region) {
             }
         }
 
-        private int doProcessLine(MemorySegment region, long regionAddress, int vectorSize, int i) {
+        private long doProcessLine(MemorySegment region, long regionAddress, long regionPtr, int vectorSize) { // Processes the line that starts at absolute address regionPtr and returns the position to continue from
             // Find key/value separator
             ////////////////////////////////////////////////////////////////////////////////////////////////////////
-            int keyStartIdx = i;
+            long keyStartPtr = regionPtr; // remember where the key begins; regionPtr advances past it below
 
             // Vectorized search for key/value separator
-            ByteVector keyVector = ByteVector.fromMemorySegment(BYTE_SPECIES, region, i, NATIVE_BYTE_ORDER);
+            ByteVector keyVector = ByteVector.fromMemorySegment(BYTE_SPECIES, region, regionPtr - regionAddress, NATIVE_BYTE_ORDER); // the vector load takes a segment-relative offset, hence the subtraction
             int keyValueSepOffset = keyVector.compare(VectorOperators.EQ, KEY_VALUE_SEPARATOR).firstTrue();
             // Check whether key/value separator is found in the first vector (city name is <= vector size)
             if (keyValueSepOffset == vectorSize) {
-                i += vectorSize;
+                regionPtr += vectorSize; // separator not in the first vector: skip it and continue scanning behind it
                 keyValueSepOffset = 0;
-                for (; U.getByte(regionAddress + i) != KEY_VALUE_SEPARATOR; i++)
+                for (; U.getByte(regionPtr) != KEY_VALUE_SEPARATOR; regionPtr++) // byte-by-byte scan on absolute addresses
                     ;
                 // I have tried vectorized search for key/value separator in the remaining part,
                 // but since majority (99%) of the city names <= 16 bytes
                 // and other a few longer city names (have length < 16 and <= 32) not close to 32 bytes,
                 // byte by byte search is better in terms of performance (according to my experiments) and simplicity.
             }
-            i += keyValueSepOffset;
-            int keyLength = i - keyStartIdx;
-            i++;
+            regionPtr += keyValueSepOffset; // regionPtr now points at the separator
+            int keyLength = (int) (regionPtr - keyStartPtr); // number of key bytes in front of the separator
+            regionPtr++; // step over the separator to the first byte of the value
             ////////////////////////////////////////////////////////////////////////////////////////////////////////
 
             // Put key and get map offset to put value
-            long baseOffset = map.putKey(keyVector, regionAddress, keyStartIdx, keyLength);
+            long entryOffset = map.putKey(keyVector, keyStartPtr, keyLength); // offset of the map entry for this key
 
             // Extract value, put it into map and return next position in the region to continue processing from there
-            return extractValue(regionAddress, i, map, baseOffset);
+            return extractValue(regionPtr, map, entryOffset);
         }
-
     }
 
     // Credits: merykitty
-    private static int extractValue(long regionAddress, int idx, OpenMap map, long baseOffset) {
-        long word = U.getLong(regionAddress + idx);
+    private static long extractValue(long regionPtr, OpenMap map, long entryOffset) {
+        long word = U.getLong(regionPtr);
         if (NATIVE_BYTE_ORDER == ByteOrder.BIG_ENDIAN) {
             word = Long.reverseBytes(word);
         }
@@ -388,10 +399,10 @@ private static int extractValue(long regionAddress, int idx, OpenMap map, long b
         int value = (int) ((absValue ^ signed) - signed);
 
         // Put extracted value into map
-        map.putValue(baseOffset, value);
+        map.putValue(entryOffset, value);
 
         // Return new position
-        return idx + (decimalSepPos >>> 3) + 3;
+        return regionPtr + (decimalSepPos >>> 3) + 3;
     }
 
     /**
@@ -401,13 +412,16 @@ private static final class Request {
 
         private final FileChannel fileChannel;
         private final Arena arena;
+        private final MemorySegment region;
         private final long start;
         private final long end;
         private final Result result;
 
-        private Request(FileChannel fileChannel, Arena arena, long start, long end, Result result) {
+        private Request(FileChannel fileChannel, Arena arena, MemorySegment region,
+                        long start, long end, Result result) {
             this.fileChannel = fileChannel;
             this.arena = arena;
+            this.region = region;
             this.start = start;
             this.end = end;
             this.result = result;
@@ -555,8 +569,7 @@ private static int calculateKeyHash(long address, int keyLength) {
             return (Integer.rotateLeft(x * seed, rotate) ^ y) * seed;
         }
 
-        private long putKey(ByteVector keyVector, long regionAddress, long keyStartIdx, int keyLength) {
-            long keyStartAddress = regionAddress + keyStartIdx;
+        private long putKey(ByteVector keyVector, long keyStartAddress, int keyLength) {
             // Calculate hash of key
             int keyHash = calculateKeyHash(keyStartAddress, keyLength);
             // and get the position of the entry in the linear map based on calculated hash
@@ -565,23 +578,23 @@ private long putKey(ByteVector keyVector, long regionAddress, long keyStartIdx,
             // Start searching from the calculated position
             // and continue until find an available slot in case of hash collision
             // TODO Prevent infinite loop if all the slots are in use for other keys
-            for (long baseOffset = Unsafe.ARRAY_BYTE_BASE_OFFSET + (idx * ENTRY_SIZE);; baseOffset = (baseOffset + ENTRY_SIZE) & ENTRY_MASK) {
-                int keyStartOffset = (int) baseOffset + KEY_OFFSET;
-                int keySize = U.getInt(data, baseOffset + KEY_SIZE_OFFSET);
+            for (long entryOffset = Unsafe.ARRAY_BYTE_BASE_OFFSET + (idx * ENTRY_SIZE);; entryOffset = (entryOffset + ENTRY_SIZE) & ENTRY_MASK) {
+                int keyStartOffset = (int) entryOffset + KEY_OFFSET;
+                int keySize = U.getInt(data, entryOffset + KEY_SIZE_OFFSET);
                 // Check whether current index is empty (no another key is inserted yet)
                 if (keySize == 0) {
                     // Initialize entry slot for new key
-                    U.putShort(data, baseOffset + MIN_VALUE_OFFSET, Short.MAX_VALUE);
-                    U.putShort(data, baseOffset + MAX_VALUE_OFFSET, Short.MIN_VALUE);
-                    U.putInt(data, baseOffset + KEY_SIZE_OFFSET, keyLength);
+                    U.putShort(data, entryOffset + MIN_VALUE_OFFSET, Short.MAX_VALUE);
+                    U.putShort(data, entryOffset + MAX_VALUE_OFFSET, Short.MIN_VALUE);
+                    U.putInt(data, entryOffset + KEY_SIZE_OFFSET, keyLength);
                     U.copyMemory(null, keyStartAddress, data, keyStartOffset, keyLength);
-                    return baseOffset;
+                    return entryOffset;
                 }
                 // Check for hash collision (hashes are same, but keys are different).
                 // If there is no collision (both hashes and keys are equals), return current slot's offset.
                 // Otherwise, continue iterating until find an available slot.
                 if (keySize == keyLength && keysEqual(keyVector, keyStartAddress, keyLength, keyStartOffset)) {
-                    return baseOffset;
+                    return entryOffset;
                 }
             }
         }
@@ -633,15 +646,19 @@ private boolean keysEqual(ByteVector keyVector, long keyStartAddress, int keyLen
             return wordA == wordB;
         }
 
-        private void putValue(long baseOffset, int value) {
-            U.putInt(data, baseOffset + COUNT_OFFSET,
-                    U.getInt(data, baseOffset + COUNT_OFFSET) + 1);
-            U.putShort(data, baseOffset + MIN_VALUE_OFFSET,
-                    (short) Math.min(value, U.getShort(data, baseOffset + MIN_VALUE_OFFSET)));
-            U.putShort(data, baseOffset + MAX_VALUE_OFFSET,
-                    (short) Math.max(value, U.getShort(data, baseOffset + MAX_VALUE_OFFSET)));
-            U.putLong(data, baseOffset + VALUE_SUM_OFFSET,
-                    value + U.getLong(data, baseOffset + VALUE_SUM_OFFSET));
+        private void putValue(long entryOffset, int value) { // Folds one value into the entry at entryOffset: bumps the count, updates min/max and adds to the sum
+            long countOffset = entryOffset + COUNT_OFFSET;
+            U.putInt(data, countOffset, U.getInt(data, countOffset) + 1); // count is kept as an int
+            long minValueOffset = entryOffset + MIN_VALUE_OFFSET;
+            if (value < U.getShort(data, minValueOffset)) { // write only when a new minimum is seen
+                U.putShort(data, minValueOffset, (short) value);
+            }
+            long maxValueOffset = entryOffset + MAX_VALUE_OFFSET;
+            if (value > U.getShort(data, maxValueOffset)) { // write only when a new maximum is seen
+                U.putShort(data, maxValueOffset, (short) value);
+            }
+            long sumOffset = entryOffset + VALUE_SUM_OFFSET;
+            U.putLong(data, sumOffset, U.getLong(data, sumOffset) + value); // sum is accumulated as a long
         }
 
         private void merge(Map<String, KeyResult> resultMap) {

From 5f467c668a4177b3bdb0590f4d16ae8dc179c019 Mon Sep 17 00:00:00 2001
From: Gunnar Morling <gunnar.morling@googlemail.com>
Date: Sun, 28 Jan 2024 11:56:44 +0100
Subject: [PATCH 161/268] Leaderboard update

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 13cb89138..6c4a337d7 100644
--- a/README.md
+++ b/README.md
@@ -46,8 +46,8 @@ These are the results from running all entries into the challenge on eight cores
 | 2* | 00:02.149 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jerrinot.java)| 21.0.2-graal | [Jaromir Hamala](https://github.com/jerrinot) | GraalVM native binary, uses Unsafe |
 | 2* | 00:02.157 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_royvanrijn.java)| 21.0.2-graal | [Roy van Rijn](https://github.com/royvanrijn) | GraalVM native binary, uses Unsafe |
 | 3 | 00:02.195 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_thomaswue.java)| 21.0.2-graal | [Thomas Wuerthinger](https://github.com/thomaswue), [Quan Anh Mai](https://github.com/merykitty), [Alfonso² Peterssen](https://github.com/mukel) | GraalVM native binary, uses Unsafe |
+|   | 00:02.512 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_serkan-ozal.java)| 21.0.1-open | [Serkan ÖZAL](https://github.com/serkan-ozal) |  |
 |   | 00:02.575 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_merykittyunsafe.java)| 21.0.1-open | [Quan Anh Mai](https://github.com/merykitty) | uses Unsafe |
-|   | 00:02.933 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_serkan-ozal.java)| 21.0.1-open | [Serkan ÖZAL](https://github.com/serkan-ozal) |  |
 |   | 00:02.984 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_yourwass.java)| 21.0.1-open | [yourwass](https://github.com/yourwass) | uses Unsafe |
 |   | 00:03.258 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_merykitty.java)| 21.0.1-open | [Quan Anh Mai](https://github.com/merykitty) |  |
 |   | 00:03.298 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_vemanaNonIdiomatic.java)| 21.0.1-graal | [Subrahmanyam (non-idiomatic)](https://github.com/vemana) | uses Unsafe |

From 243f34f38bac2710978fa649aec681766128df6e Mon Sep 17 00:00:00 2001
From: Gunnar Morling <gunnar.morling@googlemail.com>
Date: Sun, 28 Jan 2024 16:25:04 +0100
Subject: [PATCH 162/268] Adding 10K eval script

---
 evaluate_10K.sh | 324 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 324 insertions(+)
 create mode 100755 evaluate_10K.sh

diff --git a/evaluate_10K.sh b/evaluate_10K.sh
new file mode 100755
index 000000000..6847d279b
--- /dev/null
+++ b/evaluate_10K.sh
@@ -0,0 +1,324 @@
+#!/bin/bash
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+set -eo pipefail
+
+if [ -z "$1" ]
+  then
+    echo "Usage: evaluate.sh <fork name> (<fork name 2> ...)"
+    echo " for each fork, there must be a 'calculate_average_<fork name>.sh' script and an optional 'prepare_<fork name>.sh'."
+    exit 1
+fi
+
+BOLD_WHITE='\033[1;37m'
+CYAN='\033[0;36m'
+GREEN='\033[0;32m'
+PURPLE='\033[0;35m'
+BOLD_RED='\033[1;31m'
+RED='\033[0;31m'
+BOLD_YELLOW='\033[1;33m'
+RESET='\033[0m' # No Color
+
+MEASUREMENTS_FILE="measurements_10K_1B.txt"
+RUNS=5
+DEFAULT_JAVA_VERSION="21.0.1-open"
+: "${BUILD_JAVA_VERSION:=21.0.1-open}"
+RUN_TIME_LIMIT=300 # seconds
+
+TIMEOUT=""
+if [ "$(uname -s)" == "Linux" ]; then
+  TIMEOUT="timeout -v $RUN_TIME_LIMIT"
+else # MacOs
+  if [ -x "$(command -v gtimeout)" ]; then
+    TIMEOUT="gtimeout -v $RUN_TIME_LIMIT" # from `brew install coreutils`
+  else
+    echo -e "${BOLD_YELLOW}WARNING${RESET} gtimeout not available, benchmark runs may take indefinitely long."
+  fi
+fi
+
+function check_command_installed { # $1: command name; exits 1 unless `command -v` resolves it to an executable
+  if ! [ -x "$(command -v "$1")" ]; then # "$1" is quoted so the name cannot word-split or glob
+    echo "Error: $1 is not installed." >&2
+    exit 1
+  fi
+}
+
+function print_and_execute() {
+  echo "+ $@" >&2
+  "$@"
+}
+
+check_command_installed java
+check_command_installed hyperfine
+check_command_installed jq
+check_command_installed bc
+
+# Validate that ./calculate_average_<fork>.sh exists for each fork
+for fork in "$@"; do
+  if [ ! -f "./calculate_average_$fork.sh" ]; then
+    echo -e "${BOLD_RED}ERROR${RESET}: ./calculate_average_$fork.sh does not exist." >&2
+    exit 1
+  fi
+done
+
+## SDKMAN Setup
+# 1. Custom check for sdkman installed; not sure why check_command_installed doesn't detect it properly
+if [ ! -f "$HOME/.sdkman/bin/sdkman-init.sh" ]; then
+     echo -e "${BOLD_RED}ERROR${RESET}: sdkman is not installed." >&2
+    exit 1
+fi
+
+# 2. Init sdkman in this script
+source "$HOME/.sdkman/bin/sdkman-init.sh"
+
+# 3. make sure the default java version is installed
+if [ ! -d "$HOME/.sdkman/candidates/java/$DEFAULT_JAVA_VERSION" ]; then
+  print_and_execute sdk install java $DEFAULT_JAVA_VERSION
+fi
+
+# 4. Install missing SDK java versions in any of the prepare_*.sh scripts for the provided forks
+for fork in "$@"; do
+  if [ -f "./prepare_$fork.sh" ]; then
+    grep -h "^sdk use" "./prepare_$fork.sh" | cut -d' ' -f4 | while read -r version; do
+      if [ ! -d "$HOME/.sdkman/candidates/java/$version" ]; then
+        print_and_execute sdk install java $version
+      fi
+    done || true # grep returns exit code 1 when no match, `|| true` prevents the script from exiting early
+  fi
+done
+## END - SDKMAN Setup
+
+# Check if SMT is enabled (we want it disabled)
+if [ -f "/sys/devices/system/cpu/smt/active" ]; then
+  if [ "$(cat /sys/devices/system/cpu/smt/active)" != "0" ]; then
+    echo -e "${BOLD_YELLOW}WARNING${RESET} SMT is enabled"
+  fi
+fi
+
+# Check if Turbo Boost is enabled (we want it disabled)
+if [ -f "/sys/devices/system/cpu/cpufreq/boost" ]; then
+  if [ "$(cat /sys/devices/system/cpu/cpufreq/boost)" != "0" ]; then
+    echo -e "${BOLD_YELLOW}WARNING${RESET} Turbo Boost is enabled"
+  fi
+fi
+
+print_and_execute sdk use java $BUILD_JAVA_VERSION
+print_and_execute java --version
+# print_and_execute ./mvnw --quiet clean verify
+
+print_and_execute rm -f measurements.txt
+print_and_execute ln -s $MEASUREMENTS_FILE measurements.txt
+
+echo ""
+
+# check if measurements_xxx.out exists
+if [ ! -f "${MEASUREMENTS_FILE%.txt}.out" ]; then
+  echo -e "${BOLD_RED}ERROR${RESET}: ${MEASUREMENTS_FILE%.txt}.out does not exist." >&2
+  echo "Please create it with:"
+  echo ""
+  echo "  ./calculate_average_baseline.sh > ${MEASUREMENTS_FILE%.txt}.out"
+  echo ""
+  exit 1
+fi
+
+# Run tests and benchmark for each fork
+filetimestamp=$(date  +"%Y%m%d%H%M%S") # same for all fork.out files from this run
+failed=()
+for fork in "$@"; do
+  set +e # we don't want prepare.sh, test.sh or hyperfine failing on 1 fork to exit the script early
+
+  # Run prepare script
+  if [ -f "./prepare_$fork.sh" ]; then
+    print_and_execute source "./prepare_$fork.sh"
+  else
+    print_and_execute sdk use java $DEFAULT_JAVA_VERSION
+  fi
+
+  # Run the test suite
+  print_and_execute $TIMEOUT ./test.sh $fork
+  if [ $? -ne 0 ]; then
+    failed+=("$fork")
+    echo ""
+    echo -e "${BOLD_RED}FAILURE${RESET}: ./test.sh $fork failed"
+
+    continue
+  fi
+  echo ""
+
+  # Run the test on $MEASUREMENTS_FILE; this serves as the warmup
+  print_and_execute $TIMEOUT ./test.sh $fork $MEASUREMENTS_FILE
+  if [ $? -ne 0 ]; then
+    failed+=("$fork")
+    echo ""
+    echo -e "${BOLD_RED}FAILURE${RESET}: ./test.sh $fork $MEASUREMENTS_FILE failed"
+
+    continue
+  fi
+  echo ""
+
+  # re-link measurements.txt since test.sh deleted it
+  print_and_execute rm -f measurements.txt
+  print_and_execute ln -s $MEASUREMENTS_FILE measurements.txt
+
+  # Use hyperfine to run the benchmark for each fork
+  HYPERFINE_OPTS="--warmup 0 --runs $RUNS --export-json $fork-$filetimestamp-timing.json --output ./$fork-$filetimestamp.out"
+
+  # check if this script is running on a Linux box
+  if [ "$(uname -s)" == "Linux" ]; then
+    check_command_installed numactl
+
+    # Linux platform
+    # prepend this with numactl --physcpubind=0-7 for running it only with 8 cores
+    numactl --physcpubind=0-7 hyperfine $HYPERFINE_OPTS "$TIMEOUT ./calculate_average_$fork.sh 2>&1"
+  else # MacOS
+    hyperfine $HYPERFINE_OPTS "$TIMEOUT ./calculate_average_$fork.sh 2>&1"
+  fi
+  # Catch hyperfine command failed
+  if [ $? -ne 0 ]; then
+    failed+=("$fork")
+    # Hyperfine already prints the error message
+    echo ""
+    continue
+  fi
+done
+set -e
+
+# Summary
+echo -e "${BOLD_WHITE}Summary${RESET}"
+for fork in "$@"; do
+  # skip reporting results for failed forks
+  if [[ " ${failed[@]} " =~ " ${fork} " ]]; then
+    echo -e "  ${RED}$fork${RESET}: command failed or output did not match"
+    continue
+  fi
+
+  # Trimmed mean = The slowest and the fastest runs are discarded, the
+  # mean value of the remaining three runs is the result for that contender
+  trimmed_mean=$(jq -r '.results[0].times | sort_by(.|tonumber) | .[1:-1] | add / length' $fork-$filetimestamp-timing.json)
+  raw_times=$(jq -r '.results[0].times | join(",")' $fork-$filetimestamp-timing.json)
+
+  if [ "$fork" == "$1" ]; then
+    color=$CYAN
+  elif [ "$fork" == "$2" ]; then
+    color=$GREEN
+  else
+    color=$PURPLE
+  fi
+
+  echo -e "  ${color}$fork${RESET}: trimmed mean ${BOLD_WHITE}$trimmed_mean${RESET}, raw times ${BOLD_WHITE}$raw_times${RESET}"
+done
+echo ""
+
+## Leaderboard - prints the leaderboard in Markdown table format
+echo -e "${BOLD_WHITE}Leaderboard${RESET}"
+
+# 1. Create a temp file to store the leaderboard entries
+leaderboard_temp_file=$(mktemp)
+
+# 2. Process each fork and append the 1-line entry to the temp file
+for fork in "$@"; do
+  # skip reporting results for failed forks
+  if [[ " ${failed[@]} " =~ " ${fork} " ]]; then
+    continue
+  fi
+
+  trimmed_mean=$(jq -r '.results[0].times | sort_by(.|tonumber) | .[1:-1] | add / length' $fork-$filetimestamp-timing.json)
+
+  # trimmed_mean is in seconds
+  # Format trimmed_mean as MM::SS.mmm
+  # using bc
+  trimmed_mean_minutes=$(echo "$trimmed_mean / 60" | bc)
+  trimmed_mean_seconds=$(echo "$trimmed_mean % 60 / 1" | bc)
+  trimmed_mean_ms=$(echo "($trimmed_mean - $trimmed_mean_minutes * 60 - $trimmed_mean_seconds) * 1000 / 1" | bc)
+  trimmed_mean_formatted=$(printf "%02d:%02d.%03d" $trimmed_mean_minutes $trimmed_mean_seconds $trimmed_mean_ms)
+
+  # Get Github user's name from public Github API (rate limited after ~50 calls, so results are cached in github_users.txt)
+  set +e
+  github_user__name=$(grep "^$fork;" github_users.txt | cut -d ';' -f2)
+  if [ -z "$github_user__name" ]; then
+    github_user__name=$(curl -s https://api.github.com/users/$fork | jq -r '.name' | tr -d '"')
+    if [ "$github_user__name" != "null" ]; then
+      echo "$fork;$github_user__name" >> github_users.txt
+    else
+      github_user__name=$fork
+    fi
+  fi
+  set -e
+
+  # Read java version from prepare_$fork.sh if it exists, otherwise assume 21.0.1-open
+  java_version="21.0.1-open"
+  # Hard-coding the note message for now
+  notes=""
+  if [ -f "./prepare_$fork.sh" ]; then
+    java_version=$(grep -F "sdk use java" ./prepare_$fork.sh | cut -d' ' -f4)
+
+    if grep -F "native-image" -q ./prepare_$fork.sh ; then
+      notes="GraalVM native binary"
+    fi
+  fi
+
+  # check if Java source file uses Unsafe
+  if grep -F "theUnsafe" -q ./src/main/java*/dev/morling/onebrc/CalculateAverage_$fork.java ; then
+    # if notes is not empty, append a comma and space before the unsafe note
+    notes="${notes:+$notes, }uses Unsafe"
+  fi
+
+  echo -n "$trimmed_mean;" >> $leaderboard_temp_file # for sorting
+  echo -n "| # " >> $leaderboard_temp_file
+  echo -n "| $trimmed_mean_formatted " >> $leaderboard_temp_file
+  echo -n "| [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_$fork.java)" >> $leaderboard_temp_file
+  echo -n "| $java_version " >> $leaderboard_temp_file
+  echo -n "| [$github_user__name](https://github.com/$fork) " >> $leaderboard_temp_file
+  echo -n "| $notes " >> $leaderboard_temp_file
+  echo "|" >> $leaderboard_temp_file
+done
+
+# 3. Sort leaderboard_temp_file by trimmed_mean and remove the sorting column
+sort -n $leaderboard_temp_file | cut -d ';' -f 2 > $leaderboard_temp_file.sorted
+
+# 4. Print the leaderboard
+echo ""
+echo "| # | Result (m:s.ms) | Implementation     | JDK | Submitter     | Notes     |"
+echo "|---|-----------------|--------------------|-----|---------------|-----------|"
+# If $leaderboard_temp_file.sorted has more than 3 entries, include rankings
+if [ $(wc -l < $leaderboard_temp_file.sorted) -gt 3 ]; then
+  head -n 1 $leaderboard_temp_file.sorted | tr '#' 1
+  head -n 2 $leaderboard_temp_file.sorted | tail -n 1 | tr '#' 2
+  head -n 3 $leaderboard_temp_file.sorted | tail -n 1 | tr '#' 3
+  tail -n+4 $leaderboard_temp_file.sorted | tr '#' ' '
+else
+  # Don't show rankings
+  cat $leaderboard_temp_file.sorted | tr '#' ' '
+fi
+echo ""
+
+# 5. Cleanup
+rm $leaderboard_temp_file
+## END - Leaderboard
+
+# Finalize .out files
+echo "Raw results saved to file(s):"
+for fork in "$@"; do
+  if [ -f "$fork-$filetimestamp-timing.json" ]; then
+      cat $fork-$filetimestamp-timing.json >> $fork-$filetimestamp.out
+      rm $fork-$filetimestamp-timing.json
+  fi
+
+  if [ -f "$fork-$filetimestamp.out" ]; then
+    echo "  $fork-$filetimestamp.out"
+  fi
+done

From 8ef22ab1bd055775ee1d1b63295feb35600741ce Mon Sep 17 00:00:00 2001
From: Jonathan Wright <jonathan-aotearoa@users.noreply.github.com>
Date: Sun, 28 Jan 2024 15:30:22 +0000
Subject: [PATCH 163/268] Initial submission for jonathan_aotearoa. (#586)

* Initial submission for jonathan_aotearoa

* Fixing typos

* Adding hyphens to prepare and calculate shell scripts so that they're aligned with my GitHub username.

* Making chunk processing more robust in attempt to fix the cause of the build error.

* Fixing typo.

* Fixed the handling of files less than 8 bytes in length.

* Additional assertion, comment improvements.

* Refactoring to improve testability. Additional assertion and comments.

* Updating collision checking to include checking if the station name is equal.

* Minor refactoring to make param ordering consistent.

* Adding a custom toString method for the results map.

* Fixing collision checking bug

* Fixing rounding bug.

* Fixing collision bug.

---------

Co-authored-by: jonathan <jonathan@example.com>
---
 calculate_average_jonathan-aotearoa.sh        |  27 +
 prepare_jonathan-aotearoa.sh                  |  28 +
 .../CalculateAverage_jonathanaotearoa.java    | 587 ++++++++++++++++++
 3 files changed, 642 insertions(+)
 create mode 100755 calculate_average_jonathan-aotearoa.sh
 create mode 100755 prepare_jonathan-aotearoa.sh
 create mode 100644 src/main/java/dev/morling/onebrc/CalculateAverage_jonathanaotearoa.java

diff --git a/calculate_average_jonathan-aotearoa.sh b/calculate_average_jonathan-aotearoa.sh
new file mode 100755
index 000000000..4375c3ca4
--- /dev/null
+++ b/calculate_average_jonathan-aotearoa.sh
@@ -0,0 +1,27 @@
+#!/bin/sh
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+if [ -f target/CalculateAverage_jonathan-aotearoa_image ]; then
+    echo "Using native image 'target/CalculateAverage_jonathan-aotearoa_image'. Delete this file to select JVM mode." 1>&2
+    target/CalculateAverage_jonathan-aotearoa_image
+else
+    JAVA_OPTS="--enable-preview -XX:+UnlockExperimentalVMOptions -XX:+TrustFinalNonStaticFields -dsa -XX:+UseNUMA"
+    JAVA_OPTS="$JAVA_OPTS -XX:+UseTransparentHugePages"
+    echo "Running in JVM mode as no native image was found. Run 'prepare_jonathan-aotearoa.sh' to generate a native image." 1>&2
+    java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_jonathanaotearoa
+fi
+
diff --git a/prepare_jonathan-aotearoa.sh b/prepare_jonathan-aotearoa.sh
new file mode 100755
index 000000000..bcf76acfa
--- /dev/null
+++ b/prepare_jonathan-aotearoa.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+source "$HOME/.sdkman/bin/sdkman-init.sh"
+sdk use java 21.0.2-graal 1>&2
+
+if [ ! -f target/CalculateAverage_jonathan-aotearoa_image ]; then
+    # Enable preview features and disable system assertions.
+    JAVA_OPTS="--enable-preview -dsa"
+    # Use the no-op GC.
+    # Enable CPU features (-march=native) and level-3 optimisations (-O3)
+    NATIVE_IMAGE_OPTS="--initialize-at-build-time=dev.morling.onebrc.CalculateAverage_jonathanaotearoa --gc=epsilon -O3 -march=native --strict-image-heap $JAVA_OPTS"
+    native-image $NATIVE_IMAGE_OPTS -cp target/average-1.0.0-SNAPSHOT.jar -o target/CalculateAverage_jonathan-aotearoa_image dev.morling.onebrc.CalculateAverage_jonathanaotearoa
+fi
\ No newline at end of file
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_jonathanaotearoa.java b/src/main/java/dev/morling/onebrc/CalculateAverage_jonathanaotearoa.java
new file mode 100644
index 000000000..cd626347b
--- /dev/null
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_jonathanaotearoa.java
@@ -0,0 +1,587 @@
+/*
+ *  Copyright 2023 The original authors
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package dev.morling.onebrc;
+
+import sun.misc.Unsafe;
+
+import java.io.IOException;
+import java.lang.foreign.Arena;
+import java.lang.reflect.Field;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.nio.channels.FileChannel;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.DirectoryStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.StandardOpenOption;
+import java.util.*;
+import java.util.concurrent.ForkJoinPool;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+
+public class CalculateAverage_jonathanaotearoa {
+
+    public static final Unsafe UNSAFE;
+
+    static {
+        try {
+            final Field theUnsafe = Unsafe.class.getDeclaredField("theUnsafe");
+            theUnsafe.setAccessible(true);
+            UNSAFE = (Unsafe) theUnsafe.get(null);
+        } catch (NoSuchFieldException | IllegalAccessException e) {
+            throw new RuntimeException(STR."Error getting instance of \{Unsafe.class.getName()}");
+        }
+    }
+
+    private static final int WORD_BYTES = Long.BYTES;
+    private static final Path FILE_PATH = Path.of("./measurements.txt");
+    private static final Path SAMPLE_DIR_PATH = Path.of("./src/test/resources/samples");
+    private static final byte MAX_LINE_BYTES = 107;
+    private static final byte NEW_LINE_BYTE = '\n';
+    private static final long SEPARATOR_XOR_MASK = 0x3b3b3b3b3b3b3b3bL;
+
+    // A mask where the 4th bit of the 5th, 6th and 7th bytes is set to 1.
+    // Leverages the fact that the 4th bit of a digit byte will be 1.
+    // Whereas the 4th bit of the decimal point byte will be 0.
+    // Assumes little endianness.
+    private static final long DECIMAL_POINT_MASK = 0x10101000L;
+
+    // This mask performs two tasks:
+    // Sets the right-most and 3 left-most bytes to zero.
+    // Given a temp value can be at most 5 bytes in length, e.g. -99.9, we can safely ignore the last 3 bytes.
+    // Subtracts 48, i.e. the UTF-8 value offset, from the digit bytes.
+    // As a result, '0' (48) becomes 0, '1' (49) becomes 1, and so on.
+    private static final long TEMP_DIGITS_MASK = 0x0f000f0f00L;
+
+    public static void main(final String[] args) throws IOException {
+        assert ByteOrder.nativeOrder() == ByteOrder.LITTLE_ENDIAN : "Big endian byte order is not supported";
+        System.out.println(resultsToString(processFile(FILE_PATH)));
+    }
+
+    /**
+     * Formats the results like {@code AbstractMap.toString()}: {@code {name=min/mean/max, ...}}.
+     * <p>
+     * Hand-rolled rather than delegating to the map so that we can:
+     * </p>
+     * <ul>
+     *     <li>Set the initial capacity of the string builder</li>
+     *     <li>Append the min/mean/max values directly, which avoids string creation</li>
+     * </ul>
+     *
+     * @param results the results; an empty map yields {@code "{}"}.
+     * @return a string representation of the results.
+     */
+    private static String resultsToString(final Map<String, TemperatureData> results) {
+        final Iterator<Map.Entry<String, TemperatureData>> i = results.entrySet().iterator();
+        if (!i.hasNext()) {
+            return "{}"; // the caller prints the result; printing here as well would emit "{}" twice
+        }
+        // Capacity based on the output for measurements.txt.
+        final StringBuilder sb = new StringBuilder(1100).append('{');
+        while (i.hasNext()) {
+            Map.Entry<String, TemperatureData> e = i.next();
+            sb.append(e.getKey())
+                    .append('=')
+                    .append(e.getValue().getMin())
+                    .append('/')
+                    .append(e.getValue().getMean())
+                    .append('/')
+                    .append(e.getValue().getMax());
+            if (i.hasNext()) {
+                sb.append(',').append(' ');
+            }
+        }
+        sb.append('}');
+        return sb.toString();
+    }
+
+    /**
+     * Opens the given file and aggregates its measurements.
+     * <p>
+     * Kept separate from the main method so that it can be exercised by tests.
+     * </p>
+     *
+     * @param filePath the path of the file we want to process.
+     * @return a sorted map of station data keyed by station name.
+     * @throws IOException if an error occurs.
+     */
+    private static SortedMap<String, TemperatureData> processFile(final Path filePath) throws IOException {
+        assert filePath != null : "filePath cannot be null";
+        assert Files.isRegularFile(filePath) : STR."\{filePath.toAbsolutePath()} is not a valid file";
+
+        try (final FileChannel channel = FileChannel.open(filePath, StandardOpenOption.READ)) {
+            final long size = channel.size();
+            // Anything shorter than one word cannot be read a long at a time,
+            // so such files take the simple, non-performant path instead.
+            final boolean tiny = size < WORD_BYTES;
+            return tiny
+                    ? processTinyFile(channel, size)
+                    : processFile(channel, size);
+        }
+    }
+
+    /**
+     * Simple, unoptimised fallback used by {@code processFile(Path)} for files shorter than one word.
+     * <p>
+     * Handling tiny files in a separate method reduces the complexity of {@link #processFile(FileChannel, long)}.
+     * </p>
+     *
+     * @param fc       the file channel to read from.
+     * @param fileSize the file size in bytes.
+     * @return a sorted map of station data keyed by station name.
+     * @throws IOException if an error occurs reading from the file channel.
+     */
+    private static SortedMap<String, TemperatureData> processTinyFile(final FileChannel fc, final long fileSize) throws IOException {
+        final ByteBuffer byteBuffer = ByteBuffer.allocate((int) fileSize);
+        fc.read(byteBuffer); // NOTE(review): return value is ignored; assumes a single read fills the buffer.
+        return new String(byteBuffer.array(), StandardCharsets.UTF_8)
+                .lines()
+                .map(line -> line.trim().split(";"))
+                .map(tokens -> {
+                    final String stationName = tokens[0];
+                    final short temp = Short.parseShort(tokens[1].replace(".", "")); // tenths, e.g. "12.3" -> 123
+                    return new SimpleStationData(stationName, temp);
+                })
+                .collect(Collectors.toMap(
+                        sd -> sd.name,
+                        sd -> sd,
+                        TemperatureData::merge,
+                        TreeMap::new));
+    }
+
+    /**
+     * An optimised method for processing files &gt;= {@link Long#BYTES} in size.
+     *
+     * @param fc       the file channel to map into memory.
+     * @param fileSize the file size in bytes.
+     * @return a sorted map of station data keyed by station name.
+     * @throws IOException if an error occurs mapping the file channel into memory.
+     */
+    private static SortedMap<String, TemperatureData> processFile(final FileChannel fc, final long fileSize) throws IOException {
+        assert fileSize >= WORD_BYTES : STR."File size cannot be less than word size \{WORD_BYTES}, but was \{fileSize}";
+
+        try (final Arena arena = Arena.ofConfined()) { // the mapping lives until this arena is closed
+            final long fileAddress = fc.map(FileChannel.MapMode.READ_ONLY, 0, fileSize, arena).address();
+            return createChunks(fileAddress, fileSize)
+                    .parallel()
+                    .map(CalculateAverage_jonathanaotearoa::processChunk)
+                    .flatMap(Repository::entries)
+                    .collect(Collectors.toMap( // per-chunk entries with the same station name are merged
+                            StationData::getName,
+                            sd -> sd,
+                            TemperatureData::merge,
+                            TreeMap::new));
+        }
+    }
+
+    /**
+     * Splits the mapped file into line-aligned chunks that can be processed in parallel.
+     * <p>
+     * One chunk is created per unit of {@link ForkJoinPool#getCommonPoolParallelism() parallelism}. If that would
+     * make a chunk smaller than the maximum line size in bytes, the whole file is returned as a single chunk.
+     * </p>
+     *
+     * @param fileAddress the address of the file.
+     * @param fileSize    the size of the file in bytes.
+     * @return a stream of chunks.
+     */
+    private static Stream<Chunk> createChunks(final long fileAddress, final long fileSize) {
+        // The number of cores - 1.
+        final int chunkCount = ForkJoinPool.getCommonPoolParallelism();
+        final long targetChunkSize = fileSize / chunkCount;
+        final long fileEnd = fileAddress + fileSize - 1;
+        if (targetChunkSize < MAX_LINE_BYTES) {
+            // Small file: not worth (or safe) splitting, so hand back one chunk covering everything.
+            return Stream.of(new Chunk(fileAddress, fileEnd, true));
+        }
+        final Chunk[] chunks = new Chunk[chunkCount];
+        final int lastIndex = chunkCount - 1;
+        long chunkStart = fileAddress;
+        for (int index = 0; index < lastIndex; index++) {
+            // Step forward by the target size, then walk back to the nearest newline.
+            // One must exist inside this chunk because targetChunkSize >= MAX_LINE_BYTES.
+            long chunkEnd = chunkStart + targetChunkSize;
+            while (UNSAFE.getByte(chunkEnd) != NEW_LINE_BYTE) {
+                chunkEnd--;
+            }
+            // chunkEnd now sits on the newline that terminates the chunk's final line.
+            chunks[index] = new Chunk(chunkStart, chunkEnd, false);
+            chunkStart = chunkEnd + 1;
+        }
+        // Whatever is left goes to the last chunk, which may be slightly bigger than the others.
+        // For a 1 billion line file, this has zero impact.
+        chunks[lastIndex] = new Chunk(chunkStart, fileEnd, true);
+        return Stream.of(chunks);
+    }
+
+    /**
+     * Parses every line of a chunk, adding each station's temperature to a fresh repository.
+     *
+     * @param chunk the chunk to process.
+     * @return a repository containing the chunk's station data.
+     */
+    private static Repository processChunk(final Chunk chunk) {
+        final Repository repo = new Repository();
+        long address = chunk.startAddress;
+
+        while (address <= chunk.lastByteAddress) {
+            // Read the station name one 8-byte word at a time, hashing as we go.
+            long nameAddress = address;
+            long nameWord;
+            long separatorMask;
+            int nameHash = 1;
+
+            while (true) {
+                nameWord = chunk.getWord(address);
+
+                // Based on the Hacker's Delight "Find First 0-Byte" branch-free, 5-instruction, algorithm.
+                // See also https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
+                final long separatorXorResult = nameWord ^ SEPARATOR_XOR_MASK;
+                // If the separator is not present, all bits in the mask will be zero.
+                // If the separator is present, the high bit (0x80) of the corresponding byte in the mask will be 1.
+                separatorMask = (separatorXorResult - 0x0101010101010101L) & (~separatorXorResult & 0x8080808080808080L);
+                if (separatorMask == 0) {
+                    address += Long.BYTES;
+                    // Multiplicative hashing, as per Arrays.hashCode().
+                    // We could use XOR here, but it "might" produce more collisions.
+                    nameHash = 31 * nameHash + Long.hashCode(nameWord);
+                }
+                else {
+                    break;
+                }
+            }
+
+            // We've found the separator.
+            // We only support little endian, so we use the *trailing* number of zeros to get the number of name bits.
+            final int numberOfNameBits = Long.numberOfTrailingZeros(separatorMask) & ~7;
+            final int numberOfNameBytes = numberOfNameBits >> 3;
+            final long separatorAddress = address + numberOfNameBytes;
+
+            if (numberOfNameBytes > 0) {
+                // Truncate the word, so we only have the portion before the separator, i.e. the name bytes.
+                final int bitsToDiscard = Long.SIZE - numberOfNameBits;
+                // Little endian: the name occupies the low-order bytes, so clear the high-order ones.
+                final long truncatedNameWord = (nameWord << bitsToDiscard) >>> bitsToDiscard;
+                nameHash = 31 * nameHash + Long.hashCode(truncatedNameWord);
+            }
+
+            final long tempAddress = separatorAddress + 1;
+            final long tempWord = chunk.getWord(tempAddress);
+
+            // "0" in UTF-8 is 48, which is 00110000 in binary.
+            // The first 4 bits of any UTF-8 digit byte are therefore 0011.
+
+            // Get the position of the decimal point...
+            // "." in UTF-8 is 46, which is 00101110 in binary.
+            // We can therefore use the 4th bit to check which byte is the decimal point.
+            final int decimalPointIndex = Long.numberOfTrailingZeros(~tempWord & DECIMAL_POINT_MASK) >> 3;
+
+            // Check if we've got a negative or positive number...
+            // "-" in UTF-8 is 45, which is 00101101 in binary.
+            // As per above, we use the 4th bit to check if the word contains a positive, or negative, temperature.
+            // If the temperature is negative, the value of "sign" will be -1. If it's positive, it'll be 0.
+            final long sign = (~tempWord << 59) >> 63;
+
+            // Create a mask that zeros out the minus-sign byte, if present.
+            // Little endian, i.e. the minus sign is the right-most (lowest-order) byte.
+            final long signMask = ~(sign & 0xFF);
+
+            // To get the temperature value, we left-shift the digit bytes into the following, known, positions.
+            // 0x00 0x00 0x00 <fractional-digit> 0x00 <integer-part-digit> <integer-part-digit> 0x00
+            // Because we're ANDing with the sign mask, if the value only has a single integer-part digit, the right-most one will be zero.
+            final int leftShift = (3 - decimalPointIndex) * Byte.SIZE;
+            final long digitsWord = ((tempWord & signMask) << leftShift) & TEMP_DIGITS_MASK;
+
+            // Get the unsigned value in tenths of a degree, e.g. "12.3" becomes 123.
+            final byte b100 = (byte) (digitsWord >> 8);
+            final byte b10 = (byte) (digitsWord >> 16);
+            final byte b1 = (byte) (digitsWord >> 32);
+            final short unsignedTemp = (short) (b100 * 100 + b10 * 10 + b1);
+            final short temp = (short) ((unsignedTemp + sign) ^ sign); // negates when sign is -1, no-op when 0
+
+            final byte nameSize = (byte) (separatorAddress - nameAddress);
+            repo.addTemp(nameHash, nameAddress, nameSize, temp);
+
+            // Calculate the address of the next line.
+            address = tempAddress + decimalPointIndex + 3; // skips the decimal point, fractional digit and newline
+        }
+
+        return repo;
+    }
+
+    /**
+     * Represents a portion of a file containing 1 or more whole lines.
+     *
+     * @param startAddress    the memory address of the first byte.
+     * @param lastByteAddress the memory address of the last byte.
+     * @param lastWordAddress the memory address of the last whole word.
+     * @param isLast          whether this is the last chunk.
+     */
+    private record Chunk(long startAddress, long lastByteAddress, long lastWordAddress, boolean isLast) {
+
+        public Chunk(final long startAddress, final long lastByteAddress, final boolean isLast) {
+            this(startAddress, lastByteAddress, lastByteAddress - (Long.BYTES - 1), isLast); // The last whole word starts (Long.BYTES - 1) bytes before the last byte.
+
+            assert lastByteAddress > startAddress : STR."lastByteAddress \{lastByteAddress} must be > startAddress \{startAddress}";
+            assert lastWordAddress >= startAddress : STR."lastWordAddress \{lastWordAddress} must be >= startAddress \{startAddress}";
+        }
+
+        /**
+         * Gets an 8 byte word from this chunk.
+         * <p>
+         * If the specified address is greater than {@link Chunk#lastWordAddress} and {@link Chunk#isLast}, the word
+         * is read at {@link Chunk#lastWordAddress} instead and shifted right, i.e. truncated. This ensures we never read beyond the end of the file.
+         * </p>
+         *
+         * @param address the address of the word we want; asserted to be in the range [startAddress, lastByteAddress).
+         * @return the word at the specified address; if it was truncated, its high-order bytes are zero.
+         */
+        public long getWord(final long address) {
+            assert address >= startAddress : STR."address must be >= startAddress \{startAddress}, but was \{address}";
+            assert address < lastByteAddress : STR."address must be < lastByteAddress \{lastByteAddress}, but was \{address}";
+
+            if (isLast && address > lastWordAddress) {
+                // Make sure we don't read beyond the end of the file and potentially crash the JVM.
+                final long word = UNSAFE.getLong(lastWordAddress);
+                final int bytesToDiscard = (int) (address - lastWordAddress);
+                // As with elsewhere, this assumes little endianness.
+                return word >>> (bytesToDiscard << 3); // Drops the bytes that precede address; the vacated high-order bytes are zero.
+            }
+            return UNSAFE.getLong(address);
+        }
+    }
+
+    /**
+     * Abstract class encapsulating temperature data: the running min, max, sum and count of the values added so far.
+     */
+    private static abstract class TemperatureData {
+
+        private short min; // Stored multiplied by 10; getMin() divides by 10.0.
+        private short max; // Stored multiplied by 10; getMax() divides by 10.0.
+        private long sum; // Sum of the stored (multiplied by 10) values.
+        private int count; // Number of values added, including the one passed to the constructor.
+
+        protected TemperatureData(final short temp) {
+            min = max = temp;
+            sum = temp;
+            count = 1;
+        }
+
+        void addTemp(final short temp) {
+            if (temp < min) {
+                min = temp;
+            }
+            else if (temp > max) { // min <= max always holds, so a new minimum can never also be a new maximum.
+                max = temp;
+            }
+            sum += temp;
+            count++;
+        }
+
+        TemperatureData merge(final TemperatureData other) { // Folds other into this instance and returns this.
+            if (other.min < min) {
+                min = other.min;
+            }
+            if (other.max > max) {
+                max = other.max;
+            }
+            sum += other.sum;
+            count += other.count;
+            return this;
+        }
+
+        double getMin() {
+            return round(((double) min) / 10.0);
+        }
+
+        double getMax() {
+            return round(((double) max) / 10.0);
+        }
+
+        double getMean() {
+            return round((((double) sum) / 10.0) / count);
+        }
+
+        private static double round(double value) { // Rounds to one decimal place.
+            return Math.round(value * 10.0) / 10.0; // Math.round rounds ties towards positive infinity.
+        }
+    }
+
+    /**
+     * Temperature data keyed by a station name held as a {@link String}. For use with tiny files.
+     *
+     * @see CalculateAverage_jonathanaotearoa#processTinyFile(FileChannel, long)
+     */
+    private static final class SimpleStationData extends TemperatureData implements Comparable<SimpleStationData> {
+
+        private final String name;
+
+        SimpleStationData(final String name, final short temp) {
+            super(temp);
+            this.name = name;
+        }
+
+        @Override
+        public int compareTo(final SimpleStationData other) { // Orders by station name.
+            return name.compareTo(other.name);
+        }
+    }
+
+    private static final class StationData extends TemperatureData implements Comparable<StationData> {
+
+        private final int nameHash;
+        private final long nameAddress; // Memory address of the first byte of the station name.
+        private final byte nameSize; // Length of the station name in bytes.
+        private String name; // Decoded lazily by getName(); null until then.
+
+        StationData(final int nameHash, final long nameAddress, final byte nameSize, final short temp) {
+            super(temp);
+            this.nameAddress = nameAddress;
+            this.nameSize = nameSize;
+            this.nameHash = nameHash;
+        }
+
+        @Override
+        public int compareTo(final StationData other) { // Orders by decoded station name.
+            return getName().compareTo(other.getName());
+        }
+
+        String getName() {
+            if (name == null) { // Decode on first use only; later calls return the cached string.
+                final byte[] nameBytes = new byte[nameSize];
+                UNSAFE.copyMemory(null, nameAddress, nameBytes, UNSAFE.arrayBaseOffset(nameBytes.getClass()), nameSize); // Copy nameSize raw bytes from nameAddress into the array.
+                name = new String(nameBytes, StandardCharsets.UTF_8);
+            }
+            return name;
+        }
+    }
+
+    /**
+     * Open addressing, linear probing, hash map repository.
+     */
+    private static final class Repository {
+
+        private static final int CAPACITY = 100_003; // Number of slots in the table.
+        private static final int LAST_INDEX = CAPACITY - 1;
+
+        private final StationData[] table;
+
+        public Repository() {
+            this.table = new StationData[CAPACITY];
+        }
+
+        /**
+         * Adds a station temperature value to this repository.
+         *
+         * @param nameHash    the station name hash.
+         * @param nameAddress the station name address in memory.
+         * @param nameSize    the station name size in bytes.
+         * @param temp        the temperature value.
+         */
+        public void addTemp(final int nameHash, final long nameAddress, final byte nameSize, short temp) {
+            final int index = findIndex(nameHash, nameAddress, nameSize);
+            if (table[index] == null) { // Empty slot: this station has not been stored yet.
+                table[index] = new StationData(nameHash, nameAddress, nameSize, temp);
+            }
+            else {
+                table[index].addTemp(temp);
+            }
+        }
+
+        public Stream<StationData> entries() { // Streams the occupied slots only.
+            return Arrays.stream(table).filter(Objects::nonNull);
+        }
+
+        private int findIndex(int nameHash, final long nameAddress, final byte nameSize) {
+            // Think about replacing modulo.
+            // https://lemire.me/blog/2018/08/20/performance-of-ranged-accesses-into-arrays-modulo-multiply-shift-and-masks/
+            int index = (nameHash & 0x7FFFFFFF) % CAPACITY; // Clear the sign bit so the index is never negative.
+            while (isCollision(index, nameHash, nameAddress, nameSize)) {
+                index = index == LAST_INDEX ? 0 : index + 1; // Linear probe, wrapping round to the first slot.
+            }
+            return index;
+        }
+
+        private boolean isCollision(final int index, final long nameHash, final long nameAddress, final byte nameSize) { // NOTE(review): nameHash is an int everywhere else; long here only widens it.
+            final StationData existing = table[index];
+            if (existing == null) {
+                return false;
+            }
+            if (nameHash != existing.nameHash) {
+                return true;
+            }
+            if (nameSize != existing.nameSize) {
+                return true;
+            }
+            // Last resort; check if the names are the same.
+            // This is a real performance hit :(
+            return !isMemoryEqual(nameAddress, existing.nameAddress, nameSize);
+        }
+
+        /**
+         * Checks if two locations in memory have the same value.
+         *
+         * @param address1 the address of the first location.
+         * @param address2 the address of the second location.
+         * @param size     the number of bytes to check for equality.
+         * @return true if both addresses contain the same bytes.
+         */
+        private static boolean isMemoryEqual(final long address1, final long address2, final byte size) {
+            // Checking 1 byte at a time, so we can bail as early as possible.
+            for (int offset = 0; offset < size; offset++) {
+                final byte b1 = UNSAFE.getByte(address1 + offset);
+                final byte b2 = UNSAFE.getByte(address2 + offset);
+                if (b1 != b2) {
+                    return false;
+                }
+            }
+            return true;
+        }
+    }
+
+    /**
+     * Helper for running tests without blowing away the main measurements.txt file.
+     * Saves regenerating the 1 billion line file after each test run.
+     * Enable assertions in the IDE run config.
+     */
+    public static final class TestRunner {
+        public static void main(String[] args) throws IOException {
+            final StringBuilder testResults = new StringBuilder();
+            try (DirectoryStream<Path> dirStream = Files.newDirectoryStream(SAMPLE_DIR_PATH, "*.txt")) {
+                dirStream.forEach(filePath -> {
+                    testResults.append(STR."Testing '\{filePath.getFileName()}'... ");
+                    final String expectedResultFileName = filePath.getFileName().toString().replace(".txt", ".out");
+                    try {
+                        final String expected = Files.readString(SAMPLE_DIR_PATH.resolve(expectedResultFileName));
+                        final SortedMap<String, TemperatureData> results = processFile(filePath);
+                        // Appending \n to the results string to mimic println().
+                        final String actual = STR."\{resultsToString(results)}\n";
+                        if (actual.equals(expected)) {
+                            testResults.append("Passed\n");
+                        } else {
+                            testResults.append("Failed. Actual output does not match expected\n");
+                        }
+                    } catch (IOException e) {
+                        throw new RuntimeException(STR."Error testing '\{filePath.getFileName()}");
+                    }
+                });
+            } finally {
+                System.out.println(testResults);
+            }
+        }
+    }
+}

From 97334e862187b958425acc495575d481be0becf3 Mon Sep 17 00:00:00 2001
From: John Ziamos <iziamos@gmail.com>
Date: Sun, 28 Jan 2024 16:03:42 +0000
Subject: [PATCH 164/268] use long for string equals (#613)

use more generic hashcode
---
 .../onebrc/CalculateAverage_iziamos.java      | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_iziamos.java b/src/main/java/dev/morling/onebrc/CalculateAverage_iziamos.java
index ad2bf052b..faa60547d 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_iziamos.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_iziamos.java
@@ -185,7 +185,7 @@ public int getStringLength() {
 
             byte b = UNSAFE.getByte(pointer);
             for (; b != ';'; ++strLen, b = UNSAFE.getByte(pointer + strLen)) {
-                hash += b << strLen;
+                hash = 31 * hash + b;
             }
             pointer += strLen + 1;
 
@@ -351,22 +351,25 @@ private static int findSlot(final long baseAddress,
             }
         }
 
-        private static boolean stringEquals(final long thisNameAddress, final int thisStringLength, final long otherNameAddress, final long otherNameLength) {
+        private static boolean stringEquals(final long thisNameAddress,
+                                            final int thisStringLength,
+                                            final long otherNameAddress,
+                                            final long otherNameLength) {
             if (thisStringLength != otherNameLength) {
                 return false;
             }
 
             int i = 0;
-            for (; i < thisStringLength - 3; i += 4) {
-                if (UNSAFE.getInt(thisNameAddress + i) != UNSAFE.getInt(otherNameAddress + i)) {
+            for (; i < thisStringLength - 7; i += 8) {
+                if (UNSAFE.getLong(thisNameAddress + i) != UNSAFE.getLong(otherNameAddress + i)) {
                     return false;
                 }
             }
 
-            final int remainingToCheck = thisStringLength - i;
-            final int finalBytesMask = ((1 << remainingToCheck * 8)) - 1;
-            final int thisLastWord = UNSAFE.getInt(thisNameAddress + i);
-            final int otherLastWord = UNSAFE.getInt(otherNameAddress + i);
+            final long remainingToCheck = thisStringLength - i;
+            final long finalBytesMask = ((1L << remainingToCheck * 8)) - 1;
+            final long thisLastWord = UNSAFE.getLong(thisNameAddress + i);
+            final long otherLastWord = UNSAFE.getLong(otherNameAddress + i);
 
             return 0 == ((thisLastWord ^ otherLastWord) & finalBytesMask);
         }

From b3d6659d6828a44bcdd341a3a39bcb7f437cb548 Mon Sep 17 00:00:00 2001
From: Andrzej Nestoruk <and.nestoruk@gmail.com>
Date: Sun, 28 Jan 2024 17:12:32 +0100
Subject: [PATCH 165/268] anestoruk submission  (#617)

* initial implementation

* few improvements and a cleanup (down to ~12s)
---
 calculate_average_anestoruk.sh                |  19 ++
 .../onebrc/CalculateAverage_anestoruk.java    | 193 ++++++++++++++++++
 2 files changed, 212 insertions(+)
 create mode 100755 calculate_average_anestoruk.sh
 create mode 100644 src/main/java/dev/morling/onebrc/CalculateAverage_anestoruk.java

diff --git a/calculate_average_anestoruk.sh b/calculate_average_anestoruk.sh
new file mode 100755
index 000000000..9db63c898
--- /dev/null
+++ b/calculate_average_anestoruk.sh
@@ -0,0 +1,19 @@
+#!/bin/sh
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+JAVA_OPTS="--enable-preview" # The class uses unnamed `_` lambda parameters and java.lang.foreign, which are preview features on JDK 21.
+java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_anestoruk
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_anestoruk.java b/src/main/java/dev/morling/onebrc/CalculateAverage_anestoruk.java
new file mode 100644
index 000000000..add938fca
--- /dev/null
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_anestoruk.java
@@ -0,0 +1,193 @@
+/*
+ *  Copyright 2023 The original authors
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package dev.morling.onebrc;
+
+import java.io.IOException;
+import java.lang.foreign.Arena;
+import java.lang.foreign.MemorySegment;
+import java.nio.channels.FileChannel;
+import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.TreeMap;
+import java.util.concurrent.CompletableFuture;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+
+import static java.lang.Math.ceil;
+import static java.lang.Math.max;
+import static java.lang.Math.min;
+import static java.lang.Runtime.getRuntime;
+import static java.lang.foreign.ValueLayout.JAVA_BYTE;
+import static java.nio.channels.FileChannel.MapMode.READ_ONLY;
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static java.util.concurrent.CompletableFuture.supplyAsync;
+
+public class CalculateAverage_anestoruk {
+    // Splits the memory-mapped input into line-aligned ranges, aggregates each range on a worker thread, then merges by name.
+    private static final String path = "./measurements.txt";
+    private static final int cpus = getRuntime().availableProcessors();
+
+    public static void main(String[] args) throws IOException {
+        List<SegmentRange> rangeList = new ArrayList<>();
+        MemorySegment segment;
+        // Files of up to 10,000 bytes are a single chunk; larger ones get fileSize / cpus per chunk, capped below Integer.MAX_VALUE.
+        try (FileChannel channel = FileChannel.open(Path.of(path))) {
+            final long fileSize = channel.size();
+            final long chunkSize = fileSize > 10_000 ? min(Integer.MAX_VALUE - 256, fileSize / cpus) : fileSize;
+            final int chunks = (int) ceil((double) fileSize / chunkSize);
+            segment = channel.map(READ_ONLY, 0, fileSize, Arena.global());
+            long startOffset = 0;
+            long size = chunkSize;
+            for (int i = 0; i < chunks && size > 0; i++) {
+                long endOffset = startOffset + size;
+                while (endOffset < fileSize && segment.get(JAVA_BYTE, endOffset) != '\n') { // Extend the range to the end of its last line.
+                    endOffset++;
+                }
+                rangeList.add(new SegmentRange(startOffset, endOffset)); // endOffset is the position of a newline, or fileSize.
+                startOffset = endOffset + 1; // The next range starts right after that newline.
+                size = min(chunkSize, fileSize - startOffset);
+            }
+        }
+        // Process every range on the pool, then fold the partial maps into one sorted map in submission order.
+        TreeMap<String, Record> result = new TreeMap<>();
+        try (ExecutorService executor = Executors.newFixedThreadPool(cpus)) {
+            List<CompletableFuture<Map<ByteWrapper, Record>>> futures = new ArrayList<>();
+            for (SegmentRange range : rangeList) {
+                futures.add(supplyAsync(() -> process(range, segment), executor));
+            }
+            for (CompletableFuture<Map<ByteWrapper, Record>> future : futures) {
+                try {
+                    Map<ByteWrapper, Record> partialResult = future.get();
+                    combine(result, partialResult);
+                }
+                catch (InterruptedException | ExecutionException ex) {
+                    throw new RuntimeException(ex);
+                }
+            }
+        }
+
+        System.out.println(result);
+    }
+    // Parses every "name;value\n" line of the range; values are accumulated as integers with the decimal point dropped.
+    private static Map<ByteWrapper, Record> process(SegmentRange range, MemorySegment segment) {
+        Map<ByteWrapper, Record> partialResult = new HashMap<>(1_000);
+        byte[] buffer = new byte[100]; // Scratch space for the name bytes of the current line.
+        long offset = range.startOffset;
+        byte b;
+        while (offset < range.endOffset) {
+            int cityIdx = 0;
+            while ((b = segment.get(JAVA_BYTE, offset++)) != ';') {
+                buffer[cityIdx++] = b;
+            }
+            byte[] city = new byte[cityIdx];
+            System.arraycopy(buffer, 0, city, 0, cityIdx);
+            ByteWrapper cityWrapper = new ByteWrapper(city);
+
+            int value = 0;
+            boolean negative;
+            if ((b = segment.get(JAVA_BYTE, offset++)) == '-') {
+                negative = true;
+            }
+            else {
+                negative = false;
+                value = b - '0';
+            }
+            while ((b = segment.get(JAVA_BYTE, offset++)) != '\n') {
+                if (b != '.') { // Skip the decimal point, so "12.3" is accumulated as 123.
+                    value = value * 10 + (b - '0');
+                }
+            }
+            int temperature = negative ? -value : value;
+
+            partialResult.compute(cityWrapper, (_, record) -> update(record, temperature));
+        }
+        return partialResult;
+    }
+
+    private record SegmentRange(long startOffset, long endOffset) {
+    }
+
+    private record ByteWrapper(byte[] bytes) { // Map key comparing the name bytes by content rather than by array identity.
+
+        @Override
+        public boolean equals(Object o) {
+            if (this == o) return true;
+            if (o == null || getClass() != o.getClass()) return false;
+            ByteWrapper that = (ByteWrapper) o;
+            return Arrays.equals(bytes, that.bytes);
+        }
+
+        @Override
+        public int hashCode() {
+            return Arrays.hashCode(bytes);
+        }
+    }
+
+    private static class Record {
+        // min, max and sum hold values with the decimal point dropped; toString() divides by 10.0.
+        private int min;
+        private int max;
+        private long sum;
+        private int count;
+
+        public Record(int temperature) {
+            this.min = temperature;
+            this.max = temperature;
+            this.sum = temperature;
+            this.count = 1;
+        }
+
+        @Override
+        public String toString() {
+            return String.format(java.util.Locale.ROOT, "%.1f/%.1f/%.1f", // Locale.ROOT keeps '.' as the decimal separator whatever the default locale is.
+                    (min / 10.0),
+                    ((double) sum / count / 10.0),
+                    (max / 10.0));
+        }
+    }
+    // Creates the record for a first value, or folds the value into the existing record; used as a compute() remapper.
+    private static Record update(Record record, int temperature) {
+        if (record == null) {
+            return new Record(temperature);
+        }
+        record.min = min(record.min, temperature);
+        record.max = max(record.max, temperature);
+        record.sum += temperature;
+        record.count++;
+        return record;
+    }
+    // Merges one partial map into the sorted result, decoding each name as UTF-8.
+    private static void combine(TreeMap<String, Record> result, Map<ByteWrapper, Record> partialResult) {
+        partialResult.forEach((wrapper, partialRecord) -> {
+            String city = new String(wrapper.bytes, UTF_8);
+            result.compute(city, (_, record) -> {
+                if (record == null) {
+                    return partialRecord;
+                }
+                record.min = min(record.min, partialRecord.min);
+                record.max = max(record.max, partialRecord.max);
+                record.sum += partialRecord.sum;
+                record.count += partialRecord.count;
+                return record;
+            });
+        });
+    }
+}

From 9531407119ee14ad867d6f21e999d6c44c188a5a Mon Sep 17 00:00:00 2001
From: Gunnar Morling <gunnar.morling@googlemail.com>
Date: Sun, 28 Jan 2024 17:13:21 +0100
Subject: [PATCH 166/268] Leaderboard update

---
 README.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 6c4a337d7..cb64f3704 100644
--- a/README.md
+++ b/README.md
@@ -62,12 +62,13 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:03.902 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jparera.java)| 21.0.1-open | [Juan Parera](https://github.com/jparera) |  |
 |   | 00:03.966 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jincongho.java)| 21.0.1-open | [Jin Cong Ho](https://github.com/jincongho) | uses Unsafe |
 |   | 00:04.066 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_JesseVanRooy.java)| 21.0.1-open | [JesseVanRooy](https://github.com/JesseVanRooy) | uses Unsafe |
-|   | 00:04.154 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_iziamos.java)| 21.0.1-open | [John Ziamos](https://github.com/iziamos) | uses Unsafe |
+|   | 00:04.230 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_iziamos.java)| 21.0.1-open | [John Ziamos](https://github.com/iziamos) | uses Unsafe |
 |   | 00:04.255 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_tivrfoa.java)| 21.0.2-graal | [tivrfoa](https://github.com/tivrfoa) | GraalVM native binary, uses Unsafe |
 |   | 00:04.684 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_gigiblender.java)| 21.0.1-open | [Florin Blanaru](https://github.com/gigiblender) | uses Unsafe |
 |   | 00:04.741 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_cliffclick.java)| 21.0.1-open | [Cliff Click](https://github.com/cliffclick) | uses Unsafe |
 |   | 00:04.800 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_parkertimmins.java)| 21.0.1-open | [Parker Timmins](https://github.com/parkertimmins) |  |
 |   | 00:04.920 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_vemana.java)| 21.0.1-graal | [Subrahmanyam](https://github.com/vemana) |  |
+|   | 00:05.077 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jonathan-aotearoa.java)| 21.0.2-graal | [Jonathan Wright](https://github.com/jonathan-aotearoa) | GraalVM native binary |
 |   | 00:05.142 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_arjenw.java)| 21.0.1-open | [Arjen Wisse](https://github.com/arjenw) |  |
 |   | 00:05.235 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_unbounded.java)| 21.0.1-open | [unbounded](https://github.com/unbounded) |  |
 |   | 00:05.336 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_plevart.java)| 21.0.1-tem | [Peter Levart](https://github.com/plevart) |  |
@@ -126,6 +127,7 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:12.568 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_MeanderingProgrammer.java)| 21.0.1-graal | [Vlad](https://github.com/MeanderingProgrammer) |  |
 |   | 00:12.800 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_yonatang.java)| java | [Yonatan Graber](https://github.com/yonatang) |  |
 |   | 00:13.013 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_thanhtrinity.java)| 21.0.1-graal | [Thanh Duong](https://github.com/thanhtrinity) |  |
+|   | 00:13.029 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_anestoruk.java)| 21.0.1-open | [Andrzej Nestoruk](https://github.com/anestoruk) |  |
 |   | 00:13.071 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ianopolous.java)| 21.0.1-open | [Dr Ian Preston](https://github.com/ianopolous) |  |
 |   | 00:13.817 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_entangled90.java)| 21.0.1-open | [Carlo](https://github.com/entangled90) |  |
 |   | 00:14.502 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_eriklumme.java)| 21.0.1-graal | [eriklumme](https://github.com/eriklumme) |  |

From c8dd691a2751aa250ea8d9ba827666a7699e7411 Mon Sep 17 00:00:00 2001
From: Gunnar Morling <gunnar.morling@googlemail.com>
Date: Sun, 28 Jan 2024 17:59:46 +0100
Subject: [PATCH 167/268] Leaderboard update

---
 README.md | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index cb64f3704..87cbf8662 100644
--- a/README.md
+++ b/README.md
@@ -41,11 +41,11 @@ These are the results from running all entries into the challenge on eight cores
 
 | # | Result (m:s.ms) | Implementation     | JDK | Submitter     | Notes     |
 |---|-----------------|--------------------|-----|---------------|-----------|
-| 1 | 00:02.019 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java)| 21.0.2-graal | [Artsiom Korzun](https://github.com/artsiomkorzun) | GraalVM native binary, uses Unsafe |
-| 2* | 00:02.146 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java)| 21.0.2-graal | [Van Phu DO](https://github.com/abeobk) | GraalVM native binary, uses Unsafe |
-| 2* | 00:02.149 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jerrinot.java)| 21.0.2-graal | [Jaromir Hamala](https://github.com/jerrinot) | GraalVM native binary, uses Unsafe |
-| 2* | 00:02.157 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_royvanrijn.java)| 21.0.2-graal | [Roy van Rijn](https://github.com/royvanrijn) | GraalVM native binary, uses Unsafe |
-| 3 | 00:02.195 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_thomaswue.java)| 21.0.2-graal | [Thomas Wuerthinger](https://github.com/thomaswue), [Quan Anh Mai](https://github.com/merykitty), [Alfonso² Peterssen](https://github.com/mukel) | GraalVM native binary, uses Unsafe |
+| 1 | 00:01.893 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_thomaswue.java)| 21.0.2-graal | [Thomas Wuerthinger](https://github.com/thomaswue), [Quan Anh Mai](https://github.com/merykitty), [Alfonso² Peterssen](https://github.com/mukel) | GraalVM native binary, uses Unsafe |
+| 2 | 00:02.019 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java)| 21.0.2-graal | [Artsiom Korzun](https://github.com/artsiomkorzun) | GraalVM native binary, uses Unsafe |
+| 3* | 00:02.146 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java)| 21.0.2-graal | [Van Phu DO](https://github.com/abeobk) | GraalVM native binary, uses Unsafe |
+| 3* | 00:02.149 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jerrinot.java)| 21.0.2-graal | [Jaromir Hamala](https://github.com/jerrinot) | GraalVM native binary, uses Unsafe |
+| 3* | 00:02.157 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_royvanrijn.java)| 21.0.2-graal | [Roy van Rijn](https://github.com/royvanrijn) | GraalVM native binary, uses Unsafe |
 |   | 00:02.512 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_serkan-ozal.java)| 21.0.1-open | [Serkan ÖZAL](https://github.com/serkan-ozal) |  |
 |   | 00:02.575 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_merykittyunsafe.java)| 21.0.1-open | [Quan Anh Mai](https://github.com/merykitty) | uses Unsafe |
 |   | 00:02.984 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_yourwass.java)| 21.0.1-open | [yourwass](https://github.com/yourwass) | uses Unsafe |

From 7e525c599236a24984b3b1f2b8fd7ec5dbb960bc Mon Sep 17 00:00:00 2001
From: Thomas Wuerthinger <thomas.wuerthinger@oracle.com>
Date: Sun, 28 Jan 2024 17:59:57 +0100
Subject: [PATCH 168/268] Some fine tuning for thomaswue (#606)

* Some fine tuning.

* Process 2MB segments to make all threads finish at the same time.
Process with 3 scanners in parallel in the same thread.
---
 prepare_thomaswue.sh                          |   2 +-
 .../onebrc/CalculateAverage_thomaswue.java    | 391 +++++++++++-------
 2 files changed, 240 insertions(+), 153 deletions(-)

diff --git a/prepare_thomaswue.sh b/prepare_thomaswue.sh
index 32616a958..10dc73280 100755
--- a/prepare_thomaswue.sh
+++ b/prepare_thomaswue.sh
@@ -20,7 +20,7 @@ sdk use java 21.0.2-graal 1>&2
 
 # ./mvnw clean verify removes target/ and will re-trigger native image creation.
 if [ ! -f target/CalculateAverage_thomaswue_image ]; then
-    NATIVE_IMAGE_OPTS="--gc=epsilon -O3 -march=native --enable-preview -H:InlineAllBonus=10 -H:-ParseRuntimeOptions --initialize-at-build-time=dev.morling.onebrc.CalculateAverage_thomaswue\$Scanner"
+    NATIVE_IMAGE_OPTS="--gc=epsilon -O3 -H:-GenLoopSafepoints -march=native --enable-preview -H:InlineAllBonus=10 -H:-ParseRuntimeOptions --initialize-at-build-time=dev.morling.onebrc.CalculateAverage_thomaswue\$Scanner"
     # Use -H:MethodFilter=CalculateAverage_thomaswue.* -H:Dump=:2 -H:PrintGraph=Network for IdealGraphVisualizer graph dumping.
     native-image $NATIVE_IMAGE_OPTS -cp target/average-1.0.0-SNAPSHOT.jar -o target/CalculateAverage_thomaswue_image dev.morling.onebrc.CalculateAverage_thomaswue
 fi
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_thomaswue.java b/src/main/java/dev/morling/onebrc/CalculateAverage_thomaswue.java
index 406c85d38..c02a8813a 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_thomaswue.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_thomaswue.java
@@ -23,16 +23,17 @@
 import java.nio.file.Path;
 import java.nio.file.StandardOpenOption;
 import java.util.*;
+import java.util.concurrent.atomic.AtomicLong;
 import java.util.stream.IntStream;
 
 /**
  * Simple solution that memory maps the input file, then splits it into one segment per available core and uses
  * sun.misc.Unsafe to directly access the mapped memory. Uses a long at a time when checking for collision.
  * <p>
- * Runs in 0.60s on my Intel i9-13900K
+ * Runs in 0.41s on my Intel i9-13900K
  * Perf stats:
- *     34,716,719,245      cpu_core/cycles/
- *     40,776,530,892      cpu_atom/cycles/
+ *     25,286,227,376      cpu_core/cycles/
+ *     26,833,723,225      cpu_atom/cycles/
  */
 public class CalculateAverage_thomaswue {
     private static final String FILE = "./measurements.txt";
@@ -42,10 +43,11 @@ public class CalculateAverage_thomaswue {
     // Holding the current result for a single city.
     private static class Result {
         long lastNameLong, secondLastNameLong;
-        long[] name;
-        int count;
-        short min, max;
+        long min, max;
         long sum;
+        int count;
+        long[] name;
+        String nameAsString;
 
         private Result() {
             this.min = MAX_TEMP;
@@ -73,36 +75,59 @@ private void add(Result other) {
         }
 
         public String calcName() {
-            ByteBuffer bb = ByteBuffer.allocate(name.length * Long.BYTES).order(ByteOrder.nativeOrder());
-            bb.asLongBuffer().put(name);
-            byte[] array = bb.array();
-            int i = 0;
-            while (array[i++] != ';')
-                ;
-            return new String(array, 0, i - 1, StandardCharsets.UTF_8);
+            if (nameAsString == null) {
+                ByteBuffer bb = ByteBuffer.allocate(name.length * Long.BYTES).order(ByteOrder.nativeOrder());
+                bb.asLongBuffer().put(name);
+                byte[] array = bb.array();
+                int i = 0;
+                while (array[i++] != ';')
+                    ;
+                nameAsString = new String(array, 0, i - 1, StandardCharsets.UTF_8);
+            }
+            return nameAsString;
         }
     }
 
-    public static void main(String[] args) throws IOException {
+    public static void main(String[] args) throws IOException, InterruptedException {
         if (args.length == 0 || !("--worker".equals(args[0]))) {
             spawnWorker();
             return;
         }
         // Calculate input segments.
-        int numberOfChunks = Runtime.getRuntime().availableProcessors();
-        long[] chunks = getSegments(numberOfChunks);
+        int numberOfWorkers = Runtime.getRuntime().availableProcessors();
+        final AtomicLong cursor = new AtomicLong();
+        final long fileEnd;
+        final long fileStart;
+
+        try (var fileChannel = FileChannel.open(Path.of(FILE), StandardOpenOption.READ)) {
+            long fileSize = fileChannel.size();
+            fileStart = fileChannel.map(FileChannel.MapMode.READ_ONLY, 0, fileSize, java.lang.foreign.Arena.global()).address();
+            cursor.set(fileStart);
+            fileEnd = fileStart + fileSize;
+        }
 
         // Parallel processing of segments.
-        List<List<Result>> allResults = IntStream.range(0, chunks.length - 1).mapToObj(chunkIndex -> parseLoop(chunks[chunkIndex], chunks[chunkIndex + 1]))
-                .map(resultArray -> {
-                    List<Result> results = new ArrayList<>();
-                    for (Result r : resultArray) {
-                        if (r != null) {
-                            results.add(r);
-                        }
+        Thread[] threads = new Thread[numberOfWorkers];
+        List<Result>[] allResults = new List[numberOfWorkers];
+        for (int i = 0; i < threads.length; ++i) {
+            final int index = i;
+            threads[i] = new Thread(() -> {
+                Result[] resultArray = parseLoop(cursor, fileEnd, fileStart);
+                List<Result> results = new ArrayList<>(500);
+                for (Result r : resultArray) {
+                    if (r != null) {
+                        r.calcName();
+                        results.add(r);
                     }
-                    return results;
-                }).parallel().toList();
+                }
+                allResults[index] = results;
+            });
+            threads[i].start();
+        }
+
+        for (Thread thread : threads) {
+            thread.join();
+        }
 
         // Final output.
         System.out.println(accumulateResults(allResults));
@@ -115,17 +140,12 @@ private static void spawnWorker() throws IOException {
         info.command().ifPresent(workerCommand::add);
         info.arguments().ifPresent(args -> workerCommand.addAll(Arrays.asList(args)));
         workerCommand.add("--worker");
-        new ProcessBuilder()
-                .command(workerCommand)
-                .inheritIO()
-                .redirectOutput(ProcessBuilder.Redirect.PIPE)
-                .start()
-                .getInputStream()
-                .transferTo(System.out);
+        new ProcessBuilder().command(workerCommand).inheritIO().redirectOutput(ProcessBuilder.Redirect.PIPE)
+                .start().getInputStream().transferTo(System.out);
     }
 
     // Accumulate results sequentially for simplicity.
-    private static TreeMap<String, Result> accumulateResults(List<List<Result>> allResults) {
+    private static TreeMap<String, Result> accumulateResults(List<Result>[] allResults) {
         TreeMap<String, Result> result = new TreeMap<>();
         for (List<Result> resultArr : allResults) {
             for (Result r : resultArr) {
@@ -139,141 +159,220 @@ private static TreeMap<String, Result> accumulateResults(List<List<Result>> allR
         return result;
     }
 
-    // Main parse loop.
-    private static Result[] parseLoop(long chunkStart, long chunkEnd) {
-        Result[] results = new Result[1 << 17];
-        Scanner scanner = new Scanner(chunkStart, chunkEnd);
-        long word = scanner.getLong();
-        long pos = findDelimiter(word);
-        while (scanner.hasNext()) {
-            long nameAddress = scanner.pos();
-            long hash = 0;
-
-            // Search for ';', one long at a time.
+    private static Result findResult(long initialWord, long initialPos, Scanner scanner, Result[] results) {
+
+        Result existingResult;
+        long word = initialWord;
+        long pos = initialPos;
+        long hash;
+        long nameAddress = scanner.pos();
+
+        // Search for ';', one long at a time.
+        if (pos != 0) {
+            pos = Long.numberOfTrailingZeros(pos) >>> 3;
+            scanner.add(pos);
+            word = mask(word, pos);
+            hash = word;
+
+            int index = hashToIndex(hash, results);
+            existingResult = results[index];
+
+            if (existingResult != null && existingResult.lastNameLong == word) {
+                return existingResult;
+            }
+            else {
+                scanner.setPos(nameAddress + pos);
+            }
+        }
+        else {
+            scanner.add(8);
+            hash = word;
+            long prevWord = word;
+            word = scanner.getLong();
+            pos = findDelimiter(word);
             if (pos != 0) {
                 pos = Long.numberOfTrailingZeros(pos) >>> 3;
                 scanner.add(pos);
                 word = mask(word, pos);
-                hash = word;
-
-                int number = scanNumber(scanner);
-                long nextWord = scanner.getLong();
-                long nextPos = findDelimiter(nextWord);
-
-                Result existingResult = results[hashToIndex(hash, results)];
-                if (existingResult != null && existingResult.lastNameLong == word) {
-                    word = nextWord;
-                    pos = nextPos;
-                    record(existingResult, number);
-                    continue;
-                }
+                hash ^= word;
+                int index = hashToIndex(hash, results);
+                existingResult = results[index];
 
-                scanner.setPos(nameAddress + pos);
+                if (existingResult != null && existingResult.lastNameLong == word && existingResult.secondLastNameLong == prevWord) {
+                    return existingResult;
+                }
+                else {
+                    scanner.setPos(nameAddress + pos + 8);
+                }
             }
             else {
                 scanner.add(8);
-                hash = word;
-                long prevWord = word;
-                word = scanner.getLong();
-                pos = findDelimiter(word);
-                if (pos != 0) {
-                    pos = Long.numberOfTrailingZeros(pos) >>> 3;
-                    scanner.add(pos);
-                    word = mask(word, pos);
-                    hash ^= word;
-
-                    Result existingResult = results[hashToIndex(hash, results)];
-                    if (existingResult != null && existingResult.lastNameLong == word && existingResult.secondLastNameLong == prevWord) {
-                        int number = scanNumber(scanner);
-                        word = scanner.getLong();
-                        pos = findDelimiter(word);
-                        record(existingResult, number);
-                        continue;
+                hash ^= word;
+                while (true) {
+                    word = scanner.getLong();
+                    pos = findDelimiter(word);
+                    if (pos != 0) {
+                        pos = Long.numberOfTrailingZeros(pos) >>> 3;
+                        scanner.add(pos);
+                        word = mask(word, pos);
+                        hash ^= word;
+                        break;
                     }
-                }
-                else {
-                    scanner.add(8);
-                    hash ^= word;
-                    while (true) {
-                        word = scanner.getLong();
-                        pos = findDelimiter(word);
-                        if (pos != 0) {
-                            pos = Long.numberOfTrailingZeros(pos) >>> 3;
-                            scanner.add(pos);
-                            word = mask(word, pos);
-                            hash ^= word;
-                            break;
-                        }
-                        else {
-                            scanner.add(8);
-                            hash ^= word;
-                        }
+                    else {
+                        scanner.add(8);
+                        hash ^= word;
                     }
                 }
             }
+        }
 
-            // Save length of name for later.
-            int nameLength = (int) (scanner.pos() - nameAddress);
-            int number = scanNumber(scanner);
+        // Save length of name for later.
+        int nameLength = (int) (scanner.pos() - nameAddress);
 
-            // Final calculation for index into hash table.
-            int tableIndex = hashToIndex(hash, results);
-            outer: while (true) {
-                Result existingResult = results[tableIndex];
-                if (existingResult == null) {
-                    existingResult = newEntry(results, nameAddress, tableIndex, nameLength, scanner);
-                }
-                // Check for collision.
-                int i = 0;
-                int namePos = 0;
-                for (; i < nameLength + 1 - 8; i += 8) {
-                    if (namePos >= existingResult.name.length || existingResult.name[namePos++] != scanner.getLongAt(nameAddress + i)) {
-                        tableIndex = (tableIndex + 31) & (results.length - 1);
-                        continue outer;
-                    }
+        // Final calculation for index into hash table.
+        int tableIndex = hashToIndex(hash, results);
+        outer: while (true) {
+            existingResult = results[tableIndex];
+            if (existingResult == null) {
+                existingResult = newEntry(results, nameAddress, tableIndex, nameLength, scanner);
+            }
+            // Check for collision.
+            int i = 0;
+            long[] name = existingResult.name;
+            for (; i < nameLength + 1 - 8; i += 8) {
+                if (scanner.getLongAt(i, name) != scanner.getLongAt(nameAddress + i)) {
+                    tableIndex = (tableIndex + 31) & (results.length - 1);
+                    continue outer;
                 }
+            }
 
-                int remainingShift = (64 - (nameLength + 1 - i) << 3);
-                if (((existingResult.lastNameLong ^ (scanner.getLongAt(nameAddress + i) << remainingShift)) == 0)) {
-                    record(existingResult, number);
+            int remainingShift = (64 - (nameLength + 1 - i) << 3);
+            if (((existingResult.lastNameLong ^ (scanner.getLongAt(nameAddress + i) << remainingShift)) == 0)) {
+                break;
+            }
+            else {
+                // Collision error, try next.
+                tableIndex = (tableIndex + 31) & (results.length - 1);
+            }
+        }
+        return existingResult;
+    }
+
+    private static long nextNL(long prev) {
+        while (true) {
+            long currentWord = Scanner.UNSAFE.getLong(prev);
+            long pos = findNewLine(currentWord);
+            if (pos != 0) {
+                prev += Long.numberOfTrailingZeros(pos) >>> 3;
+                break;
+            }
+            else {
+                prev += 8;
+            }
+        }
+        return prev;
+    }
+
+    private static final int SEGMENT_SIZE = 1024 * 1024 * 2;
+
+    // Main parse loop.
+    private static Result[] parseLoop(AtomicLong counter, long fileEnd, long fileStart) {
+        Result[] results = new Result[1 << 17];
+
+        while (true) {
+            long current = counter.addAndGet(SEGMENT_SIZE) - SEGMENT_SIZE;
+
+            if (current >= fileEnd) {
+                return results;
+            }
+
+            long segmentEnd = nextNL(Math.min(fileEnd - 1, current + SEGMENT_SIZE));
+            long segmentStart;
+            if (current == fileStart) {
+                segmentStart = current;
+            }
+            else {
+                segmentStart = nextNL(current) + 1;
+            }
+
+            long dist = (segmentEnd - segmentStart) / 3;
+            long midPoint1 = nextNL(segmentStart + dist);
+            long midPoint2 = nextNL(segmentStart + dist + dist);
+
+            Scanner scanner1 = new Scanner(segmentStart, midPoint1);
+            Scanner scanner2 = new Scanner(midPoint1 + 1, midPoint2);
+            Scanner scanner3 = new Scanner(midPoint2 + 1, segmentEnd);
+            while (true) {
+                if (!scanner1.hasNext()) {
                     break;
                 }
-                else {
-                    // Collision error, try next.
-                    tableIndex = (tableIndex + 31) & (results.length - 1);
+                if (!scanner2.hasNext()) {
+                    break;
                 }
+                if (!scanner3.hasNext()) {
+                    break;
+                }
+
+                long word1 = scanner1.getLong();
+                long word2 = scanner2.getLong();
+                long word3 = scanner3.getLong();
+                long pos1 = findDelimiter(word1);
+                long pos2 = findDelimiter(word2);
+                long pos3 = findDelimiter(word3);
+                Result existingResult1 = findResult(word1, pos1, scanner1, results);
+                Result existingResult2 = findResult(word2, pos2, scanner2, results);
+                Result existingResult3 = findResult(word3, pos3, scanner3, results);
+                long number1 = scanNumber(scanner1);
+                long number2 = scanNumber(scanner2);
+                long number3 = scanNumber(scanner3);
+                record(existingResult1, number1);
+                record(existingResult2, number2);
+                record(existingResult3, number3);
             }
 
-            word = scanner.getLong();
-            pos = findDelimiter(word);
+            while (scanner1.hasNext()) {
+                long word = scanner1.getLong();
+                long pos = findDelimiter(word);
+                record(findResult(word, pos, scanner1, results), scanNumber(scanner1));
+            }
+
+            while (scanner2.hasNext()) {
+                long word = scanner2.getLong();
+                long pos = findDelimiter(word);
+                record(findResult(word, pos, scanner2, results), scanNumber(scanner2));
+            }
+
+            while (scanner3.hasNext()) {
+                long word = scanner3.getLong();
+                long pos = findDelimiter(word);
+                record(findResult(word, pos, scanner3, results), scanNumber(scanner3));
+            }
         }
-        return results;
     }
 
-    private static int scanNumber(Scanner scanPtr) {
+    private static long scanNumber(Scanner scanPtr) {
         scanPtr.add(1);
         long numberWord = scanPtr.getLong();
         int decimalSepPos = Long.numberOfTrailingZeros(~numberWord & 0x10101000);
-        int number = convertIntoNumber(decimalSepPos, numberWord);
+        long number = convertIntoNumber(decimalSepPos, numberWord);
         scanPtr.add((decimalSepPos >>> 3) + 3);
         return number;
     }
 
-    private static void record(Result existingResult, int number) {
+    private static void record(Result existingResult, long number) {
         if (number < existingResult.min) {
-            existingResult.min = (short) number;
+            existingResult.min = number;
         }
         if (number > existingResult.max) {
-            existingResult.max = (short) number;
+            existingResult.max = number;
         }
         existingResult.sum += number;
         existingResult.count++;
     }
 
     private static int hashToIndex(long hash, Result[] results) {
-        int hashAsInt = (int) (hash ^ (hash >>> 28));
-        int finalHash = (hashAsInt ^ (hashAsInt >>> 17));
-        return (finalHash & (results.length - 1));
+        long hashAsInt = hash ^ (hash >>> 37) ^ (hash >>> 17);
+        return (int) (hashAsInt & (results.length - 1));
     }
 
     private static long mask(long word, long pos) {
@@ -281,7 +380,7 @@ private static long mask(long word, long pos) {
     }
 
     // Special method to convert a number in the ascii number into an int without branches created by Quan Anh Mai.
-    private static int convertIntoNumber(int decimalSepPos, long numberWord) {
+    private static long convertIntoNumber(int decimalSepPos, long numberWord) {
         int shift = 28 - decimalSepPos;
         // signed is -1 if negative, 0 otherwise
         long signed = (~numberWord << 59) >> 63;
@@ -292,8 +391,7 @@ private static int convertIntoNumber(int decimalSepPos, long numberWord) {
         // 0xUU00TTHH00 * (100 * 0x1000000 + 10 * 0x10000 + 1) =
         // 0x000000UU00TTHH00 + 0x00UU00TTHH000000 * 10 + 0xUU00TTHH00000000 * 100
         long absValue = ((digits * 0x640a0001) >>> 32) & 0x3FF;
-        long value = (absValue ^ signed) - signed;
-        return (int) value;
+        return (absValue ^ signed) - signed;
     }
 
     private static long findDelimiter(long word) {
@@ -302,6 +400,12 @@ private static long findDelimiter(long word) {
         return tmp;
     }
 
+    private static long findNewLine(long word) {
+        long input = word ^ 0x0A0A0A0A0A0A0A0AL;
+        long tmp = (input - 0x0101010101010101L) & ~input & 0x8080808080808080L;
+        return tmp;
+    }
+
     private static Result newEntry(Result[] results, long nameAddress, int hash, int nameLength, Scanner scanner) {
         Result r = new Result();
         results[hash] = r;
@@ -324,27 +428,6 @@ private static Result newEntry(Result[] results, long nameAddress, int hash, int
         return r;
     }
 
-    private static long[] getSegments(int numberOfChunks) throws IOException {
-        try (var fileChannel = FileChannel.open(Path.of(FILE), StandardOpenOption.READ)) {
-            long fileSize = fileChannel.size();
-            long segmentSize = (fileSize + numberOfChunks - 1) / numberOfChunks;
-            long[] chunks = new long[numberOfChunks + 1];
-            long mappedAddress = fileChannel.map(FileChannel.MapMode.READ_ONLY, 0, fileSize, java.lang.foreign.Arena.global()).address();
-            chunks[0] = mappedAddress;
-            long endAddress = mappedAddress + fileSize;
-            Scanner s = new Scanner(mappedAddress, mappedAddress + fileSize);
-            for (int i = 1; i < numberOfChunks; ++i) {
-                long chunkAddress = mappedAddress + i * segmentSize;
-                // Align to first row start.
-                while (chunkAddress < endAddress && (s.getLongAt(chunkAddress++) & 0xFF) != '\n')
-                    ;
-                chunks[i] = Math.min(chunkAddress, endAddress);
-            }
-            chunks[numberOfChunks] = endAddress;
-            return chunks;
-        }
-    }
-
     private static class Scanner {
 
         private static final sun.misc.Unsafe UNSAFE = initUnsafe();
@@ -387,6 +470,10 @@ long getLongAt(long pos) {
             return UNSAFE.getLong(pos);
         }
 
+        long getLongAt(long pos, long[] array) {
+            return UNSAFE.getLong(array, pos + sun.misc.Unsafe.ARRAY_LONG_BASE_OFFSET);
+        }
+
         void setPos(long l) {
             this.pos = l;
         }

From d5854d65e60b381d66e746eef1fa31c1770aee7c Mon Sep 17 00:00:00 2001
From: Aleksei <971356+bytesfellow@users.noreply.github.com>
Date: Sun, 28 Jan 2024 18:06:18 +0100
Subject: [PATCH 169/268] Bytesfellow initial submission (#619)

* Latest snapshot (#1)

preparing initial version

* Improved performance to 20 seconds (-9 seconds from the previous version) (#2)

improved performance a bit

* Improved performance to 14 seconds (-6 seconds) (#3)

improved performance to 14 seconds

* sync branches (#4)

* initial commit

* some refactoring of methods

* some fixes for partitioning

* some fixes for partitioning

* fixed hacky getcode for utf8 bytes

* simplified getcode for partitioning

* temp solution with syncing

* temp solution with syncing

* new stream processing

* new stream processing

* some improvements

* cleaned stuff

* run configuration

* round buffer for the stream to pages

* not using compute since it's slower than straightforward get/put. using own byte array equals.

* using parallel gc

* avoid copying bytes when creating a station object

* formatting

* Copy less arrays. Improved performance to 12.7 seconds (-2 seconds) (#5)

* initial commit

* some refactoring of methods

* some fixes for partitioning

* some fixes for partitioning

* fixed hacky getcode for utf8 bytes

* simplified getcode for partitioning

* temp solution with syncing

* temp solution with syncing

* new stream processing

* new stream processing

* some improvements

* cleaned stuff

* run configuration

* round buffer for the stream to pages

* not using compute since it's slower than straightforward get/put. using own byte array equals.

* using parallel gc

* avoid copying bytes when creating a station object

* formatting

* some tuning to increase performance

* some tuning to increase performance

* avoid copying data; fast hashCode with slightly more collisions

* avoid copying data; fast hashCode with slightly more collisions

* cleanup (#6)

* tidy up
---
 calculate_average_bytesfellow.sh              |  19 +
 .../onebrc/CalculateAverage_bytesfellow.java  | 557 ++++++++++++++++++
 2 files changed, 576 insertions(+)
 create mode 100755 calculate_average_bytesfellow.sh
 create mode 100644 src/main/java/dev/morling/onebrc/CalculateAverage_bytesfellow.java

diff --git a/calculate_average_bytesfellow.sh b/calculate_average_bytesfellow.sh
new file mode 100755
index 000000000..eb21169e3
--- /dev/null
+++ b/calculate_average_bytesfellow.sh
@@ -0,0 +1,19 @@
+#!/bin/sh
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+JAVA_OPTS="-Xms12g -Xmx12g -XX:+AlwaysPreTouch -XX:+UseParallelGC -XX:-OmitStackTraceInFastThrow " 
+java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_bytesfellow
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_bytesfellow.java b/src/main/java/dev/morling/onebrc/CalculateAverage_bytesfellow.java
new file mode 100644
index 000000000..869b1950e
--- /dev/null
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_bytesfellow.java
@@ -0,0 +1,557 @@
+/*
+ *  Copyright 2023 The original authors
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package dev.morling.onebrc;
+
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+import java.util.*;
+import java.util.concurrent.*;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.function.Consumer;
+import java.util.stream.IntStream;
+
+public class CalculateAverage_bytesfellow {
+
+    public static final String CPU_CORES_1BRC_ENV_VARIABLE = "CPU_CORES_1BRC";
+    private static final byte Separator = ';';
+
+    private static final double SchedulerCpuRatio = 0.4;
+
+    private static final int availableCpu = System.getenv(CPU_CORES_1BRC_ENV_VARIABLE) != null ? Integer.parseInt(System.getenv(CPU_CORES_1BRC_ENV_VARIABLE))
+            : Runtime.getRuntime().availableProcessors();
+
+    private static final int SchedulerPoolSize = Math.max((int) (availableCpu * SchedulerCpuRatio), 1);
+    private static final int SchedulerQueueSize = Math.min(SchedulerPoolSize * 3, 12);
+    private static final int PartitionsNumber = Math.max((availableCpu - SchedulerPoolSize), 1);
+    private static final int PartitionExecutorQueueSize = 1000;
+
+    private static final int InputStreamBlockSize = 4096;
+    private static final int InputStreamReadBufferLen = 250 * InputStreamBlockSize;
+
+    static class Partition {
+
+        private static final AtomicInteger cntr = new AtomicInteger(-1);
+        private final Map<Station, MeasurementAggregator> partitionResult = new HashMap<>(10000); // as per requirement we have not more than 10K keys
+        private final AtomicInteger leftToExecute = new AtomicInteger(0);
+
+        private final String name = "partition-" + cntr.incrementAndGet();
+
+        private final Executor executor = new ThreadPoolExecutor(1, 1,
+                0L, TimeUnit.MILLISECONDS,
+                new LinkedBlockingQueue<>(PartitionExecutorQueueSize) { // some limit to avoid OOM
+                    @Override
+                    public boolean offer(Runnable runnable) {
+                        try {
+                            put(runnable); // block if limit was exceeded
+                        }
+                        catch (InterruptedException e) {
+                            throw new RuntimeException(e);
+                        }
+                        return true;
+                    }
+                }, r -> {
+                    Thread t = new Thread(r);
+                    t.setDaemon(true);
+                    t.setName(name);
+                    return t;
+                });
+
+        public void scheduleToProcess(byte[] slice, List<LineParams> lines) {
+            // Queues one task on this partition's executor that aggregates all given lines of the slice; an empty batch is ignored.
+            if (!lines.isEmpty()) {
+                leftToExecute.incrementAndGet();
+                executor.execute(
+                        () -> {
+                            try {
+                                for (int i = 0; i < lines.size(); i++) {
+                                    Measurement measurement = getMeasurement(slice, lines.get(i));
+                                    MeasurementAggregator measurementAggregator = partitionResult.get(measurement.station);
+                                    if (measurementAggregator == null) {
+                                        partitionResult.put(new Station(measurement.station), new MeasurementAggregator().withMeasurement(measurement));
+                                    }
+                                    else {
+                                        measurementAggregator.withMeasurement(measurement);
+                                    }
+                                }
+                            }
+                            finally {
+                                // must also run when a line fails to parse, otherwise allTasksCompleted() never turns true
+                                leftToExecute.decrementAndGet();
+                            }
+                        });
+            }
+        }
+
+        public void materializeNames() {
+            partitionResult.keySet().forEach(Station::materializeName);
+        }
+
+        public Map<Station, MeasurementAggregator> getResult() {
+            return partitionResult;
+        }
+
+        public boolean allTasksCompleted() {
+            return leftToExecute.get() == 0;
+        }
+
+    }
+
+    record LineParams(int start, int length) {
+    }
+
+    static class Partitioner {
+
+        private final List<Partition> allPartitions = new ArrayList<>();
+        private final int partitionsSize;
+
+        AtomicInteger jobsScheduled = new AtomicInteger(0);
+
+        final Executor scheduler = new ThreadPoolExecutor(SchedulerPoolSize, SchedulerPoolSize,
+                0L, TimeUnit.MILLISECONDS,
+                new LinkedBlockingQueue<>(SchedulerQueueSize) { // some limit to avoid OOM
+
+                    @Override
+                    public Runnable take() throws InterruptedException {
+                        return super.take();
+                    }
+
+                    @Override
+                    public boolean offer(Runnable runnable) {
+                        try {
+                            put(runnable); // preventing unlimited scheduling due to possible OOM
+                        }
+                        catch (InterruptedException e) {
+                            throw new RuntimeException(e);
+                        }
+                        return true;
+                    }
+                }, r -> {
+                    Thread t = new Thread(r);
+                    t.setDaemon(true);
+                    t.setName("scheduler");
+                    return t;
+                });
+
+        Partitioner(int partitionsSize) {
+            IntStream.range(0, partitionsSize).forEach((i) -> allPartitions.add(new Partition()));
+            this.partitionsSize = partitionsSize;
+        }
+
+        private int partitionsSize() {
+            return partitionsSize;
+        }
+
+        void processSlice(byte[] slice) {
+
+            jobsScheduled.incrementAndGet();
+
+            scheduler.execute(() -> {
+                List<List<LineParams>> partitionedLines = new ArrayList<>(partitionsSize());
+                // allocate some capacity, assuming that on average lines are half of the max (407 bytes) length
+                IntStream.range(0, partitionsSize()).forEach((p) -> partitionedLines.add(new ArrayList<>(slice.length / 407 / 2)));
+
+                int start = 0;
+                int i = 0;
+                int startCharLen = 0;
+                while (i < slice.length) {
+
+                    if (slice[i] == '\n' || i == (slice.length - 1)) {
+
+                        int lineLength = i - start + (i == (slice.length - 1) ? 1 : 0);
+                        LineParams lineParams = new LineParams(start, lineLength);
+
+                        int partitioningCode = getPartitioningCode(slice, start, getUtf8CharNumberOfBytes(slice[start]));
+                        int partition = computePartition(partitioningCode);
+
+                        partitionedLines.get(partition).add(lineParams);
+                        start = i + 1;
+
+                    }
+
+                    i++;
+                }
+
+                processPartitionedBatch(slice, partitionedLines);
+
+                jobsScheduled.decrementAndGet();
+            });
+
+        }
+
+        private static byte[] getLine(byte[] slice, int lineLength, int start) {
+            byte[] line = new byte[lineLength];
+            System.arraycopy(slice, start, line, 0, lineLength);
+            return line;
+        }
+
+        private void processPartitionedBatch(byte[] slice, List<List<LineParams>> partitionedLines) {
+            for (int i = 0; i < partitionedLines.size(); i++) {
+                allPartitions.get(i).scheduleToProcess(slice, partitionedLines.get(i));
+            }
+        }
+
+        private int computePartition(int code) {
+            return Math.abs(code % partitionsSize());
+        }
+
+        private static int getPartitioningCode(byte[] line, int start, int utf8CharNumberOfBytes) {
+            // seems good enough
+            if (utf8CharNumberOfBytes == 4) {
+                return line[start] + line[start + 1] + line[start + 2] + line[start + 3];
+            }
+            else if (utf8CharNumberOfBytes == 3) {
+                return line[start] + line[start + 1] + line[start + 2];
+            }
+            else if (utf8CharNumberOfBytes == 2) {
+                return line[start] + line[start + 1];
+            }
+            else {
+                return line[start];
+            }
+        }
+
+        SortedMap<Station, MeasurementAggregator> getAllResults() {
+            allPartitions.parallelStream().forEach(Partition::materializeNames);
+            SortedMap<Station, MeasurementAggregator> result = new TreeMap<>();
+            allPartitions.forEach((p) -> result.putAll(p.getResult()));
+            return result;
+        }
+
+        public boolean allTasksCompleted() {
+            return allPartitions.stream().allMatch(Partition::allTasksCompleted);
+        }
+
+    }
+
+    private static final String FILE = "./measurements.txt";
+
+    public static class Station implements Comparable<Station> {
+
+        private final byte[] inputSlice;
+        private final int hash;
+
+        private final int startIdx;
+        private final int len;
+
+        private volatile String nameAsString;
+
+        public Station(byte[] inputSlice, int startIdx, int len) {
+            this.inputSlice = inputSlice;
+            this.startIdx = startIdx;
+            this.len = len;
+            this.hash = hashcodeFast();
+        }
+
+        public Station(Station from) {
+            this.inputSlice = new byte[from.len];
+            System.arraycopy(from.inputSlice, from.startIdx, this.inputSlice, 0, from.len);
+            this.startIdx = 0;
+            this.len = from.len;
+            this.hash = from.hash;
+        }
+
+        private int hashcodeFast() {
+            if (len == 0) {
+                return 0;
+            }
+            else if (len == 1) {
+                return inputSlice[startIdx] * 109;
+            }
+            else if (len == 2) {
+                return inputSlice[startIdx + 1] * 109 * 109 + inputSlice[startIdx];
+            }
+            else if (len == 3) {
+                return inputSlice[startIdx + 2] * 109 * 109 * 109 + inputSlice[startIdx + 1] * 109 * 109 + inputSlice[startIdx];
+            }
+            else {
+                return inputSlice[startIdx + 3] * 109 * 109 * 109 * 109 + inputSlice[startIdx + 2] * 109 * 109 * 109 + inputSlice[startIdx + 1] * 109 * 109
+                        + inputSlice[startIdx];
+            }
+        }
+
+        @Override
+        public boolean equals(Object o) {
+            if (this == o)
+                return true;
+            if (o == null || getClass() != o.getClass())
+                return false;
+
+            Station station = (Station) o;
+
+            if (len != station.len) {
+                return false;
+            }
+
+            return Arrays.equals(inputSlice, startIdx, startIdx + len, station.inputSlice, station.startIdx, station.startIdx + len);
+        }
+
+        @Override
+        public int hashCode() {
+            return hash;
+        }
+
+        @Override
+        public int compareTo(Station o) {
+            return materializeName().compareTo(o.materializeName()); //
+        }
+
+        public String materializeName() {
+            if (nameAsString == null) {
+                byte[] nameForMaterialization = new byte[len];
+                System.arraycopy(inputSlice, startIdx, nameForMaterialization, 0, len);
+                nameAsString = new String(nameForMaterialization, StandardCharsets.UTF_8);
+            }
+
+            return nameAsString;
+        }
+
+        @Override
+        public String toString() {
+            return materializeName();
+        }
+    }
+
+    private record Measurement(Station station, long value) {
+    }
+
+    private record ResultRow(long min, long sum, long count, long max) {
+
+        public String toString() {
+            return fakeDouble(min) + "/" + round((double) sum / (double) count / 10.0) + "/" + fakeDouble(max);
+        }
+
+        private String fakeDouble(long value) {
+            long positiveValue = value < 0 ? -value : value;
+            long wholePart = positiveValue / 10;
+            String positiveDouble = wholePart + "." + (positiveValue - wholePart * 10);
+
+
+            return (value < 0 ? "-" : "") + positiveDouble;
+        }
+
+        private double round(double value) {
+            return Math.round(value * 10.0) / 10.0;
+        }
+
+    }
+
+    public static class MeasurementAggregator {
+        private long min = Long.MAX_VALUE;
+        private long max = Long.MIN_VALUE;
+        private long sum;
+        private long count;
+
+        MeasurementAggregator withMeasurement(Measurement m) {
+
+            min = Math.min(min, m.value);
+            max = Math.max(max, m.value);
+            sum += m.value;
+            count++;
+
+            return this;
+        }
+
+        @Override
+        public String toString() {
+            return new ResultRow(min, sum, count, max).toString();
+        }
+
+    }
+
+    private static long parseToLongIgnoringDecimalPoint(byte[] slice, int startIndex, int len) {
+        // Reads the number stored in slice[startIndex, startIndex + len) as a long and skips every '.' byte,
+        // so the bytes of "-12.3" give -123. A leading '-' negates the result.
+        // The remaining bytes are taken as decimal digits without validation.
+        final boolean negative = slice[startIndex] == '-';
+        final int firstDigit = negative ? startIndex + 1 : startIndex;
+        final int end = startIndex + len;
+
+        long value = 0;
+        for (int pos = firstDigit; pos < end; pos++) {
+            if (slice[pos] == '.') {
+                continue; // the decimal point itself contributes nothing
+            }
+            // shift the digits read so far one decimal place and append the next one;
+            // while value is still 0 the multiplication is a no-op, so the first digit needs no special case
+            value = multipleByTen(value) + digitAsLong(slice, pos);
+        }
+
+        // the sign was detected up front and is applied once all digits are accumulated
+        return negative ? -value : value;
+    }
+
+    private static long multipleByTen(long value) {
+        return (value << 3) + (value << 1);
+    }
+
+    private static long digitAsLong(byte[] digits, int position) {
+        return (digits[position] - 48);
+    }
+
+    public static void main(String[] args) throws IOException {
+
+        Partitioner partitioner = new Partitioner(PartitionsNumber);
+
+        try (FileInputStream fileInputStream = new FileInputStream(FILE)) {
+            parseStreamWithBytes(fileInputStream, InputStreamReadBufferLen, partitioner::processSlice);
+        }
+        catch (Exception e) {
+            throw new RuntimeException(e);
+        }
+
+        showResults(partitioner);
+
+    }
+
+    static void parseStreamWithBytes(InputStream inputStream, int bufferLen, Consumer<byte[]> sliceConsumer) throws IOException {
+        // Reads the stream in chunks of up to bufferLen bytes, cuts the complete lines of every chunk into about
+        // SchedulerPoolSize slices (the line break that ends a slice is not part of it) and passes them to sliceConsumer.
+        // Bytes after the last line break stay in the buffer for the next read; a line longer than the buffer raises an IOException.
+        byte[] byteArray = new byte[bufferLen];
+        int offset = 0;
+        int lenToRead = bufferLen;
+        int readLen;
+
+        while ((readLen = inputStream.read(byteArray, offset, lenToRead)) > -1) {
+            if (readLen == 0) {
+                continue;
+            }
+
+            int traverseLen = Math.min(offset + readLen, bufferLen);
+            int lastLineBreakInSlicePosition = 0; // index right after the last '\n', 0 if there is no complete line yet
+            for (int j = traverseLen - 1; j >= 0; j--) {
+                if (byteArray[j] == '\n') {
+                    lastLineBreakInSlicePosition = j + 1;
+                    break;
+                }
+            }
+
+            // every slice starts at a line start and is extended to the next line break at or after sliceSize bytes
+            int sliceSize = lastLineBreakInSlicePosition / SchedulerPoolSize;
+            int s = 0;
+            int j = Math.min(sliceSize, lastLineBreakInSlicePosition - 1);
+            while (s < lastLineBreakInSlicePosition && j < lastLineBreakInSlicePosition) {
+                if (byteArray[j] == '\n') {
+                    int len = j - s;
+                    byte[] slice = new byte[len];
+                    System.arraycopy(byteArray, s, slice, 0, len);
+                    sliceConsumer.accept(slice);
+                    s = j + 1;
+                    j = Math.min(s + sliceSize, lastLineBreakInSlicePosition - 1);
+                }
+                else {
+                    j++;
+                }
+            }
+
+            if (s < traverseLen) {
+                // an unfinished line is left (possibly the whole chunk): carry it over to the next read
+                int len = traverseLen - s;
+                if (len == bufferLen) {
+                    throw new IOException("Line does not fit into the read buffer of " + bufferLen + " bytes");
+                }
+                System.arraycopy(byteArray, s, byteArray, 0, len);
+                offset = len;
+                lenToRead = bufferLen - len;
+            }
+            else {
+                offset = 0;
+                lenToRead = bufferLen;
+            }
+        }
+
+        if (offset > 0) {
+            sliceConsumer.accept(Arrays.copyOf(byteArray, offset)); // input without a final line break: flush the last line
+        }
+    }
+
+    static int getUtf8CharNumberOfBytes(byte firstByteOfChar) {
+        // Length of the UTF-8 sequence started by this lead byte; each class needs its own prefix mask, any other byte counts as 1.
+        if ((firstByteOfChar & 0b11111000) == 0b11110000) {
+            return 4;
+        }
+        else if ((firstByteOfChar & 0b11110000) == 0b11100000) {
+            return 3;
+        }
+        else if ((firstByteOfChar & 0b11100000) == 0b11000000) {
+            return 2;
+        }
+        else {
+            return 1;
+        }
+    }
+
+    static void showResults(Partitioner partitioner) {
+        // Submits a task to the scheduler pool that waits for all outstanding work and prints the merged map; blocks until it ran.
+        CountDownLatch c = new CountDownLatch(1);
+        partitioner.scheduler.execute(() -> {
+            try {
+                // check if any unprocessed slices
+                while (partitioner.jobsScheduled.get() > 0) {
+                    Thread.onSpinWait(); // spin-wait hint, the other threads still need the cores
+                }
+                // check if anything left in partitions
+                while (!partitioner.allTasksCompleted()) {
+                    Thread.onSpinWait();
+                }
+                SortedMap<Station, MeasurementAggregator> result = partitioner.getAllResults();
+                System.out.println(result); // output aggregated measurements according to the requirement
+            }
+            catch (Exception e) {
+                System.out.println(e);
+            }
+            finally {
+                c.countDown(); // also on an Error, so the waiting caller below can never be left blocked
+            }
+        });
+        try {
+            c.await();
+        }
+        catch (InterruptedException e) {
+            Thread.currentThread().interrupt(); // keep the interrupt status visible to the caller
+            throw new RuntimeException(e);
+        }
+    }
+
+    private static Measurement getMeasurement(byte[] slice, LineParams lineParams) {
+        int idx = lastIndexOfSeparator(slice, lineParams);
+        return new Measurement(
+                new Station(slice, lineParams.start, idx - lineParams.start),
+                parseToLongIgnoringDecimalPoint(slice, idx + 1, lineParams.start + lineParams.length - (idx + 1)));
+    }
+
+    private static int lastIndexOfSeparator(byte[] slice, LineParams lineParams) {
+        // Shortcut instead of a full scan: relies on the end of the line holding single-byte characters only
+        // and on the value part being 3, 4 or 5 bytes long, so the separator is probed only at
+        // 3, 4 and 5 positions before the last byte of the line.
+        // Returns the index of the separator in the slice, or -1 when none of the three positions holds it.
+
+        final int lastIdx = lineParams.start + lineParams.length() - 1;
+
+        // closest position first, i.e. the shortest value part is tried before the longer ones
+        for (int distance = 3; distance <= 5; distance++) {
+            final int candidate = lastIdx - distance;
+            if (slice[candidate] == Separator) {
+                return candidate;
+            }
+        }
+
+        return -1;
+    }
+
+}

From a33ed2181b0cc71882e009e5f445d36009e3b07c Mon Sep 17 00:00:00 2001
From: Van Phu DO <abeobk@gmail.com>
Date: Mon, 29 Jan 2024 02:08:42 +0900
Subject: [PATCH 170/268] Use native type, remove lots of type conversions
 (#618)

* less type conversion, less string cast

* adjust some comments

* fixed format issue
---
 .../onebrc/CalculateAverage_abeobk.java       | 179 ++++++++++--------
 1 file changed, 99 insertions(+), 80 deletions(-)

diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java b/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java
index 06cbc1748..c08a9d86c 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java
@@ -26,9 +26,9 @@
 import java.nio.file.StandardOpenOption;
 import java.util.ArrayList;
 import java.util.Arrays;
-import java.util.List;
 import java.util.TreeMap;
 import java.util.concurrent.atomic.AtomicInteger;
+import java.util.concurrent.atomic.AtomicReference;
 import java.util.stream.IntStream;
 
 import sun.misc.Unsafe;
@@ -39,7 +39,7 @@ public class CalculateAverage_abeobk {
 
     private static final String FILE = "./measurements.txt";
     private static final int BUCKET_SIZE = 1 << 16;
-    private static final int BUCKET_MASK = BUCKET_SIZE - 1;
+    private static final long BUCKET_MASK = BUCKET_SIZE - 1;
     private static final int MAX_STR_LEN = 100;
     private static final int MAX_STATIONS = 10000;
     private static final long CHUNK_SZ = 1 << 22; // 4MB chunk
@@ -56,9 +56,9 @@ public class CalculateAverage_abeobk {
             0xffffffffffffffffL, };
 
     private static AtomicInteger chunk_id = new AtomicInteger(0);
+    private static AtomicReference<Node[]> mapref = new AtomicReference<>(null);
     private static int chunk_cnt;
     private static long start_addr, end_addr;
-    private static Stat[][] all_res;
 
     private static final void debug(String s, Object... args) {
         System.out.println(String.format(s, args));
@@ -75,57 +75,49 @@ private static Unsafe initUnsafe() {
         }
     }
 
-    static class Stat {
-        Node node;
-        String key;
-
-        public final String toString() {
-            return (node.min / 10.0) + "/"
-                    + (Math.round(((double) node.sum / node.count)) / 10.0) + "/"
-                    + (node.max / 10.0);
-        }
-
-        Stat(Node n) {
-            node = n;
-            byte[] sbuf = new byte[MAX_STR_LEN];
-            long word = UNSAFE.getLong(n.addr);
-            long semipos_code = getSemiPosCode(word);
-            int keylen = 0;
-            while (semipos_code == 0) {
-                keylen += 8;
-                word = UNSAFE.getLong(n.addr + keylen);
-                semipos_code = getSemiPosCode(word);
-            }
-            keylen += Long.numberOfTrailingZeros(semipos_code) >>> 3;
-            UNSAFE.copyMemory(null, n.addr, sbuf, Unsafe.ARRAY_BYTE_BASE_OFFSET, keylen);
-            key = new String(sbuf, 0, keylen, StandardCharsets.UTF_8);
-        }
-    }
-
+    // use native type, less conversion
     static class Node {
         long addr;
+        long hash;
         long word0;
         long tail;
         long sum;
+        long min, max;
+        int keylen;
         int count;
-        short min, max;
 
-        Node(long a, long t, short val) {
+        public final String toString() {
+            return (min / 10.0) + "/"
+                    + (Math.round(((double) sum / count)) / 10.0) + "/"
+                    + (max / 10.0);
+        }
+
+        final String key() {
+            byte[] sbuf = new byte[MAX_STR_LEN];
+            UNSAFE.copyMemory(null, addr, sbuf, Unsafe.ARRAY_BYTE_BASE_OFFSET, keylen);
+            return new String(sbuf, 0, (int) keylen, StandardCharsets.UTF_8);
+        }
+
+        Node(long a, long t, int kl, long h, long val) {
             addr = a;
             tail = t;
             sum = min = max = val;
             count = 1;
+            keylen = kl;
+            hash = h;
         }
 
-        Node(long a, long w0, long t, short val) {
+        Node(long a, long w0, long t, int kl, long h, long val) {
             addr = a;
             word0 = w0;
             tail = t;
             sum = min = max = val;
             count = 1;
+            keylen = kl;
+            hash = h;
         }
 
-        final void add(short val) {
+        final void add(long val) {
             sum += val;
             count++;
             if (val >= max) {
@@ -148,17 +140,28 @@ final void merge(Node other) {
             }
         }
 
-        final boolean contentEquals(long other_addr, long other_word0, long other_tail, int keylen) {
+        final boolean contentEquals(long other_addr, long other_word0, long other_tail, long kl) {
             if (word0 != other_word0 || tail != other_tail)
                 return false;
             // this is faster than comparision if key is short
             long xsum = 0;
-            int n = keylen & 0xF8;
-            for (int i = 8; i < n; i += 8) {
+            long n = kl & 0xF8;
+            for (long i = 8; i < n; i += 8) {
                 xsum |= (UNSAFE.getLong(addr + i) ^ UNSAFE.getLong(other_addr + i));
             }
             return xsum == 0;
         }
+
+        final boolean contentEquals(Node other) {
+            if (tail != other.tail)
+                return false;
+            long n = keylen & 0xF8;
+            for (long i = 0; i < n; i += 8) {
+                if (UNSAFE.getLong(addr + i) != UNSAFE.getLong(other.addr + i))
+                    return false;
+            }
+            return true;
+        }
     }
 
     // idea from royvanrijn
@@ -168,24 +171,24 @@ static final long getSemiPosCode(final long word) {
     }
 
     // speed/collision balance
-    static final int xxh32(long hash) {
+    static final long xxh32(long hash) {
         long h = hash * 37;
-        return (int) (h ^ (h >>> 29));
+        return (h ^ (h >>> 29));
     }
 
     // great idea from merykitty (Quan Anh Mai)
-    static final short parseNum(long num_word, int dot_pos) {
+    static final long parseNum(long num_word, int dot_pos) {
         int shift = 28 - dot_pos;
         long signed = (~num_word << 59) >> 63;
         long dsmask = ~(signed & 0xFF);
         long digits = ((num_word & dsmask) << shift) & 0x0F000F0F00L;
         long abs_val = ((digits * 0x640a0001) >>> 32) & 0x3FF;
-        return (short) ((abs_val ^ signed) - signed);
+        return ((abs_val ^ signed) - signed);
     }
 
     // Thread pool worker
     static final class Worker extends Thread {
-        final int thread_id;
+        final int thread_id; // for debug use only
 
         Worker(int i) {
             thread_id = i;
@@ -195,16 +198,15 @@ static final class Worker extends Thread {
         @Override
         public void run() {
             var map = new Node[BUCKET_SIZE + MAX_STATIONS]; // extra space for collisions
-            int cnt = 0;
             int id;
             int cls = 0;
 
             // process in small chunk to maintain disk locality (artsiomkorzun trick)
-            // but keep going instead of merging
             while ((id = chunk_id.getAndIncrement()) < chunk_cnt) {
                 long addr = start_addr + id * CHUNK_SZ;
                 long end = Math.min(addr + CHUNK_SZ, end_addr);
-                // adjust start
+
+                // find start of line
                 if (id > 0) {
                     while (UNSAFE.getByte(addr++) != '\n')
                         ;
@@ -230,14 +232,14 @@ public void run() {
                         addr += (dot_pos >>> 3) + 3;
 
                         long tail = word0 & HASH_MASKS[semi_pos];
-                        int bucket = xxh32(tail) & BUCKET_MASK;
-                        short val = parseNum(num_word, dot_pos);
+                        long hash = xxh32(tail);
+                        int bucket = (int) (hash & BUCKET_MASK);
+                        long val = parseNum(num_word, dot_pos);
 
                         while (true) {
                             var node = map[bucket];
                             if (node == null) {
-                                map[bucket] = new Node(row_addr, tail, val);
-                                cnt++;
+                                map[bucket] = new Node(row_addr, tail, semi_pos, hash, val);
                                 break;
                             }
                             if (node.tail == tail) {
@@ -263,14 +265,14 @@ public void run() {
                         addr += (dot_pos >>> 3) + 3;
 
                         long tail = (word & HASH_MASKS[semi_pos]);
-                        int bucket = xxh32(word0 ^ tail) & BUCKET_MASK;
-                        short val = parseNum(num_word, dot_pos);
+                        long hash = xxh32(word0 ^ tail);
+                        int bucket = (int) (hash & BUCKET_MASK);
+                        long val = parseNum(num_word, dot_pos);
 
                         while (true) {
                             var node = map[bucket];
                             if (node == null) {
-                                map[bucket] = new Node(row_addr, word0, tail, val);
-                                cnt++;
+                                map[bucket] = new Node(row_addr, word0, tail, semi_pos + 8, hash, val);
                                 break;
                             }
                             if (node.word0 == word0 && node.tail == tail) {
@@ -295,20 +297,20 @@ public void run() {
 
                     int semi_pos = Long.numberOfTrailingZeros(semipos_code) >>> 3;
                     addr += semi_pos;
-                    int keylen = (int) (addr - row_addr);
+                    long keylen = addr - row_addr;
                     long num_word = UNSAFE.getLong(addr + 1);
                     int dot_pos = Long.numberOfTrailingZeros(~num_word & 0x10101000);
                     addr += (dot_pos >>> 3) + 4;
 
                     long tail = (word & HASH_MASKS[semi_pos]);
-                    int bucket = xxh32(hash ^ tail) & BUCKET_MASK;
-                    short val = parseNum(num_word, dot_pos);
+                    hash = xxh32(hash ^ tail);
+                    int bucket = (int) (hash & BUCKET_MASK);
+                    long val = parseNum(num_word, dot_pos);
 
                     while (true) {
                         var node = map[bucket];
                         if (node == null) {
-                            map[bucket] = new Node(row_addr, word0, tail, val);
-                            cnt++;
+                            map[bucket] = new Node(row_addr, word0, tail, (int) keylen, hash, val);
                             break;
                         }
                         if (node.contentEquals(row_addr, word0, tail, keylen)) {
@@ -322,18 +324,36 @@ public void run() {
                 }
             }
 
-            if (SHOW_ANALYSIS) {
-                debug("Thread %d collision = %d", thread_id, cls);
+            // merge is cheaper than string casting (artsiomkorzun)
+            while (!mapref.compareAndSet(null, map)) {
+                var other_map = mapref.getAndSet(null);
+                if (other_map != null) {
+                    for (int i = 0; i < other_map.length; i++) {
+                        var other = other_map[i];
+                        if (other == null)
+                            continue;
+                        int bucket = (int) (other.hash & BUCKET_MASK);
+                        while (true) {
+                            var node = map[bucket];
+                            if (node == null) {
+                                map[bucket] = other;
+                                break;
+                            }
+                            if (node.contentEquals(other)) {
+                                node.merge(other);
+                                break;
+                            }
+                            bucket++;
+                            if (SHOW_ANALYSIS)
+                                cls++;
+                        }
+                    }
+                }
             }
 
-            Stat[] stats = new Stat[cnt];
-            int i = 0;
-            for (var node : map) {
-                if (node != null) {
-                    stats[i++] = new Stat(node);
-                }
+            if (SHOW_ANALYSIS) {
+                debug("Thread %d collision = %d", thread_id, cls);
             }
-            all_res[thread_id] = stats;
         }
     }
 
@@ -366,23 +386,22 @@ public static void main(String[] args) throws InterruptedException, IOException
         // only use all cpus on large file
         int cpu_cnt = file_size < 1e6 ? 1 : CPU_CNT;
         chunk_cnt = (int) Math.ceilDiv(file_size, CHUNK_SZ);
-        all_res = new Stat[cpu_cnt][];
 
-        List<Worker> workers = IntStream.range(0, cpu_cnt).mapToObj(i -> new Worker(i)).toList();
-        for (var w : workers)
+        // spawn workers
+        for (var w : IntStream.range(0, cpu_cnt).mapToObj(i -> new Worker(i)).toList()) {
             w.join();
-
-        // collect all results
-        TreeMap<String, Stat> ms = new TreeMap<>();
-        for (var res : all_res) {
-            for (var s : res) {
-                var stat = ms.putIfAbsent(s.key, s);
-                if (stat != null)
-                    stat.node.merge(s.node);
-            }
         }
 
-        // print output
+        // collect results
+        TreeMap<String, Node> ms = new TreeMap<>();
+        for (var crr : mapref.get()) {
+            if (crr == null)
+                continue;
+            var prev = ms.putIfAbsent(crr.key(), crr);
+            if (prev != null)
+                prev.merge(crr);
+        }
+        // print result
         System.out.println(ms);
         System.out.close();
     }

From f5bddafaf735c812c6df5887e3969b0151e26eda Mon Sep 17 00:00:00 2001
From: Dimitris Karampinas <dkarampi@users.noreply.github.com>
Date: Sun, 28 Jan 2024 18:12:54 +0100
Subject: [PATCH 171/268] Dkarampi solution (#614)

* Simple multi-threaded version

* Format code

* Formatted code

* More formatting
---
 calculate_average_dkarampi.sh                 |  19 ++
 .../onebrc/CalculateAverage_dkarampi.java     | 260 ++++++++++++++++++
 2 files changed, 279 insertions(+)
 create mode 100755 calculate_average_dkarampi.sh
 create mode 100644 src/main/java/dev/morling/onebrc/CalculateAverage_dkarampi.java

diff --git a/calculate_average_dkarampi.sh b/calculate_average_dkarampi.sh
new file mode 100755
index 000000000..a6ce60945
--- /dev/null
+++ b/calculate_average_dkarampi.sh
@@ -0,0 +1,19 @@
+#!/bin/sh
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+JAVA_OPTS="-XX:+AlwaysCompileLoopMethods"
+java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_dkarampi
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_dkarampi.java b/src/main/java/dev/morling/onebrc/CalculateAverage_dkarampi.java
new file mode 100644
index 000000000..2b826eef6
--- /dev/null
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_dkarampi.java
@@ -0,0 +1,260 @@
+/*
+ *  Copyright 2023 The original authors
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package dev.morling.onebrc;
+
+import java.nio.ByteBuffer;
+import java.nio.MappedByteBuffer;
+import java.nio.channels.FileChannel;
+import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.TreeMap;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+
+import static java.nio.channels.FileChannel.MapMode.READ_ONLY;
+
+public class CalculateAverage_dkarampi {
+    private static final String FILE = "./measurements.txt";
+    private static final int BUFFER_SIZE = (1 << 29); // 512 MiB per mapped chunk
+    private static final int HT_SIZE = nextPowerOfTwo(10000);
+    private static final int NUM_THREADS = 8;
+    private final List<Station[]> stationHashTables = new ArrayList<>();
+
+    public static void main(String[] args) throws Exception {
+        new CalculateAverage_dkarampi().runFast();
+    }
+
+    private static double round(double value) {
+        return Math.round(value * 10.0) / 10.0;
+    }
+
+    private static boolean areEqual(byte[] a, int aLen, byte[] b, int bLen) {
+        if (aLen != bLen) { // different lengths can never be the same name
+            return false;
+        }
+        for (int i = 0; i < aLen; i++) { // int index: a byte counter wraps to -128 once aLen exceeds 127
+            if (a[i] != b[i]) {
+                return false;
+            }
+        }
+        return true;
+    }
+
+    public static int nextPowerOfTwo(int n) {
+        for (int i = 1; i < 32; i <<= 1) {
+            n |= n >> i;
+        }
+        return n + 1;
+    }
+
+    private void runFast() throws Exception {
+        createStationHashTables();
+        FileChannel channel = FileChannel.open(Path.of(FILE));
+
+        List<List<Buffer>> buffersList = new ArrayList<>();
+        for (int i = 0; i < NUM_THREADS; i++) {
+            buffersList.add(new ArrayList<>());
+        }
+
+        List<Buffer> buffers = createBuffers(channel);
+        for (int i = 0; i < buffers.size(); i++) {
+            buffersList.get(i % NUM_THREADS).add(buffers.get(i));
+        }
+
+        List<Task> tasks = new ArrayList<>();
+        for (int i = 0; i < NUM_THREADS; i++) {
+            tasks.add(new Task(stationHashTables.get(i), buffersList.get(i)));
+        }
+
+        ExecutorService executorService = Executors.newFixedThreadPool(NUM_THREADS);
+        Future<?>[] futures = new Future<?>[NUM_THREADS];
+        for (int i = 0; i < NUM_THREADS; i++) {
+            futures[i] = executorService.submit(tasks.get(i));
+        }
+
+        for (Future<?> future : futures) {
+            future.get();
+        }
+
+        sortAndPrint();
+
+        executorService.shutdown();
+        channel.close();
+    }
+
+    private void createStationHashTables() {
+        for (int i = 0; i < NUM_THREADS; i++) {
+            Station[] stationsHashTable = new Station[HT_SIZE];
+            for (int j = 0; j < HT_SIZE; j++) {
+                stationsHashTable[j] = new Station();
+            }
+            stationHashTables.add(stationsHashTable);
+        }
+    }
+
+    private List<Buffer> createBuffers(FileChannel channel) throws Exception {
+        List<Buffer> buffers = new ArrayList<>();
+        long size = channel.size();
+        int lastByte;
+        for (long offset = 0; offset < size; offset += lastByte + 1) {
+            long sizeToMap = Math.min(size - offset, BUFFER_SIZE);
+            MappedByteBuffer buffer = channel.map(READ_ONLY, offset, sizeToMap);
+            lastByte = (int) sizeToMap - 1;
+            while (buffer.get(lastByte) != '\n')
+                --lastByte;
+            buffers.add(new Buffer(buffer, lastByte + 1));
+        }
+        return buffers;
+    }
+
+    private void sortAndPrint() { // merges the per-thread tables by station name and prints {name=min/mean/max, ...}
+        TreeMap<String, Station> sortedStations = new TreeMap<>();
+
+        for (Station[] stationHashTable : stationHashTables) {
+            for (Station station : stationHashTable) {
+                if (station.freq == 0) { // freq == 0 marks a slot that was never written
+                    continue;
+                }
+                String key = new String(station.name, 0, station.nameLen, java.nio.charset.StandardCharsets.UTF_8);
+                Station st = sortedStations.get(key);
+                if (st == null) {
+                    sortedStations.put(key, station);
+                }
+                else {
+                    st.min = Math.min(st.min, station.min);
+                    st.max = Math.max(st.max, station.max);
+                    st.sum += station.sum;
+                    st.freq += station.freq;
+                }
+            }
+        }
+
+        StringBuilder sb = new StringBuilder();
+        sb.append("{");
+        for (Map.Entry<String, Station> entry : sortedStations.entrySet()) {
+            String name = entry.getKey();
+            Station station = entry.getValue();
+            sb.append(name);
+            sb.append("=");
+            sb.append(round(station.min));
+            sb.append("/");
+            sb.append(round(round(station.sum) / station.freq));
+            sb.append("/");
+            sb.append(round(station.max));
+            sb.append(", ");
+        }
+        sb.setLength(Math.max(1, sb.length() - 2)); // drop the trailing ", "; keeps the lone "{" when there are no stations
+        sb.append("}");
+        System.out.println(sb);
+    }
+
+    private record Buffer(ByteBuffer byteBuffer, int length) {
+    }
+
+    private static class Station {
+        double sum;
+        double min = 100;
+        double max = -100;
+        int freq;
+        short nameLen;
+        byte[] name = new byte[nextPowerOfTwo(100)];
+    }
+
+    private record Task(Station[] stations, List<Buffer> buffers) implements Runnable {
+
+    @Override
+    public void run() {
+        for (Buffer buffer : buffers) {
+            process(buffer);
+        }
+    }
+
+    private void process(Buffer buffer) { // parses "name;[-]d[d].d\n" records and feeds each one to updateStations
+        short nameLen = 0;
+        int hash = 5381;
+        int temperature;
+        byte[] name = new byte[100];
+
+        for (int i = 0; i < buffer.length; i++) {
+            byte c = buffer.byteBuffer.get(i);
+            if (c == ';') {
+                int sign = 1;
+                c = buffer.byteBuffer.get(++i);
+                if (c == '-') {
+                    sign = -1;
+                    c = buffer.byteBuffer.get(++i);
+                    temperature = (c - '0') * 10;
+                    c = buffer.byteBuffer.get(++i);
+                    if (c == '.') {
+                        c = buffer.byteBuffer.get(++i);
+                        temperature = temperature + c - '0';
+                    }
+                    else {
+                        temperature = temperature + c - '0';
+                        ++i; // dot
+                        c = buffer.byteBuffer.get(++i);
+                        temperature = temperature * 10 + c - '0';
+                    }
+                }
+                else {
+                    temperature = (c - '0') * 10;
+                    c = buffer.byteBuffer.get(++i);
+                    if (c == '.') {
+                        c = buffer.byteBuffer.get(++i);
+                        temperature = temperature + c - '0';
+                    }
+                    else {
+                        temperature = temperature + c - '0';
+                        ++i; // dot
+                        c = buffer.byteBuffer.get(++i);
+                        temperature = temperature * 10 + c - '0';
+                    }
+                }
+                hash = hash & 0x7FFFFFFF; // clear the sign bit so hash % HT_SIZE is never negative
+                updateStations(hash, name, nameLen, sign * (double) temperature / 10);
+                ++i; // For '\n'
+                nameLen = 0;
+                hash = 5381; // must equal the initial seed, otherwise the first name of a buffer hashes differently
+            }
+            else {
+                name[nameLen++] = c;
+                hash = ((hash << 5) + hash) + c;
+            }
+        }
+    }
+
+    private void updateStations(int hash, byte[] name, short nameLen, double temperature) {
+        int idx;
+        for (idx = hash % HT_SIZE; stations[idx].freq != 0; idx = (idx + 1) % HT_SIZE) {
+            if (areEqual(stations[idx].name, stations[idx].nameLen, name, nameLen)) {
+                stations[idx].sum += temperature;
+                stations[idx].min = Math.min(stations[idx].min, temperature);
+                stations[idx].max = Math.max(stations[idx].max, temperature);
+                ++stations[idx].freq;
+                return;
+            }
+        }
+        stations[idx].sum = temperature;
+        stations[idx].min = temperature;
+        stations[idx].max = temperature;
+        stations[idx].nameLen = nameLen;
+        System.arraycopy(name, 0, stations[idx].name, 0, nameLen);
+        stations[idx].freq = 1;
+    }
+}}

From f598d74594a49ca2fa4c3a1a2b87434e155adb9e Mon Sep 17 00:00:00 2001
From: Mahadev K <79390504+mahadev-k@users.noreply.github.com>
Date: Sun, 28 Jan 2024 22:56:44 +0530
Subject: [PATCH 172/268] Mahadev virtual thread 1brc (#611)

* Read file with multiple virtual threads and process chunks of file data in parallel.

* Updated logic to bucket every chunk of aggs into a vector and merge them into a TreeMap for printing.

* Virtual Thread / File Channels Impl.

* Renamed files with GHUsername.

* Added statement to get vals before updating.

* Added executable permission to the files.
---
 calculate_average_mahadev-k.sh                |  19 +++
 prepare_mahadev-k.sh                          |  20 +++
 .../onebrc/CalculateAverage_mahadev_k.java    | 152 ++++++++++++++++++
 3 files changed, 191 insertions(+)
 create mode 100755 calculate_average_mahadev-k.sh
 create mode 100755 prepare_mahadev-k.sh
 create mode 100644 src/main/java/dev/morling/onebrc/CalculateAverage_mahadev_k.java

diff --git a/calculate_average_mahadev-k.sh b/calculate_average_mahadev-k.sh
new file mode 100755
index 000000000..6f686be77
--- /dev/null
+++ b/calculate_average_mahadev-k.sh
@@ -0,0 +1,19 @@
+#!/bin/sh
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+JAVA_OPTS=""
+java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_mahadev_k
diff --git a/prepare_mahadev-k.sh b/prepare_mahadev-k.sh
new file mode 100755
index 000000000..4cda7b411
--- /dev/null
+++ b/prepare_mahadev-k.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+# Uncomment below to use sdk
+# source "$HOME/.sdkman/bin/sdkman-init.sh"
+# sdk use java 21.0.1-graal 1>&2
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_mahadev_k.java b/src/main/java/dev/morling/onebrc/CalculateAverage_mahadev_k.java
new file mode 100644
index 000000000..4d4ccd5b2
--- /dev/null
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_mahadev_k.java
@@ -0,0 +1,152 @@
+/*
+ *  Copyright 2023 The original authors
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package dev.morling.onebrc;
+
+import java.io.FileDescriptor;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.PrintStream;
+import java.io.RandomAccessFile;
+import java.io.UnsupportedEncodingException;
+import java.nio.ByteBuffer;
+import java.nio.charset.StandardCharsets;
+import java.util.Map;
+import java.util.StringTokenizer;
+import java.util.concurrent.ConcurrentSkipListMap;
+import java.util.concurrent.Executors;
+import java.util.concurrent.ThreadFactory;
+
+public class CalculateAverage_mahadev_k {
+
+    private static final String FILE = "./measurements.txt";
+
+    private static Map<String, MeasurementAggregator> stationMap = new ConcurrentSkipListMap<>();
+
+    private static double round(double value) {
+        return Math.round(value * 10.0) / 10.0;
+    }
+
+    private static class MeasurementAggregator {
+        double minima = Double.POSITIVE_INFINITY, maxima = Double.NEGATIVE_INFINITY, total = 0, count = 0;
+
+        public synchronized void accept(double value) {
+            if (minima > value)
+                minima = value;
+            if (maxima < value)
+                maxima = value;
+            total += value;
+            count++;
+        }
+
+        public double min() {
+            return round(minima);
+        }
+
+        public double max() {
+            return round(maxima);
+        }
+
+        public double avg() {
+            return round((Math.round(total * 10.0) / 10.0) / count);
+        }
+    }
+
+    public static void main(String[] args) throws IOException {
+        int chunkSize = args.length == 1 ? Integer.parseInt(args[0]) : 1_000_000;
+        readAndProcess(chunkSize);
+        print();
+    }
+
+    public static void readAndProcess(int chunkSize) {
+        final ThreadFactory factory = Thread.ofVirtual().name("routine-", 0).factory();
+
+        try (RandomAccessFile file = new RandomAccessFile(FILE, "r")) {
+            try (var executor = Executors.newThreadPerTaskExecutor(factory)) {
+
+                var channel = file.getChannel();
+                var size = channel.size();
+                long start = 0;
+                while (start <= size) {
+                    long end = start + chunkSize;
+                    String letter = "";
+                    do {
+                        end--;
+                        ByteBuffer buffer = ByteBuffer.allocate(1);
+                        channel.read(buffer, end);
+                        buffer.flip();
+                        letter = StandardCharsets.UTF_8.decode(buffer).toString();
+                    } while (!letter.equals("\n"));
+
+                    if (end < start)
+                        end = start + chunkSize;
+
+                    final long currentStart = start;
+                    final long currentEnd = end;
+                    executor.submit(() -> {
+                        ByteBuffer buffer = ByteBuffer.allocate((int) (currentEnd - currentStart + 1));
+                        try {
+                            channel.read(buffer, currentStart);
+                        }
+                        catch (IOException e) {
+                            e.printStackTrace();
+                        }
+                        buffer.flip();
+                        String data = StandardCharsets.UTF_8.decode(buffer).toString();
+                        processData(data);
+                    });
+                    start = end + 1;
+                }
+            }
+
+        }
+        catch (IOException e) {
+            e.printStackTrace();
+        }
+    }
+
+    public static void processData(String dataBlock) {
+        StringTokenizer tokenizer = new StringTokenizer(dataBlock, "\n");
+        while (tokenizer.hasMoreElements()) {
+            StringTokenizer tokens = new StringTokenizer(tokenizer.nextToken(), ";");
+            String station = tokens.nextToken();
+            double value = Double.parseDouble(tokens.nextToken());
+            processMinMaxMean(station, value);
+        }
+    }
+
+    private static void processMinMaxMean(String station, double temp) {
+        var values = stationMap.get(station);
+        if (values == null) {
+            values = new MeasurementAggregator();
+            stationMap.putIfAbsent(station, values);
+        }
+        values = stationMap.get(station);
+        values.accept(temp);
+    }
+
+    public static void print() throws UnsupportedEncodingException {
+        System.setOut(new PrintStream(new FileOutputStream(FileDescriptor.out), true, StandardCharsets.UTF_8));
+        System.out.print("{");
+        int i = stationMap.size();
+        for (var kv : stationMap.entrySet()) {
+            System.out.printf("%s=%s/%s/%s", kv.getKey(), kv.getValue().min(), kv.getValue().avg(), kv.getValue().max());
+            if (i > 1)
+                System.out.print(", ");
+            i--;
+        }
+        System.out.println("}");
+    }
+}
\ No newline at end of file

From 5dffd8e9b357851f32f7eb4a0d53d578d68857cc Mon Sep 17 00:00:00 2001
From: Gunnar Morling <gunnar.morling@googlemail.com>
Date: Sun, 28 Jan 2024 18:33:07 +0100
Subject: [PATCH 173/268] Leaderboard update

---
 README.md | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 87cbf8662..8a3185ae2 100644
--- a/README.md
+++ b/README.md
@@ -43,9 +43,9 @@ These are the results from running all entries into the challenge on eight cores
 |---|-----------------|--------------------|-----|---------------|-----------|
 | 1 | 00:01.893 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_thomaswue.java)| 21.0.2-graal | [Thomas Wuerthinger](https://github.com/thomaswue), [Quan Anh Mai](https://github.com/merykitty), [Alfonso² Peterssen](https://github.com/mukel) | GraalVM native binary, uses Unsafe |
 | 2 | 00:02.019 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java)| 21.0.2-graal | [Artsiom Korzun](https://github.com/artsiomkorzun) | GraalVM native binary, uses Unsafe |
-| 3* | 00:02.146 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java)| 21.0.2-graal | [Van Phu DO](https://github.com/abeobk) | GraalVM native binary, uses Unsafe |
-| 3* | 00:02.149 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jerrinot.java)| 21.0.2-graal | [Jaromir Hamala](https://github.com/jerrinot) | GraalVM native binary, uses Unsafe |
-| 3* | 00:02.157 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_royvanrijn.java)| 21.0.2-graal | [Roy van Rijn](https://github.com/royvanrijn) | GraalVM native binary, uses Unsafe |
+| 3 | 00:02.091 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java)| 21.0.2-graal | [Van Phu DO](https://github.com/abeobk) | GraalVM native binary, uses Unsafe |
+|   | 00:02.149 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jerrinot.java)| 21.0.2-graal | [Jaromir Hamala](https://github.com/jerrinot) | GraalVM native binary, uses Unsafe |
+|   | 00:02.157 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_royvanrijn.java)| 21.0.2-graal | [Roy van Rijn](https://github.com/royvanrijn) | GraalVM native binary, uses Unsafe |
 |   | 00:02.512 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_serkan-ozal.java)| 21.0.1-open | [Serkan ÖZAL](https://github.com/serkan-ozal) |  |
 |   | 00:02.575 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_merykittyunsafe.java)| 21.0.1-open | [Quan Anh Mai](https://github.com/merykitty) | uses Unsafe |
 |   | 00:02.984 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_yourwass.java)| 21.0.1-open | [yourwass](https://github.com/yourwass) | uses Unsafe |
@@ -135,7 +135,9 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:14.867 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_berry120.java)| 21.0.1-open | [Michael Berry](https://github.com/berry120) |  |
 |   | 00:15.662 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_semotpan.java)| 21.0.1-open | [Serghei Motpan](https://github.com/semotpan) |  |
 |   | 00:16.063 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_makohn.java)| 21.0.1-open | [Marek Kohn](https://github.com/makohn) |  |
+|   | 00:16.457 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_bytesfellow.java)| 21.0.1-open | [Aleksei](https://github.com/bytesfellow) |  |
 |   | 00:16.953 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_gauravdeshmukh.java)| 21.0.1-open | [Gaurav Anantrao Deshmukh](https://github.com/gauravdeshmukh) |  |
+|   | 00:17.046 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_dkarampi.java)| 21.0.1-open | [Dimitris Karampinas](https://github.com/dkarampi) |  |
 |   | 00:17.490 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_kgeri.java)| 21.0.1-open | [Gergely Kiss](https://github.com/kgeri) |  |
 |   | 00:17.255 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_tkosachev.java)| 21.0.1-open | [tkosachev](https://github.com/tkosachev) |  |
 |   | 00:17.520 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_faridtmammadov.java)| 21.0.1-open | [Farid](https://github.com/faridtmammadov) |  |
@@ -181,6 +183,7 @@ These are the results from running all entries into the challenge on eight cores
 |   | 01:14.815 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_anandmattikopp.java)| 21.0.1-open | [twohardthings](https://github.com/anandmattikopp) |  |
 |   | 01:25.801 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ivanklaric.java)| 21.0.1-open | [ivanklaric](https://github.com/ivanklaric) |  |
 |   | 01:33.594 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_gnmathur.java)| 21.0.1-open | [Gaurav Mathur](https://github.com/gnmathur) |  |
+|   | 01:53.208 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_mahadev_k.java)| java | [Mahadev K](https://github.com/mahadev-k) |  |
 |   | 01:56.607 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_abfrmblr.java)| 21.0.1-open | [Abhilash](https://github.com/abfrmblr) |  |
 |   | 03:43.521 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_yehwankim23.java)| 21.0.1-open | [김예환 Ye-Hwan Kim (Sam)](https://github.com/yehwankim23) |  |
 |   | 03:59.760 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_fragmede.java)| 21.0.1-open | [Samson](https://github.com/fragmede) |  |

From baed56bcdbaabad34f3296dc6675fd3d14781b6a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Aleksey=20Shipil=C3=ABv?= <shipilev@amazon.de>
Date: Sun, 28 Jan 2024 18:36:22 +0100
Subject: [PATCH 174/268] Version 4 (#183)

---
 calculate_average_shipilev.sh                 |  26 +
 .../onebrc/CalculateAverage_shipilev.java     | 649 ++++++++++++++++++
 2 files changed, 675 insertions(+)
 create mode 100755 calculate_average_shipilev.sh
 create mode 100644 src/main/java/dev/morling/onebrc/CalculateAverage_shipilev.java

diff --git a/calculate_average_shipilev.sh b/calculate_average_shipilev.sh
new file mode 100755
index 000000000..5d9f6334c
--- /dev/null
+++ b/calculate_average_shipilev.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+JAVA_OPTS="-XX:+UnlockExperimentalVMOptions -XX:+UseEpsilonGC -Xms64m -Xmx64m -XX:+AlwaysPreTouch -XX:+UseTransparentHugePages
+-XX:-TieredCompilation -XX:CICompilerCount=2 -XX:-UseCountedLoopSafepoints -XX:+TrustFinalNonStaticFields
+--add-opens java.base/java.nio=ALL-UNNAMED --add-exports java.base/jdk.internal.ref=ALL-UNNAMED
+-XX:+UnlockDiagnosticVMOptions -XX:CompileCommand=quiet
+-XX:CompileCommand=dontinline,dev.morling.onebrc.CalculateAverage_shipilev\$ParsingTask::seqCompute
+-XX:CompileCommand=dontinline,dev.morling.onebrc.CalculateAverage_shipilev\$MeasurementsMap::updateSlow
+-XX:CompileCommand=inline,dev.morling.onebrc.CalculateAverage_shipilev::nameMatches
+-XX:CompileThreshold=2048"
+java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_shipilev
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_shipilev.java b/src/main/java/dev/morling/onebrc/CalculateAverage_shipilev.java
new file mode 100644
index 000000000..49989864c
--- /dev/null
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_shipilev.java
@@ -0,0 +1,649 @@
+/*
+ *  Copyright 2023 The original authors
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package dev.morling.onebrc;
+
+import java.io.IOException;
+import java.lang.reflect.InaccessibleObjectException;
+import java.lang.reflect.InvocationTargetException;
+import java.lang.reflect.Method;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.nio.MappedByteBuffer;
+import java.nio.channels.FileChannel;
+import java.nio.file.Path;
+import java.nio.file.StandardOpenOption;
+import java.util.Arrays;
+import java.util.concurrent.*;
+import java.util.function.Supplier;
+
+public class CalculateAverage_shipilev {
+
+    // This might not be the fastest implementation one can do.
+    // When working on this implementation, I set the bar as follows.
+    //
+    // This implementation uses vanilla and standard Java as much as possible,
+    // without relying on Unsafe tricks and preview features. If and when
+    // those are used, they should be guarded by a feature flag. This would
+    // allow running vanilla implementation if anything goes off the rails.
+    //
+    // This implementation also covers the realistic scenario: the I/O is
+    // actually slow and jittery. To that end, making sure we can feed
+    // the parsing code under slow I/O is as important as getting the
+    // parsing fast. Current evaluation env keeps the input data on RAM disk,
+    // which hides this important part.
+
+    // ========================= Tunables =========================
+
+    // Workload data file.
+    private static final String FILE = "./measurements.txt";
+
+    // Max distance to search for line separator when scanning for line
+    // boundaries. 100 bytes name should fit into this power-of-two buffer.
+    // Should probably never change.
+    private static final int MAX_LINE_LENGTH = 128;
+
+    // Fixed size of the measurements map. Must be the power of two. Should
+    // be large enough to accommodate all the station names. Rules say there are
+    // 10K station names max, so anything >> 16K works well.
+    private static final int MAP_SIZE = 1 << 15;
+
+    // The largest mmap-ed chunk. This can be Integer.MAX_VALUE, but
+    // it is normally tuned down to seed the workers with smaller mmap regions
+    // more efficiently.
+    private static final int MMAP_CHUNK_SIZE = Integer.MAX_VALUE / 32;
+
+    // The largest slice as unit of work, processed serially by a worker.
+    // Set it too low and there would be more tasks and less batching, but
+    // more parallelism. Set it too high, and the reverse would be true.
+    private static final int UNIT_SLICE_SIZE = 4 * 1024 * 1024;
+
+    // Employ direct unmapping techniques to alleviate the cost of system
+    // unmapping on process termination. This matters for very short runs
+    // on highly parallel machines. This unfortunately calls into private
+    // methods of buffers themselves. If not available on target JVM, the
+    // feature would automatically turn off.
+    private static final boolean DIRECT_UNMMAPS = true;
+
+    // ========================= Storage =========================
+
+    // Thread-local measurement maps, each thread gets one.
+    // Even though crude, avoid lambdas here to alleviate startup costs.
+    private static final ThreadLocal<MeasurementsMap> MAPS = ThreadLocal.withInitial(new Supplier<>() {
+        @Override
+        public MeasurementsMap get() {
+            MeasurementsMap m = new MeasurementsMap();
+            ALL_MAPS.add(m);
+            return m;
+        }
+    });
+
+    // After worker threads finish, the data is available here. One just needs
+    // to merge it a little.
+    private static final ConcurrentLinkedQueue<MeasurementsMap> ALL_MAPS = new ConcurrentLinkedQueue<>();
+
+    // Releasable mmaped buffers that workers are done with. These can be un-mapped
+    // in background. Part of the protocol to shut down the background activity is to
+    // issue the poison pill.
+    private static final LinkedBlockingQueue<ByteBuffer> RELEASABLE_BUFFERS = new LinkedBlockingQueue<>();
+    private static final ByteBuffer RELEASABLE_BUFFER_POISON_PILL = ByteBuffer.allocate(1);
+
+    // ========================= MEATY GRITTY PARTS: PARSE AND AGGREGATE =========================
+
+    // Little helper method to compare the array with given bytebuffer range.
+    public static boolean nameMatches(Bucket bucket, ByteBuffer cand, int begin, int end) {
+        byte[] orig = bucket.name;
+        int origLen = orig.length;
+        int candLen = end - begin;
+        if (origLen != candLen) {
+            return false;
+        }
+
+        // Check the tails first, to simplify the matches.
+        if (origLen >= 8) {
+            if (bucket.tail1 != cand.getLong(end - 8)) {
+                return false;
+            }
+            if (origLen >= 16) {
+                if (bucket.tail2 != cand.getLong(end - 16)) {
+                    return false;
+                }
+                origLen -= 16;
+            }
+            else {
+                origLen -= 8;
+            }
+        }
+
+        // Check the rest.
+        for (int i = 0; i < origLen; i++) {
+            if (orig[i] != cand.get(begin + i)) {
+                return false;
+            }
+        }
+        return true;
+    }
+
+    public static final class Bucket {
+        // Raw station name, its hash, and tails.
+        public final byte[] name;
+        public final int hash;
+        public final long tail1, tail2;
+
+        // Temperature values, in 10x scale.
+        public long sum;
+        public int count;
+        public int min;
+        public int max;
+
+        public Bucket(byte[] name, long tail1, long tail2, int hash, int temp) {
+            this.name = name;
+            this.tail1 = tail1;
+            this.tail2 = tail2;
+            this.hash = hash;
+            this.sum = temp;
+            this.count = 1;
+            this.min = temp;
+            this.max = temp;
+        }
+
+        public void merge(int value) {
+            sum += value;
+            count++;
+            if (value < min) {
+                min = value;
+            }
+            if (value > max) {
+                max = value;
+            }
+        }
+
+        public void merge(Bucket s) {
+            sum += s.sum;
+            count += s.count;
+            min = Math.min(min, s.min);
+            max = Math.max(max, s.max);
+        }
+
+        public Row toRow() {
+            return new Row(
+                    new String(name),
+                    Math.round((double) min) / 10.0,
+                    Math.round((double) sum / count) / 10.0,
+                    Math.round((double) max) / 10.0);
+        }
+    }
+
+    // Quick and dirty linear-probing hash map. YOLO.
+    public static final class MeasurementsMap {
+        // Individual map buckets. Inlining these straight into map complicates
+        // the implementation without the sensible performance improvement.
+        // The map is likely sparse, so whatever footprint loss we have due to
+        // Bucket headers we gain by allocating the buckets lazily. The memory
+        // dereference costs are still high in both cases. The additional benefit
+        // for explicit fields in Bucket is that we only need to pay for a single
+        // null-check on bucket instead of multiple range-checks on inlined array.
+        private final Bucket[] buckets = new Bucket[MAP_SIZE];
+
+        // Fast path is inlined in seqCompute. This is a slow-path that is taken
+        // when something is off. We normally do not enter here.
+        private void updateSlow(ByteBuffer name, int begin, int end, int hash, int temp) {
+            int idx = hash & (MAP_SIZE - 1);
+
+            while (true) {
+                Bucket cur = buckets[idx];
+                if (cur == null) {
+                    // No bucket yet, lucky us. Lookup the name and create the bucket with it.
+                    // We are checking the names on hot-path. Therefore, it is convenient
+                    // to keep allocation for names near the buckets.
+                    int len = end - begin;
+                    byte[] copy = new byte[len];
+                    name.get(begin, copy, 0, len);
+
+                    // Also pick up any tail to simplify future matches.
+                    long tail1 = (len < 8) ? 0 : name.getLong(begin + len - 8);
+                    long tail2 = (len < 16) ? 0 : name.getLong(begin + len - 16);
+
+                    buckets[idx] = new Bucket(copy, tail1, tail2, hash, temp);
+                    return;
+                }
+                else if ((cur.hash == hash) && nameMatches(cur, name, begin, end)) {
+                    // Same as bucket fastpath. Check for collision by checking the full hash
+                    // first (since the index is truncated by map size), and then the exact name.
+                    cur.merge(temp);
+                    return;
+                }
+                else {
+                    // No dice. Keep searching.
+                    idx = (idx + 1) & (MAP_SIZE - 1);
+                }
+            }
+        }
+
+        // Same as updateSlow(), really, but for merging maps. See the comments there.
+        public void merge(MeasurementsMap otherMap) {
+            for (Bucket other : otherMap.buckets) {
+                if (other == null)
+                    continue;
+                int idx = other.hash & (MAP_SIZE - 1);
+                while (true) {
+                    Bucket cur = buckets[idx];
+                    if (cur == null) {
+                        buckets[idx] = other;
+                        break;
+                    }
+                    else if ((cur.hash == other.hash) && Arrays.equals(cur.name, other.name)) {
+                        cur.merge(other);
+                        break;
+                    }
+                    else {
+                        idx = (idx + 1) & (MAP_SIZE - 1);
+                    }
+                }
+            }
+        }
+
+        // Convert from internal representation to the rows.
+        // This does several major things: filters away null-s, instantiates full Strings,
+        // and computes stats.
+        public int fill(Row[] rows) {
+            int idx = 0;
+            for (Bucket bucket : buckets) {
+                if (bucket == null)
+                    continue;
+                rows[idx++] = bucket.toRow();
+            }
+            return idx;
+        }
+    }
+
+    // The heavy-weight, where most of the magic happens. This is not a usual
+    // RecursiveAction, but rather a CountedCompleter in order to be more robust
+    // in presence of I/O stalls and other scheduling irregularities.
+    public static final class ParsingTask extends CountedCompleter<Void> {
+        private final MappedByteBuffer mappedBuf;
+        private final ByteBuffer buf;
+
+        public ParsingTask(CountedCompleter<Void> p, MappedByteBuffer mappedBuf) {
+            super(p);
+            this.mappedBuf = mappedBuf;
+            this.buf = mappedBuf;
+        }
+
+        public ParsingTask(CountedCompleter<Void> p, ByteBuffer buf) {
+            super(p);
+            this.mappedBuf = null;
+            this.buf = buf;
+        }
+
+        @Override
+        public void compute() {
+            try {
+                internalCompute();
+            }
+            catch (Exception e) {
+                // Meh, YOLO.
+                e.printStackTrace();
+                throw new IllegalStateException("Internal error", e);
+            }
+        }
+
+        @Override
+        public void onCompletion(CountedCompleter<?> caller) {
+            if (DIRECT_UNMMAPS && (mappedBuf != null)) {
+                RELEASABLE_BUFFERS.offer(mappedBuf);
+            }
+        }
+
+        private void internalCompute() throws Exception {
+            int len = buf.limit();
+            if (len > UNIT_SLICE_SIZE) {
+                // Split in half.
+                int mid = len / 2;
+
+                // Figure out the boundary that does not split the line.
+                int w = mid + MAX_LINE_LENGTH;
+                while (buf.get(w - 1) != '\n') {
+                    w--;
+                }
+                mid = w;
+
+                // Fork out! The first half is handed to the pool, where any
+                // idle worker can pick it up.
+                // FJP API: Tell there is a pending task.
+                setPendingCount(1);
+                new ParsingTask(this, buf.slice(0, mid)).fork();
+
+                // The stack depth would be shallow enough for us to
+                // execute one of the computations directly.
+                new ParsingTask(this, buf.slice(mid, len - mid)).compute();
+            }
+            else {
+                // The call to seqCompute would normally be non-inlined.
+                // Do setup stuff here to save inlining budget.
+                MeasurementsMap map = MAPS.get();
+
+                // Force the order we need for bit extraction to work. This fits
+                // most of the hardware very well without introducing platform
+                // dependencies.
+                buf.order(ByteOrder.LITTLE_ENDIAN);
+
+                // Go!
+                seqCompute(map, buf, len);
+
+                // FJP API: Notify that this task has completed.
+                tryComplete();
+            }
+        }
+
+        private void seqCompute(MeasurementsMap map, ByteBuffer origSlice, int length) throws IOException {
+            Bucket[] buckets = map.buckets;
+
+            // Slice up our slice! Peculiar note here: this instantiates a full new buffer
+            // object, which allows compiler to trust its fields more thoroughly.
+            ByteBuffer slice = origSlice.slice();
+
+            // Do the same endianness as the original slice.
+            slice.order(ByteOrder.LITTLE_ENDIAN);
+
+            // Touch the buffer once to let the common checks fire once for this slice.
+            slice.get(0);
+
+            int idx = 0;
+            while (idx < length) {
+                // Parse out the name, computing the hash on the fly.
+                // Reading with ints allows us to guarantee that read would always
+                // be in bounds, since the temperature+EOL is at least 4 bytes
+                // long themselves. This implementation prefers simplicity over
+                // advanced tricks like SWAR.
+                int nameBegin = idx;
+                int nameHash = 0;
+
+                outer: while (true) {
+                    int intName = slice.getInt(idx);
+                    for (int c = 0; c < 4; c++) {
+                        int b = (intName >> (c << 3)) & 0xFF;
+                        if (b == ';') {
+                            idx += c + 1;
+                            break outer;
+                        }
+                        nameHash ^= b * 82805;
+                    }
+                    idx += 4;
+                }
+                int nameEnd = idx - 1;
+
+                // Parse out the temperature. The rules specify temperatures
+                // are within -99.9..99.9. We implicitly look ahead for
+                // negative sign and carry the negative multiplier, if found.
+                // After that, we just need to reconstruct the temperature from
+                // two or three digits. The aggregation code expects temperatures
+                // at 10x scale.
+
+                int intTemp = slice.getInt(idx);
+
+                int neg = 1;
+                if ((intTemp & 0xFF) == '-') {
+                    // Unlucky, there is a sign. Record it, shift one byte and read
+                    // the remaining digit again. Surprisingly, doing a second read
+                    // is not worse than reading into long and trying to do bit
+                    // shifts on it.
+                    neg = -1;
+                    intTemp >>>= 8;
+                    intTemp |= slice.get(idx + 4) << 24;
+                    idx++;
+                }
+
+                // Since the sign is consumed, we are only left with two cases:
+                int temp = 0;
+                if ((intTemp >>> 24) == '\n') {
+                    // EOL-digitL-point-digitH
+                    temp = (((intTemp & 0xFF)) - '0') * 10 +
+                            ((intTemp >> 16) & 0xFF) - '0';
+                    idx += 4;
+                }
+                else {
+                    // digitL-point-digitH-digitHH
+                    temp = (((intTemp & 0xFF)) - '0') * 100 +
+                            (((intTemp >> 8) & 0xFF) - '0') * 10 +
+                            (((intTemp >>> 24)) - '0');
+                    idx += 5;
+                }
+                temp *= neg;
+
+                // Time to update!
+                Bucket bucket = buckets[nameHash & (MAP_SIZE - 1)];
+                if ((bucket != null) && (nameHash == bucket.hash) && nameMatches(bucket, slice, nameBegin, nameEnd)) {
+                    // Lucky fast path, existing bucket hit. Most of the time we complete here.
+                    bucket.merge(temp);
+                }
+                else {
+                    // Unlucky, slow path. The method would not be inlined, it is useful
+                    // to give it the original slice, so that we keep current hot slice
+                    // metadata provably unmodified.
+                    map.updateSlow(origSlice, nameBegin, nameEnd, nameHash, temp);
+                }
+            }
+        }
+    }
+
+    // Fork out the initial tasks. We would normally just fork out one large
+    // task and let it split, but unfortunately buffer API does not allow us
+    // "long" start-s and length-s. So we have to chunk at least by mmap-ed
+    // size first. It is a CountedCompleter for the same reason ParsingTask is.
+    // This also gives us a very nice opportunity to complete the work on
+    // a given mmap slice, while there is still other work to do. This allows
+    // us to unmap slices on the go.
+    public static final class RootTask extends CountedCompleter<Void> {
+        public RootTask(CountedCompleter<Void> parent) {
+            super(parent);
+        }
+
+        @Override
+        public void compute() {
+            try {
+                internalCompute();
+            }
+            catch (Exception e) {
+                // Meh, YOLO.
+                e.printStackTrace();
+                throw new IllegalStateException("Internal error", e);
+            }
+        }
+
+        private void internalCompute() throws Exception {
+            ByteBuffer buf = ByteBuffer.allocateDirect(MAX_LINE_LENGTH);
+            FileChannel fc = FileChannel.open(Path.of(FILE), StandardOpenOption.READ);
+
+            long start = 0;
+            long size = fc.size();
+            while (start < size) {
+                long end = Math.min(size, start + MMAP_CHUNK_SIZE);
+
+                // Read a little chunk into a little buffer.
+                long minEnd = Math.max(0, end - MAX_LINE_LENGTH);
+                buf.rewind();
+                fc.read(buf, minEnd);
+
+                // Figure out the boundary that does not split the line.
+                int w = MAX_LINE_LENGTH;
+                while (buf.get(w - 1) != '\n') {
+                    w--;
+                }
+                end = minEnd + w;
+
+                // Fork out the large slice
+                long len = end - start;
+                MappedByteBuffer slice = fc.map(FileChannel.MapMode.READ_ONLY, start, len);
+                start += len;
+
+                // FJP API: Announce we have a pending task before forking.
+                addToPendingCount(1);
+
+                // ...and fork it
+                new ParsingTask(this, slice).fork();
+            }
+
+            // All mappings are up, can close the channel now.
+            fc.close();
+
+            // FJP API: We have finished, try to complete the whole task tree.
+            propagateCompletion();
+        }
+
+        @Override
+        public void onCompletion(CountedCompleter<?> caller) {
+            try {
+                RELEASABLE_BUFFERS.put(RELEASABLE_BUFFER_POISON_PILL);
+            }
+            catch (Exception e) {
+                throw new IllegalStateException(e);
+            }
+        }
+    }
+
+    // ========================= Invocation =========================
+
+    public static void main(String[] args) throws Exception {
+        // This little line carries the whole world
+        new RootTask(null).fork();
+
+        // While the root task is working, prepare what we need for the
+        // end of the run. Go and try to report something to prepare the
+        // reporting code for execution.
+        MeasurementsMap map = new MeasurementsMap();
+        Row[] rows = new Row[MAP_SIZE];
+        StringBuilder sb = new StringBuilder(16384);
+
+        report(map, rows, sb);
+        sb.setLength(0);
+
+        // Nothing else is left to do preparation-wise. Now see if we can clean up
+        // buffers that tasks do not need anymore. The root task would communicate
+        // that it is done by giving us a poison pill.
+        ByteBuffer buf;
+        while ((buf = RELEASABLE_BUFFERS.take()) != RELEASABLE_BUFFER_POISON_PILL) {
+            DirectUnmaps.invokeCleaner(buf);
+        }
+
+        // All done. Merge results from thread-local maps...
+        for (MeasurementsMap m : ALL_MAPS) {
+            map.merge(m);
+        }
+
+        // ...and truly report them
+        System.out.println(report(map, rows, sb));
+    }
+
+    private static String report(MeasurementsMap map, Row[] rows, StringBuilder sb) {
+        int rowCount = map.fill(rows);
+        Arrays.sort(rows, 0, rowCount);
+
+        sb.append("{");
+        boolean first = true;
+        for (int c = 0; c < rowCount; c++) {
+            if (c != 0) {
+                sb.append(", ");
+            }
+            rows[c].printTo(sb);
+        }
+        sb.append("}");
+        return sb.toString();
+    }
+
+    // ========================= Reporting =========================
+
+    private static final class Row implements Comparable<Row> {
+        private final String name;
+        private final double min;
+        private final double max;
+        private final double avg;
+
+        public Row(String name, double min, double avg, double max) {
+            this.name = name;
+            this.min = min;
+            this.max = max;
+            this.avg = avg;
+        }
+
+        @Override
+        public int compareTo(Row o) {
+            return name.compareTo(o.name);
+        }
+
+        public void printTo(StringBuilder sb) {
+            sb.append(name);
+            sb.append("=");
+            sb.append(min);
+            sb.append("/");
+            sb.append(avg);
+            sb.append("/");
+            sb.append(max);
+        }
+    }
+
+    // ========================= Utils =========================
+
+    // Tries to figure out if calling Cleaner directly on the DirectByteBuffer
+    // is possible. If this fails, we still go on.
+    public static class DirectUnmaps {
+        private static final Method METHOD_GET_CLEANER;
+        private static final Method METHOD_CLEANER_CLEAN;
+
+        static Method getCleaner() {
+            try {
+                ByteBuffer dbb = ByteBuffer.allocateDirect(1);
+                Method m = dbb.getClass().getMethod("cleaner");
+                m.setAccessible(true);
+                return m;
+            }
+            catch (NoSuchMethodException | InaccessibleObjectException e) {
+                return null;
+            }
+        }
+
+        static Method getCleanerClean(Method methodGetCleaner) {
+            try {
+                ByteBuffer dbb = ByteBuffer.allocateDirect(1);
+                Object cleaner = methodGetCleaner.invoke(dbb);
+                Method m = cleaner.getClass().getMethod("clean");
+                m.setAccessible(true);
+                m.invoke(cleaner);
+                return m;
+            }
+            catch (NoSuchMethodException | IllegalAccessException | InvocationTargetException | InaccessibleObjectException e) {
+                return null;
+            }
+        }
+
+        static {
+            METHOD_GET_CLEANER = getCleaner();
+            METHOD_CLEANER_CLEAN = (METHOD_GET_CLEANER != null) ? getCleanerClean(METHOD_GET_CLEANER) : null;
+        }
+
+        public static void invokeCleaner(ByteBuffer bb) {
+            if (METHOD_GET_CLEANER == null || METHOD_CLEANER_CLEAN == null) {
+                return;
+            }
+            try {
+                METHOD_CLEANER_CLEAN.invoke(METHOD_GET_CLEANER.invoke(bb));
+            }
+            catch (InvocationTargetException | IllegalAccessException e) {
+                throw new IllegalStateException("Cannot happen at this point", e);
+            }
+        }
+    }
+
+}

From 2bb74fe071ed3489d315c58b9bb53c29c205e373 Mon Sep 17 00:00:00 2001
From: Gunnar Morling <gunnar.morling@googlemail.com>
Date: Sun, 28 Jan 2024 18:37:11 +0100
Subject: [PATCH 175/268] Leaderboard update

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 8a3185ae2..2c2c8f8c3 100644
--- a/README.md
+++ b/README.md
@@ -76,6 +76,7 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:05.478 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_obourgain.java)| 21.0.1-open | [Olivier Bourgain](https://github.com/obourgain) | uses Unsafe |
 |   | 00:05.705 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_gamlerhart.java)| 21.0.1-open | [Roman Stoffel](https://github.com/gamlerhart) |  |
 |   | 00:05.709 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_armandino.java)| 21.0.2-graal | [Arman Sharif](https://github.com/armandino) | GraalVM native binary, uses Unsafe |
+|   | 00:05.850 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_shipilev.java)| 21.0.1-open | [Aleksey Shipilëv](https://github.com/shipilev) |  |
 |   | 00:05.887 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_charlibot.java)| 21.0.1-graal | [Charlie Evans](https://github.com/charlibot) | uses Unsafe |
 |   | 00:05.960 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_vaidhy.java)| 21.0.1-graal | [Vaidhy Mayilrangam](https://github.com/vaidhy) | uses Unsafe |
 |   | 00:05.971 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_melgenek.java)| 21.0.2-open | [Yevhenii Melnyk](https://github.com/melgenek) |  |

From f98304e4a71918b5723e5f4e9c330f8c9372e9b8 Mon Sep 17 00:00:00 2001
From: Gunnar Morling <gunnar.morling@googlemail.com>
Date: Sun, 28 Jan 2024 20:43:16 +0100
Subject: [PATCH 176/268] Fixing leaderboard entries with difference between
 user name and class name

---
 README.md | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index 2c2c8f8c3..2d8d1dbce 100644
--- a/README.md
+++ b/README.md
@@ -46,13 +46,13 @@ These are the results from running all entries into the challenge on eight cores
 | 3 | 00:02.091 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java)| 21.0.2-graal | [Van Phu DO](https://github.com/abeobk) | GraalVM native binary, uses Unsafe |
 |   | 00:02.149 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jerrinot.java)| 21.0.2-graal | [Jaromir Hamala](https://github.com/jerrinot) | GraalVM native binary, uses Unsafe |
 |   | 00:02.157 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_royvanrijn.java)| 21.0.2-graal | [Roy van Rijn](https://github.com/royvanrijn) | GraalVM native binary, uses Unsafe |
-|   | 00:02.512 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_serkan-ozal.java)| 21.0.1-open | [Serkan ÖZAL](https://github.com/serkan-ozal) |  |
+|   | 00:02.512 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_serkan_ozal.java)| 21.0.1-open | [Serkan ÖZAL](https://github.com/serkan-ozal) | uses Unsafe |
 |   | 00:02.575 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_merykittyunsafe.java)| 21.0.1-open | [Quan Anh Mai](https://github.com/merykitty) | uses Unsafe |
 |   | 00:02.984 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_yourwass.java)| 21.0.1-open | [yourwass](https://github.com/yourwass) | uses Unsafe |
 |   | 00:03.258 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_merykitty.java)| 21.0.1-open | [Quan Anh Mai](https://github.com/merykitty) |  |
 |   | 00:03.298 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_vemanaNonIdiomatic.java)| 21.0.1-graal | [Subrahmanyam (non-idiomatic)](https://github.com/vemana) | uses Unsafe |
 |   | 00:03.376 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_mtopolnik.java)| 21.0.1-graal | [Marko Topolnik](https://github.com/mtopolnik) | uses Unsafe |
-|   | 00:03.431 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_roman_r_m.java)| 21.0.1-graal | [Roman Musin](https://github.com/roman-r-m) | GraalVM native binary |
+|   | 00:03.431 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_roman_r_m.java)| 21.0.1-graal | [Roman Musin](https://github.com/roman-r-m) | GraalVM native binary, uses Unsafe |
 |   | 00:03.518 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_JamalMulla.java)| 21.0.1-graal | [Jamal Mulla](https://github.com/JamalMulla) | GraalVM native binary, uses Unsafe |
 |   | 00:03.594 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_yavuztas.java)| 21.0.2-graal | [Yavuz Tas](https://github.com/yavuztas) | GraalVM native binary, uses Unsafe |
 |   | 00:03.698 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_hundredwatt.java)| 21.0.1-graal | [Jason Nochlin](https://github.com/hundredwatt) |  |
@@ -68,7 +68,7 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:04.741 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_cliffclick.java)| 21.0.1-open | [Cliff Click](https://github.com/cliffclick) | uses Unsafe |
 |   | 00:04.800 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_parkertimmins.java)| 21.0.1-open | [Parker Timmins](https://github.com/parkertimmins) |  |
 |   | 00:04.920 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_vemana.java)| 21.0.1-graal | [Subrahmanyam](https://github.com/vemana) |  |
-|   | 00:05.077 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jonathan-aotearoa.java)| 21.0.2-graal | [Jonathan Wright](https://github.com/jonathan-aotearoa) | GraalVM native binary |
+|   | 00:05.077 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jonathan-aotearoa.java)| 21.0.2-graal | [Jonathan Wright](https://github.com/jonathan-aotearoa) | GraalVM native binary, uses Unsafe |
 |   | 00:05.142 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_arjenw.java)| 21.0.1-open | [Arjen Wisse](https://github.com/arjenw) |  |
 |   | 00:05.235 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_unbounded.java)| 21.0.1-open | [unbounded](https://github.com/unbounded) |  |
 |   | 00:05.336 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_plevart.java)| 21.0.1-tem | [Peter Levart](https://github.com/plevart) |  |
@@ -101,7 +101,7 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:08.489 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_gnabyl.java)| 21.0.1-graal | [Bang NGUYEN](https://github.com/gnabyl) |  |
 |   | 00:08.517 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ags313.java)| 21.0.1-graal | [ags](https://github.com/ags313) | uses Unsafe |
 |   | 00:08.557 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_adriacabeza.java)| 21.0.1-graal | [Adrià Cabeza](https://github.com/adriacabeza) |  |
-|   | 00:08.622 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_kuduwa-keshavram.java)| 21.0.1-graal | [Keshavram Kuduwa](https://github.com/kuduwa-keshavram) |  |
+|   | 00:08.622 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_kuduwa_keshavram.java)| 21.0.1-graal | [Keshavram Kuduwa](https://github.com/kuduwa-keshavram) | uses Unsafe |
 |   | 00:08.752 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_anitasv.java)| 21.0.1-graal | [Anita SV](https://github.com/anitasv) |  |
 |   | 00:08.892 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_fatroom.java)| 21.0.1-open | [Roman Romanchuk](https://github.com/fatroom) |  |
 |   | 00:09.020 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_yemreinci.java)| 21.0.1-open | [yemreinci](https://github.com/yemreinci) |  |
@@ -120,7 +120,7 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:11.405 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_imrafaelmerino.java)| 21.0.1-graal | [Rafael Merino García](https://github.com/imrafaelmerino) |  |
 |   | 00:11.406 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_gabrielfoo.java)| 21.0.1-graal | [gabrielfoo](https://github.com/gabrielfoo) |  |
 |   | 00:11.433 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jatingala.java)| 21.0.1-graal | [Jatin Gala](https://github.com/jatingala) |  |
-|   | 00:11.505 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_dmitry-midokura.java)| 21.0.1-open | [Dmitry Bufistov](https://github.com/dmitry-midokura) |  |
+|   | 00:11.505 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_bufistov.java)| 21.0.1-open | [Dmitry Bufistov](https://github.com/dmitry-midokura) | uses Unsafe |
 |   | 00:11.805 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_coolmineman.java)| 21.0.1-graal | [Cool_Mineman](https://github.com/coolmineman) |  |
 |   | 00:11.934 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_arjenvaneerde.java)| 21.0.1-open | [arjenvaneerde](https://github.com/arjenvaneerde) |  |
 |   | 00:12.220 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_richardstartin.java)| 21.0.1-open | [Richard Startin](https://github.com/richardstartin) |  |
@@ -171,7 +171,7 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:42.893 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_javamak.java)| 21.0.1-open | [javamak](https://github.com/javamak) |  |
 |   | 00:46.597 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_maeda6uiui.java)| 21.0.1-open | [Maeda-san](https://github.com/maeda6uiui) |  |
 |   | 00:58.811 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_Ujjwalbharti.java)| 21.0.1-open | [Ujjwal Bharti](https://github.com/Ujjwalbharti) |  |
-|   | 01:05.094 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_mudit-saxena.java)| 21.0.1-open | [Mudit Saxena](https://github.com/mudit-saxena) |  |
+|   | 01:05.094 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_muditsaxena.java)| 21.0.1-open | [Mudit Saxena](https://github.com/mudit-saxena) |  |
 |   | 01:05.979 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_dqhieuu.java)| 21.0.1-graal | [Hieu Dao Quang](https://github.com/dqhieuu) |  |
 |   | 01:06.790 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_khmarbaise.java)| 21.0.1-open | [Karl Heinz Marbaise](https://github.com/khmarbaise) |  |
 |   | 01:06.944 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_santanu.java)| 21.0.1-open | [santanu](https://github.com/santanu) |  |
@@ -184,7 +184,7 @@ These are the results from running all entries into the challenge on eight cores
 |   | 01:14.815 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_anandmattikopp.java)| 21.0.1-open | [twohardthings](https://github.com/anandmattikopp) |  |
 |   | 01:25.801 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ivanklaric.java)| 21.0.1-open | [ivanklaric](https://github.com/ivanklaric) |  |
 |   | 01:33.594 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_gnmathur.java)| 21.0.1-open | [Gaurav Mathur](https://github.com/gnmathur) |  |
-|   | 01:53.208 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_mahadev-k.java)| java | [Mahadev K](https://github.com/mahadev-k) |  |
+|   | 01:53.208 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_mahadev_k.java)| java | [Mahadev K](https://github.com/mahadev-k) |  |
 |   | 01:56.607 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_abfrmblr.java)| 21.0.1-open | [Abhilash](https://github.com/abfrmblr) |  |
 |   | 03:43.521 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_yehwankim23.java)| 21.0.1-open | [김예환 Ye-Hwan Kim (Sam)](https://github.com/yehwankim23) |  |
 |   | 03:59.760 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_fragmede.java)| 21.0.1-open | [Samson](https://github.com/fragmede) |  |

From bb9bc68e4111f5a67ff3931d634bd8815ad9c97a Mon Sep 17 00:00:00 2001
From: Gunnar Morling <gunnar.morling@googlemail.com>
Date: Sun, 28 Jan 2024 21:55:52 +0100
Subject: [PATCH 177/268] Fixing link

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 2d8d1dbce..4b1113776 100644
--- a/README.md
+++ b/README.md
@@ -68,7 +68,7 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:04.741 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_cliffclick.java)| 21.0.1-open | [Cliff Click](https://github.com/cliffclick) | uses Unsafe |
 |   | 00:04.800 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_parkertimmins.java)| 21.0.1-open | [Parker Timmins](https://github.com/parkertimmins) |  |
 |   | 00:04.920 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_vemana.java)| 21.0.1-graal | [Subrahmanyam](https://github.com/vemana) |  |
-|   | 00:05.077 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jonathan-aotearoa.java)| 21.0.2-graal | [Jonathan Wright](https://github.com/jonathan-aotearoa) | GraalVM native binary, uses Unsafe |
+|   | 00:05.077 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jonathanaotearoa.java)| 21.0.2-graal | [Jonathan Wright](https://github.com/jonathan-aotearoa) | GraalVM native binary, uses Unsafe |
 |   | 00:05.142 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_arjenw.java)| 21.0.1-open | [Arjen Wisse](https://github.com/arjenw) |  |
 |   | 00:05.235 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_unbounded.java)| 21.0.1-open | [unbounded](https://github.com/unbounded) |  |
 |   | 00:05.336 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_plevart.java)| 21.0.1-tem | [Peter Levart](https://github.com/plevart) |  |

From 3a790c99b9d8c4c4ede49f73503e8ba3bf0a64aa Mon Sep 17 00:00:00 2001
From: Dr Ian Preston <157221403+ianopolousfast@users.noreply.github.com>
Date: Sun, 28 Jan 2024 21:39:17 +0000
Subject: [PATCH 178/268] Reduce preferred vector size (#622)

Co-authored-by: Ian Preston <ianopolous@protonmail.com>
---
 .../onebrc/CalculateAverage_ianopolousfast.java        | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_ianopolousfast.java b/src/main/java/dev/morling/onebrc/CalculateAverage_ianopolousfast.java
index 28f62a4dd..417abcfbe 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_ianopolousfast.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_ianopolousfast.java
@@ -49,9 +49,9 @@ public class CalculateAverage_ianopolousfast {
     public static final int MAX_LINE_LENGTH = 107;
     public static final int MAX_STATIONS = 1 << 14;
     private static final OfLong LONG_LAYOUT = JAVA_LONG_UNALIGNED.withOrder(ByteOrder.BIG_ENDIAN);
-    private static final VectorSpecies<Byte> BYTE_SPECIES = ByteVector.SPECIES_PREFERRED.length() >= 32
-            ? ByteVector.SPECIES_256
-            : ByteVector.SPECIES_128;
+    private static final VectorSpecies<Byte> BYTE_SPECIES = ByteVector.SPECIES_PREFERRED.length() >= 16
+            ? ByteVector.SPECIES_128
+            : ByteVector.SPECIES_64;
 
     public static void main(String[] args) throws Exception {
         Arena arena = Arena.global();
@@ -132,7 +132,7 @@ public static Stat parseStation(long lineStart, MemorySegment buffer, Stat[] sta
         if (keySize <= 8) {
             first8 = maskHighBytes(first8, keySize & 0x07);
         }
-        else if (keySize <= 16) {
+        else if (keySize < 16) {
             second8 = maskHighBytes(buffer.get(LONG_LAYOUT, lineStart + 8), keySize & 0x07);
         }
         else if (keySize == BYTE_SPECIES.vectorByteSize()) {
@@ -182,7 +182,7 @@ private static long parseLine(long lineStart, MemorySegment buffer, Stat[] stati
         if (keySize <= 8) {
             first8 = maskHighBytes(first8, keySize & 0x07);
         }
-        else if (keySize <= 16) {
+        else if (keySize < 16) {
             second8 = maskHighBytes(buffer.get(LONG_LAYOUT, lineStart + 8), keySize & 0x07);
         }
         else if (keySize == BYTE_SPECIES.vectorByteSize()) {

From 9da1660ba5b5ee4b5fa78d12594ad2c652f0e3fc Mon Sep 17 00:00:00 2001
From: Jaromir Hamala <jaromir.hamala@gmail.com>
Date: Sun, 28 Jan 2024 22:43:53 +0100
Subject: [PATCH 179/268] jerrinot - running out of ideas (#631)

* another shameless copycat from thomas: less safepoints

* I have no idea what I am doing
---
 prepare_jerrinot.sh                           |  4 +--
 .../onebrc/CalculateAverage_jerrinot.java     | 33 ++++++++-----------
 2 files changed, 15 insertions(+), 22 deletions(-)

diff --git a/prepare_jerrinot.sh b/prepare_jerrinot.sh
index c36cae32e..58aac6bbd 100755
--- a/prepare_jerrinot.sh
+++ b/prepare_jerrinot.sh
@@ -18,9 +18,7 @@
 source "$HOME/.sdkman/bin/sdkman-init.sh"
 sdk use java 21.0.2-graal 1>&2
 
-# ./mvnw clean verify removes target/ and will re-trigger native image creation.
 if [ ! -f target/CalculateAverage_jerrinot_image ]; then
-    NATIVE_IMAGE_OPTS="--gc=epsilon -O3 -march=native --enable-preview -H:InlineAllBonus=10 -H:-ParseRuntimeOptions --initialize-at-build-time=dev.morling.onebrc.CalculateAverage_jerrinot"
-    # Use -H:MethodFilter=CalculateAverage_jerrinot.* -H:Dump=:2 -H:PrintGraph=Network for IdealGraphVisualizer graph dumping.
+    NATIVE_IMAGE_OPTS="--gc=epsilon -O3 -march=native --enable-preview -H:-GenLoopSafepoints -H:InlineAllBonus=10 --initialize-at-build-time=dev.morling.onebrc.CalculateAverage_jerrinot"
     native-image $NATIVE_IMAGE_OPTS -cp target/average-1.0.0-SNAPSHOT.jar -o target/CalculateAverage_jerrinot_image dev.morling.onebrc.CalculateAverage_jerrinot
 fi
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_jerrinot.java b/src/main/java/dev/morling/onebrc/CalculateAverage_jerrinot.java
index 36e3182e2..df5defe71 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_jerrinot.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_jerrinot.java
@@ -214,7 +214,7 @@ private static class Processor implements Runnable {
         private long slowMap;
         private long slowMapNamesPtr;
         private long slowMapNamesLo;
-        private long fastMap;
+        // private long fastMap;
         private long cursorA;
         private long endA;
         private long cursorB;
@@ -227,7 +227,6 @@ private static class Processor implements Runnable {
 
         // credit: merykitty
         private long parseAndStoreTemperature(long startCursor, long baseEntryPtr, long word) {
-            // long word = UNSAFE.getLong(startCursor);
             long countPtr = baseEntryPtr + MAP_COUNT_OFFSET;
             int cnt = UNSAFE.getInt(countPtr);
             UNSAFE.putInt(countPtr, cnt + 1);
@@ -297,18 +296,18 @@ void accumulateStatus(TreeMap<String, StationStats> accumulator) {
             this.endC = endC;
         }
 
-        private void doTail() {
+        private void doTail(long fastMAp) {
             doOne(cursorA, endA);
             doOne(cursorB, endB);
             doOne(cursorC, endC);
 
-            transferToHeap();
+            transferToHeap(fastMAp);
             // UNSAFE.freeMemory(fastMap);
             // UNSAFE.freeMemory(slowMap);
             // UNSAFE.freeMemory(slowMapNamesLo);
         }
 
-        private void transferToHeap() {
+        private void transferToHeap(long fastMap) {
             for (long baseAddress = slowMap; baseAddress < slowMap + SLOW_MAP_SIZE_BYTES; baseAddress += SLOW_MAP_ENTRY_SIZE_BYTES) {
                 long len = UNSAFE.getInt(baseAddress + MAP_LEN_OFFSET);
                 if (len == 0) {
@@ -398,7 +397,7 @@ public void run() {
             this.slowMap = UNSAFE.allocateMemory(SLOW_MAP_SIZE_BYTES);
             this.slowMapNamesPtr = UNSAFE.allocateMemory(SLOW_MAP_MAP_NAMES_BYTES);
             this.slowMapNamesLo = slowMapNamesPtr;
-            this.fastMap = UNSAFE.allocateMemory(FAST_MAP_SIZE_BYTES);
+            long fastMap = UNSAFE.allocateMemory(FAST_MAP_SIZE_BYTES);
             UNSAFE.setMemory(slowMap, SLOW_MAP_SIZE_BYTES, (byte) 0);
             UNSAFE.setMemory(fastMap, FAST_MAP_SIZE_BYTES, (byte) 0);
             UNSAFE.setMemory(slowMapNamesPtr, SLOW_MAP_MAP_NAMES_BYTES, (byte) 0);
@@ -528,38 +527,38 @@ public void run() {
                         baseEntryPtrA = getOrCreateEntryBaseOffsetSlow(lenA, startA, hashA, maskedLastWordA);
                     }
                     else {
-                        baseEntryPtrA = getOrCreateEntryBaseOffsetFast(mapIndexA, lenA, maskedLastWordA, maskedFirstWordA);
+                        baseEntryPtrA = getOrCreateEntryBaseOffsetFast(mapIndexA, lenA, maskedLastWordA, maskedFirstWordA, fastMap);
                     }
 
                     if (slowB) {
                         baseEntryPtrB = getOrCreateEntryBaseOffsetSlow(lenB, startB, hashB, maskedLastWordB);
                     }
                     else {
-                        baseEntryPtrB = getOrCreateEntryBaseOffsetFast(mapIndexB, lenB, maskedLastWordB, maskedFirstWordB);
+                        baseEntryPtrB = getOrCreateEntryBaseOffsetFast(mapIndexB, lenB, maskedLastWordB, maskedFirstWordB, fastMap);
                     }
 
                     if (slowC) {
                         baseEntryPtrC = getOrCreateEntryBaseOffsetSlow(lenC, startC, hashC, maskedLastWordC);
                     }
                     else {
-                        baseEntryPtrC = getOrCreateEntryBaseOffsetFast(mapIndexC, lenC, maskedLastWordC, maskedFirstWordC);
+                        baseEntryPtrC = getOrCreateEntryBaseOffsetFast(mapIndexC, lenC, maskedLastWordC, maskedFirstWordC, fastMap);
                     }
                 }
                 else {
-                    baseEntryPtrA = getOrCreateEntryBaseOffsetFast(mapIndexA, lenA, maskedLastWordA, maskedFirstWordA);
-                    baseEntryPtrB = getOrCreateEntryBaseOffsetFast(mapIndexB, lenB, maskedLastWordB, maskedFirstWordB);
-                    baseEntryPtrC = getOrCreateEntryBaseOffsetFast(mapIndexC, lenC, maskedLastWordC, maskedFirstWordC);
+                    baseEntryPtrA = getOrCreateEntryBaseOffsetFast(mapIndexA, lenA, maskedLastWordA, maskedFirstWordA, fastMap);
+                    baseEntryPtrB = getOrCreateEntryBaseOffsetFast(mapIndexB, lenB, maskedLastWordB, maskedFirstWordB, fastMap);
+                    baseEntryPtrC = getOrCreateEntryBaseOffsetFast(mapIndexC, lenC, maskedLastWordC, maskedFirstWordC, fastMap);
                 }
 
                 cursorA = parseAndStoreTemperature(digitStartA, baseEntryPtrA, temperatureWordA);
                 cursorB = parseAndStoreTemperature(digitStartB, baseEntryPtrB, temperatureWordB);
                 cursorC = parseAndStoreTemperature(digitStartC, baseEntryPtrC, temperatureWordC);
             }
-            doTail();
+            doTail(fastMap);
             // System.out.println("Longest chain: " + longestChain);
         }
 
-        private long getOrCreateEntryBaseOffsetFast(int mapIndexA, int lenA, long maskedLastWord, long maskedFirstWord) {
+        private static long getOrCreateEntryBaseOffsetFast(int mapIndexA, int lenA, long maskedLastWord, long maskedFirstWord, long fastMap) {
             for (;;) {
                 long basePtr = mapIndexA * FAST_MAP_ENTRY_SIZE_BYTES + fastMap;
                 long namePart1 = UNSAFE.getLong(basePtr + FAST_MAP_NAME_PART1);
@@ -596,7 +595,7 @@ private long getOrCreateEntryBaseOffsetSlow(int lenA, long startPtr, int hash, l
                 int len = UNSAFE.getInt(lenPtr);
                 if (len == lenA) {
                     namePtr = UNSAFE.getLong(basePtr + SLOW_MAP_NAME_OFFSET);
-                    if (nameMatch(startPtr, maskedLastWord, namePtr, fullLen)) {
+                    if (nameMatchSlow(startPtr, namePtr, fullLen, maskedLastWord)) {
                         return basePtr;
                     }
                 }
@@ -614,10 +613,6 @@ else if (len == 0) {
             }
         }
 
-        private static boolean nameMatch(long start, long maskedLastWord, long namePtr, long fullLen) {
-            return nameMatchSlow(start, namePtr, fullLen, maskedLastWord);
-        }
-
         private static boolean nameMatchSlow(long start, long namePtr, long fullLen, long maskedLastWord) {
             long offset;
             for (offset = 0; offset < fullLen; offset += 8) {

From 82197d4482ec8a5086713b4bbad88be6e9b59003 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Aleksey=20Shipil=C3=ABv?= <aleksey@shipilev.net>
Date: Sun, 28 Jan 2024 22:49:34 +0100
Subject: [PATCH 180/268] shipilev: Amendments to version 4 (#627)

* Amendments

* One more locality touchup: no need to carry the entire name array
---
 calculate_average_shipilev.sh                 |   7 +-
 .../onebrc/CalculateAverage_shipilev.java     | 156 +++++++++++-------
 2 files changed, 99 insertions(+), 64 deletions(-)

diff --git a/calculate_average_shipilev.sh b/calculate_average_shipilev.sh
index 5d9f6334c..13a12cd68 100755
--- a/calculate_average_shipilev.sh
+++ b/calculate_average_shipilev.sh
@@ -15,12 +15,11 @@
 #  limitations under the License.
 #
 
-JAVA_OPTS="-XX:+UnlockExperimentalVMOptions -XX:+UseEpsilonGC -Xms64m -Xmx64m -XX:+AlwaysPreTouch -XX:+UseTransparentHugePages
--XX:-TieredCompilation -XX:CICompilerCount=2 -XX:-UseCountedLoopSafepoints -XX:+TrustFinalNonStaticFields
+JAVA_OPTS="-XX:+UnlockExperimentalVMOptions -XX:+UseEpsilonGC -Xms1g -Xmx1g -XX:-AlwaysPreTouch -XX:+UseTransparentHugePages
+-XX:-TieredCompilation -XX:-UseCountedLoopSafepoints -XX:+TrustFinalNonStaticFields -XX:CompileThreshold=2048
 --add-opens java.base/java.nio=ALL-UNNAMED --add-exports java.base/jdk.internal.ref=ALL-UNNAMED
 -XX:+UnlockDiagnosticVMOptions -XX:CompileCommand=quiet
 -XX:CompileCommand=dontinline,dev.morling.onebrc.CalculateAverage_shipilev\$ParsingTask::seqCompute
 -XX:CompileCommand=dontinline,dev.morling.onebrc.CalculateAverage_shipilev\$MeasurementsMap::updateSlow
--XX:CompileCommand=inline,dev.morling.onebrc.CalculateAverage_shipilev::nameMatches
--XX:CompileThreshold=2048"
+-XX:CompileCommand=inline,dev.morling.onebrc.CalculateAverage_shipilev\$Bucket::matches"
 java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_shipilev
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_shipilev.java b/src/main/java/dev/morling/onebrc/CalculateAverage_shipilev.java
index 49989864c..1150f4296 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_shipilev.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_shipilev.java
@@ -102,45 +102,12 @@ public MeasurementsMap get() {
 
     // ========================= MEATY GRITTY PARTS: PARSE AND AGGREGATE =========================
 
-    // Little helper method to compare the array with given bytebuffer range.
-    public static boolean nameMatches(Bucket bucket, ByteBuffer cand, int begin, int end) {
-        byte[] orig = bucket.name;
-        int origLen = orig.length;
-        int candLen = end - begin;
-        if (origLen != candLen) {
-            return false;
-        }
-
-        // Check the tails first, to simplify the matches.
-        if (origLen >= 8) {
-            if (bucket.tail1 != cand.getLong(end - 8)) {
-                return false;
-            }
-            if (origLen >= 16) {
-                if (bucket.tail2 != cand.getLong(end - 16)) {
-                    return false;
-                }
-                origLen -= 16;
-            }
-            else {
-                origLen -= 8;
-            }
-        }
-
-        // Check the rest.
-        for (int i = 0; i < origLen; i++) {
-            if (orig[i] != cand.get(begin + i)) {
-                return false;
-            }
-        }
-        return true;
-    }
-
     public static final class Bucket {
-        // Raw station name, its hash, and tails.
-        public final byte[] name;
+        // Raw station name, its hash, and prefixes.
+        public final byte[] nameTail;
+        public final int len;
         public final int hash;
-        public final long tail1, tail2;
+        public final int prefix1, prefix2;
 
         // Temperature values, in 10x scale.
         public long sum;
@@ -148,10 +115,32 @@ public static final class Bucket {
         public int min;
         public int max;
 
-        public Bucket(byte[] name, long tail1, long tail2, int hash, int temp) {
-            this.name = name;
-            this.tail1 = tail1;
-            this.tail2 = tail2;
+        public Bucket(ByteBuffer slice, int begin, int end, int hash, int temp) {
+            len = end - begin;
+
+            // Also pick up any prefixes to simplify future matches.
+            int tailStart = 0;
+            if (len >= 8) {
+                prefix1 = slice.getInt(begin + 0);
+                prefix2 = slice.getInt(begin + 4);
+                tailStart += 8;
+            }
+            else if (len >= 4) {
+                prefix1 = slice.getInt(begin + 0);
+                prefix2 = 0;
+                tailStart += 4;
+            }
+            else {
+                prefix1 = 0;
+                prefix2 = 0;
+            }
+
+            // The rest goes to the tail byte array. We are checking names on the hot path.
+            // Therefore, it is convenient to keep allocation for names near the buckets.
+            int tailLen = len - tailStart;
+            nameTail = new byte[tailLen];
+            slice.get(begin + tailStart, nameTail, 0, tailLen);
+
             this.hash = hash;
             this.sum = temp;
             this.count = 1;
@@ -159,6 +148,48 @@ public Bucket(byte[] name, long tail1, long tail2, int hash, int temp) {
             this.max = temp;
         }
 
+        // Little helper method to compare the array with given bytebuffer range.
+        public boolean matches(ByteBuffer cand, int begin, int end) {
+            int origLen = len;
+            int candLen = end - begin;
+            if (origLen != candLen) {
+                return false;
+            }
+
+            // Check the prefixes first, to simplify the matches.
+            int tailStart = 0;
+            if (origLen >= 8) {
+                if (prefix1 != cand.getInt(begin)) {
+                    return false;
+                }
+                if (prefix2 != cand.getInt(begin + 4)) {
+                    return false;
+                }
+                tailStart += 8;
+            }
+            else if (origLen >= 4) {
+                if (prefix1 != cand.getInt(begin)) {
+                    return false;
+                }
+                tailStart += 4;
+            }
+
+            // Check the rest.
+            for (int i = 0; i < origLen - tailStart; i++) {
+                if (nameTail[i] != cand.get(begin + tailStart + i)) {
+                    return false;
+                }
+            }
+            return true;
+        }
+
+        public boolean matches(Bucket other) {
+            return len == other.len &&
+                    prefix1 == other.prefix1 &&
+                    prefix2 == other.prefix2 &&
+                    Arrays.equals(nameTail, other.nameTail);
+        }
+
         public void merge(int value) {
             sum += value;
             count++;
@@ -178,8 +209,19 @@ public void merge(Bucket s) {
         }
 
         public Row toRow() {
+            // Reconstruct the name
+            ByteBuffer bb = ByteBuffer.allocate(len);
+            bb.order(ByteOrder.LITTLE_ENDIAN);
+            if (len >= 4) {
+                bb.putInt(prefix1);
+            }
+            if (len >= 8) {
+                bb.putInt(prefix2);
+            }
+            bb.put(nameTail);
+
             return new Row(
-                    new String(name),
+                    new String(Arrays.copyOf(bb.array(), len)),
                     Math.round((double) min) / 10.0,
                     Math.round((double) sum / count) / 10.0,
                     Math.round((double) max) / 10.0);
@@ -205,21 +247,11 @@ private void updateSlow(ByteBuffer name, int begin, int end, int hash, int temp)
             while (true) {
                 Bucket cur = buckets[idx];
                 if (cur == null) {
-                    // No bucket yet, lucky us. Lookup the name and create the bucket with it.
-                    // We are checking the names on hot-path. Therefore, it is convenient
-                    // to keep allocation for names near the buckets.
-                    int len = end - begin;
-                    byte[] copy = new byte[len];
-                    name.get(begin, copy, 0, len);
-
-                    // Also pick up any tail to simplify future matches.
-                    long tail1 = (len < 8) ? 0 : name.getLong(begin + len - 8);
-                    long tail2 = (len < 16) ? 0 : name.getLong(begin + len - 16);
-
-                    buckets[idx] = new Bucket(copy, tail1, tail2, hash, temp);
+                    // No bucket yet, lucky us. Create the bucket with it.
+                    buckets[idx] = new Bucket(name, begin, end, hash, temp);
                     return;
                 }
-                else if ((cur.hash == hash) && nameMatches(cur, name, begin, end)) {
+                else if ((cur.hash == hash) && cur.matches(name, begin, end)) {
                     // Same as bucket fastpath. Check for collision by checking the full hash
                     // first (since the index is truncated by map size), and then the exact name.
                     cur.merge(temp);
@@ -244,7 +276,7 @@ public void merge(MeasurementsMap otherMap) {
                         buckets[idx] = other;
                         break;
                     }
-                    else if ((cur.hash == other.hash) && Arrays.equals(cur.name, other.name)) {
+                    else if ((cur.hash == other.hash) && cur.matches(other)) {
                         cur.merge(other);
                         break;
                     }
@@ -425,7 +457,7 @@ private void seqCompute(MeasurementsMap map, ByteBuffer origSlice, int length) t
 
                 // Time to update!
                 Bucket bucket = buckets[nameHash & (MAP_SIZE - 1)];
-                if ((bucket != null) && (nameHash == bucket.hash) && nameMatches(bucket, slice, nameBegin, nameEnd)) {
+                if ((bucket != null) && (nameHash == bucket.hash) && bucket.matches(slice, nameBegin, nameEnd)) {
                     // Lucky fast path, existing bucket hit. Most of the time we complete here.
                     bucket.merge(temp);
                 }
@@ -447,8 +479,8 @@ private void seqCompute(MeasurementsMap map, ByteBuffer origSlice, int length) t
     // a given mmap slice, while there is still other work to do. This allows
     // us to unmap slices on the go.
     public static final class RootTask extends CountedCompleter<Void> {
-        public RootTask(CountedCompleter<Void> parent) {
-            super(parent);
+        public RootTask() {
+            super(null);
         }
 
         @Override
@@ -517,8 +549,12 @@ public void onCompletion(CountedCompleter<?> caller) {
     // ========================= Invocation =========================
 
     public static void main(String[] args) throws Exception {
+        // Instantiate a separate FJP to match the parallelism accurately, without
+        // relying on common pool defaults.
+        ForkJoinPool pool = new ForkJoinPool(Runtime.getRuntime().availableProcessors());
+
         // This little line carries the whole world
-        new RootTask(null).fork();
+        pool.submit(new RootTask());
 
         // While the root task is working, prepare what we need for the
         // end of the run. Go and try to report something to prepare the

From 99367dbc50d0eaa436a3ef89c5646a367b298404 Mon Sep 17 00:00:00 2001
From: Gunnar Morling <gunnar.morling@googlemail.com>
Date: Sun, 28 Jan 2024 22:52:12 +0100
Subject: [PATCH 181/268] Update pull_request_template.md

---
 .github/pull_request_template.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md
index 6f71c4517..d64530206 100644
--- a/.github/pull_request_template.md
+++ b/.github/pull_request_template.md
@@ -1,4 +1,6 @@
 #### Check List:
+
+- [ ] You have run `.mvnv verify` and the project builds successfully
 - [ ] Tests pass (`./test.sh <username>` shows no differences between expected and actual outputs)
 - [ ] All formatting changes by the build are committed
 - [ ] Your launch script is named `calculate_average_<username>.sh` (make sure to match casing of your GH user name) and is executable

From 87f3b7170183f329bc060bb92d0337e7057336d6 Mon Sep 17 00:00:00 2001
From: Gunnar Morling <gunnar.morling@googlemail.com>
Date: Sun, 28 Jan 2024 22:52:29 +0100
Subject: [PATCH 182/268] Update pull_request_template.md

---
 .github/pull_request_template.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md
index d64530206..02eddf5ab 100644
--- a/.github/pull_request_template.md
+++ b/.github/pull_request_template.md
@@ -1,6 +1,6 @@
 #### Check List:
 
-- [ ] You have run `.mvnv verify` and the project builds successfully
+- [ ] You have run `.mvnw verify` and the project builds successfully
 - [ ] Tests pass (`./test.sh <username>` shows no differences between expected and actual outputs)
 - [ ] All formatting changes by the build are committed
 - [ ] Your launch script is named `calculate_average_<username>.sh` (make sure to match casing of your GH user name) and is executable

From 99a754d0cd20a4b535e2dd5774b688726db5525a Mon Sep 17 00:00:00 2001
From: Gunnar Morling <gunnar.morling@googlemail.com>
Date: Sun, 28 Jan 2024 22:52:44 +0100
Subject: [PATCH 183/268] Update pull_request_template.md

---
 .github/pull_request_template.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md
index 02eddf5ab..9b55c8f63 100644
--- a/.github/pull_request_template.md
+++ b/.github/pull_request_template.md
@@ -1,6 +1,6 @@
 #### Check List:
 
-- [ ] You have run `.mvnw verify` and the project builds successfully
+- [ ] You have run `./mvnw verify` and the project builds successfully
 - [ ] Tests pass (`./test.sh <username>` shows no differences between expected and actual outputs)
 - [ ] All formatting changes by the build are committed
 - [ ] Your launch script is named `calculate_average_<username>.sh` (make sure to match casing of your GH user name) and is executable

From 9282fb7b0a235a53416893e7532796bf10219b35 Mon Sep 17 00:00:00 2001
From: Artsiom Korzun <72259616+artsiomkorzun@users.noreply.github.com>
Date: Sun, 28 Jan 2024 22:56:33 +0100
Subject: [PATCH 184/268] processing three at once (#626)

---
 .../CalculateAverage_artsiomkorzun.java       | 162 ++++++++++--------
 1 file changed, 94 insertions(+), 68 deletions(-)

diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java b/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java
index bb2198f5d..2a1a387d5 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java
@@ -34,7 +34,6 @@ public class CalculateAverage_artsiomkorzun {
 
     private static final Path FILE = Path.of("./measurements.txt");
     private static final long SEGMENT_SIZE = 4 * 1024 * 1024;
-    private static final long SEGMENT_OVERLAP = 128;
     private static final long COMMA_PATTERN = 0x3B3B3B3B3B3B3B3BL;
     private static final long DOT_BITS = 0x10101000;
     private static final long MAGIC_MULTIPLIER = (100 * 0x1000000 + 10 * 0x10000 + 1);
@@ -363,15 +362,19 @@ public void run() {
 
             for (int segment; (segment = counter.getAndIncrement()) < segmentCount;) {
                 long position = SEGMENT_SIZE * segment;
-                long size = Math.min(SEGMENT_SIZE + SEGMENT_OVERLAP, fileSize - position);
-                long address = fileAddress + position;
-                long limit = address + Math.min(SEGMENT_SIZE, size - 1);
+                long size = Math.min(SEGMENT_SIZE, fileSize - position - 1);
+                long start = fileAddress + position;
+                long end = start + size;
 
                 if (segment > 0) {
-                    address = next(address);
+                    start = next(start);
                 }
 
-                aggregate(aggregates, address, limit);
+                long chunk = (end - start) / 3;
+                long left = next(start + chunk);
+                long right = next(start + chunk + chunk);
+
+                aggregate(aggregates, start, left - 1, left, right - 1, right, end);
             }
 
             while (!result.compareAndSet(null, aggregates)) {
@@ -390,94 +393,117 @@ private static long next(long position) {
             return position;
         }
 
-        private static void aggregate(Aggregates aggregates, long position, long limit) {
-            // this parsing can produce seg fault at page boundaries
-            // e.g. file size is 4096 and the last entry is X=0.0, which is less than 8 bytes
-            // as a result a read will be split across pages, where one of them is not mapped
-            // but for some reason it works on my machine, leaving to investigate
+        private static void aggregate(Aggregates aggregates, long position1, long limit1, long position2, long limit2, long position3, long limit3) {
+            while (position1 <= limit1 && position2 <= limit2 && position3 <= limit3) {
+                long word1 = word(position1);
+                long word2 = word(position2);
+                long word3 = word(position3);
+
+                long separator1 = separator(word1);
+                long separator2 = separator(word2);
+                long separator3 = separator(word3);
+
+                position1 = process(aggregates, position1, word1, separator1);
+                position2 = process(aggregates, position2, word2, separator2);
+                position3 = process(aggregates, position3, word3, separator3);
+            }
+
+            while (position1 <= limit1) {
+                long word1 = word(position1);
+                long separator1 = separator(word1);
+                position1 = process(aggregates, position1, word1, separator1);
+            }
+
+            while (position2 <= limit2) {
+                long word2 = word(position2);
+                long separator2 = separator(word2);
+                position2 = process(aggregates, position2, word2, separator2);
+            }
+
+            while (position3 <= limit3) {
+                long word3 = word(position3);
+                long separator3 = separator(word3);
+                position3 = process(aggregates, position3, word3, separator3);
+            }
+        }
+
+        private static long process(Aggregates aggregates, long position, long word, long separator) {
+            long end = position;
+
+            int length;
+            int hash;
+            int value;
+
+            if (separator != 0) {
+                length = length(separator);
+                word = mask(word, separator);
+                hash = mix(word);
+                end += length;
 
-            while (position <= limit) { // branchy version, credit: thomaswue
-                int length;
-                int hash;
-                int value;
+                long num = word(end);
+                int dot = dot(num);
+                value = value(num, dot);
+                end += (dot >> 3) + 3;
+                long pointer = aggregates.find(word, hash);
 
-                long word = word(position);
-                long separator = separator(word);
-                long end = position;
+                if (pointer != 0) {
+                    Aggregates.update(pointer, value);
+                    return end;
+                }
+            }
+            else {
+                long word0 = word;
+                word = word(end + 8);
+                separator = separator(word);
 
                 if (separator != 0) {
-                    length = length(separator);
+                    length = length(separator) + 8;
                     word = mask(word, separator);
-                    hash = mix(word);
+                    hash = mix(word ^ word0);
                     end += length;
 
                     long num = word(end);
                     int dot = dot(num);
                     value = value(num, dot);
                     end += (dot >> 3) + 3;
-                    long ptr = aggregates.find(word, hash);
+                    long pointer = aggregates.find(word0, word, hash);
 
-                    if (ptr != 0) {
-                        Aggregates.update(ptr, value);
-                        position = end;
-                        continue;
+                    if (pointer != 0) {
+                        Aggregates.update(pointer, value);
+                        return end;
                     }
                 }
                 else {
-                    long word0 = word;
-                    word = word(position + 8);
-                    separator = separator(word);
+                    length = 16;
+                    long h = word ^ word0;
 
-                    if (separator != 0) {
-                        length = length(separator) + 8;
+                    while (true) {
+                        word = word(end + length);
+                        separator = separator(word);
+
+                        if (separator == 0) {
+                            length += 8;
+                            h ^= word;
+                            continue;
+                        }
+
+                        length += length(separator);
                         word = mask(word, separator);
-                        hash = mix(word ^ word0);
+                        hash = mix(h ^ word);
                         end += length;
 
                         long num = word(end);
                         int dot = dot(num);
                         value = value(num, dot);
                         end += (dot >> 3) + 3;
-                        long ptr = aggregates.find(word0, word, hash);
-
-                        if (ptr != 0) {
-                            Aggregates.update(ptr, value);
-                            position = end;
-                            continue;
-                        }
-                    }
-                    else {
-                        length = 16;
-                        long h = word ^ word0;
-
-                        while (true) {
-                            word = word(position + length);
-                            separator = separator(word);
-
-                            if (separator == 0) {
-                                length += 8;
-                                h ^= word;
-                                continue;
-                            }
-
-                            length += length(separator);
-                            word = mask(word, separator);
-                            hash = mix(h ^ word);
-                            end += length;
-
-                            long num = word(end);
-                            int dot = dot(num);
-                            value = value(num, dot);
-                            end += (dot >> 3) + 3;
-                            break;
-                        }
+                        break;
                     }
                 }
-
-                long ptr = aggregates.put(position, word, length, hash);
-                Aggregates.update(ptr, value);
-                position = end;
             }
+
+            long pointer = aggregates.put(position, word, length, hash);
+            Aggregates.update(pointer, value);
+            return end;
         }
 
         private static long separator(long word) {

From ff35a4628b7dd5d0b5050345d9de8840d2b3b053 Mon Sep 17 00:00:00 2001
From: Andrzej Nestoruk <and.nestoruk@gmail.com>
Date: Sun, 28 Jan 2024 22:59:04 +0100
Subject: [PATCH 185/268] anestoruk second attempt (#625)

* initial implementation

* few improvements and a cleanup (down to ~12s)

* use array instead of hashmap for collecting partial results
---
 .../onebrc/CalculateAverage_anestoruk.java    | 120 +++++++++---------
 1 file changed, 61 insertions(+), 59 deletions(-)

diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_anestoruk.java b/src/main/java/dev/morling/onebrc/CalculateAverage_anestoruk.java
index add938fca..293087e2f 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_anestoruk.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_anestoruk.java
@@ -22,9 +22,7 @@
 import java.nio.file.Path;
 import java.util.ArrayList;
 import java.util.Arrays;
-import java.util.HashMap;
 import java.util.List;
-import java.util.Map;
 import java.util.TreeMap;
 import java.util.concurrent.CompletableFuture;
 import java.util.concurrent.ExecutionException;
@@ -69,14 +67,14 @@ public static void main(String[] args) throws IOException {
 
         TreeMap<String, Record> result = new TreeMap<>();
         try (ExecutorService executor = Executors.newFixedThreadPool(cpus)) {
-            List<CompletableFuture<Map<ByteWrapper, Record>>> futures = new ArrayList<>();
+            List<CompletableFuture<Record[]>> futures = new ArrayList<>();
             for (SegmentRange range : rangeList) {
                 futures.add(supplyAsync(() -> process(range, segment), executor));
             }
-            for (CompletableFuture<Map<ByteWrapper, Record>> future : futures) {
+            for (CompletableFuture<Record[]> future : futures) {
                 try {
-                    Map<ByteWrapper, Record> partialResult = future.get();
-                    combine(result, partialResult);
+                    Record[] partialResult = future.get();
+                    mergeResult(result, partialResult);
                 }
                 catch (InterruptedException | ExecutionException ex) {
                     throw new RuntimeException(ex);
@@ -87,20 +85,19 @@ public static void main(String[] args) throws IOException {
         System.out.println(result);
     }
 
-    private static Map<ByteWrapper, Record> process(SegmentRange range, MemorySegment segment) {
-        Map<ByteWrapper, Record> partialResult = new HashMap<>(1_000);
-        byte[] buffer = new byte[100];
+    private static Record[] process(SegmentRange range, MemorySegment segment) {
+        Record[] records = new Record[1024 * 100];
+        byte[] cityBuffer = new byte[100];
         long offset = range.startOffset;
         byte b;
         while (offset < range.endOffset) {
-            int cityIdx = 0;
+            int cityLength = 0;
+            int hash = 0;
             while ((b = segment.get(JAVA_BYTE, offset++)) != ';') {
-                buffer[cityIdx++] = b;
+                cityBuffer[cityLength++] = b;
+                hash = hash * 31 + b;
             }
-            byte[] city = new byte[cityIdx];
-            System.arraycopy(buffer, 0, city, 0, cityIdx);
-            ByteWrapper cityWrapper = new ByteWrapper(city);
-
+            hash = Math.abs(hash);
             int value = 0;
             boolean negative;
             if ((b = segment.get(JAVA_BYTE, offset++)) == '-') {
@@ -116,45 +113,77 @@ private static Map<ByteWrapper, Record> process(SegmentRange range, MemorySegmen
                 }
             }
             int temperature = negative ? -value : value;
-
-            partialResult.compute(cityWrapper, (_, record) -> update(record, temperature));
+            byte[] city = new byte[cityLength];
+            System.arraycopy(cityBuffer, 0, city, 0, cityLength);
+            addResult(records, hash, city, temperature);
         }
-        return partialResult;
+        return records;
     }
 
-    private record SegmentRange(long startOffset, long endOffset) {
+    private static void addResult(Record[] records, int hash, byte[] city, int temperature) {
+        int idx = hash % records.length;
+        Record record;
+        while ((record = records[idx]) != null) {
+            if (record.hash == hash && Arrays.equals(record.city, city)) {
+                record.add(temperature);
+                return;
+            }
+            idx = (idx + 1) % records.length;
+        }
+        records[idx] = new Record(hash, city, temperature);
     }
 
-    private record ByteWrapper(byte[] bytes) {
-
-        @Override
-        public boolean equals(Object o) {
-            if (this == o) return true;
-            if (o == null || getClass() != o.getClass()) return false;
-            ByteWrapper that = (ByteWrapper) o;
-            return Arrays.equals(bytes, that.bytes);
+    private static void mergeResult(TreeMap<String, Record> result, Record[] partialResult) {
+        for (Record partialRecord : partialResult) {
+            if (partialRecord == null) {
+                continue;
+            }
+            String cityName = new String(partialRecord.city, UTF_8);
+            result.compute(cityName, (_, record) -> {
+                if (record == null) {
+                    return partialRecord;
+                }
+                record.merge(partialRecord);
+                return record;
+            });
         }
+    }
 
-        @Override
-        public int hashCode() {
-            return Arrays.hashCode(bytes);
-        }
+    private record SegmentRange(long startOffset, long endOffset) {
     }
 
     private static class Record {
 
+        private final int hash;
+        private final byte[] city;
         private int min;
         private int max;
         private long sum;
         private int count;
 
-        public Record(int temperature) {
+        public Record(int hash, byte[] city, int temperature) {
+            this.hash = hash;
+            this.city = city;
             this.min = temperature;
             this.max = temperature;
             this.sum = temperature;
             this.count = 1;
         }
 
+        public void add(int temperature) {
+            min = min(min, temperature);
+            max = max(max, temperature);
+            sum += temperature;
+            count++;
+        }
+
+        public void merge(Record other) {
+            min = min(min, other.min);
+            max = max(max, other.max);
+            sum += other.sum;
+            count += other.count;
+        }
+
         @Override
         public String toString() {
             return "%.1f/%.1f/%.1f".formatted(
@@ -163,31 +192,4 @@ public String toString() {
                     (max / 10.0));
         }
     }
-
-    private static Record update(Record record, int temperature) {
-        if (record == null) {
-            return new Record(temperature);
-        }
-        record.min = min(record.min, temperature);
-        record.max = max(record.max, temperature);
-        record.sum += temperature;
-        record.count++;
-        return record;
-    }
-
-    private static void combine(TreeMap<String, Record> result, Map<ByteWrapper, Record> partialResult) {
-        partialResult.forEach((wrapper, partialRecord) -> {
-            String city = new String(wrapper.bytes, UTF_8);
-            result.compute(city, (_, record) -> {
-                if (record == null) {
-                    return partialRecord;
-                }
-                record.min = min(record.min, partialRecord.min);
-                record.max = max(record.max, partialRecord.max);
-                record.sum += partialRecord.sum;
-                record.count += partialRecord.count;
-                return record;
-            });
-        });
-    }
 }

From 46d375e621d1ae286058407fc38d55600fc679eb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Serkan=20=C3=96ZAL?= <sozal@catchpoint.com>
Date: Mon, 29 Jan 2024 01:02:01 +0300
Subject: [PATCH 186/268] serkan-ozal's 3rd submission with some minor
 improvements: (#615)

- faster merge by ignoring empty entries in the map
- enable CDS for faster startup (added `prepare_serkan-ozal.sh` to generate CDS archive in advance)
- some tweaks with JVM options
- optimized result printing
---
 calculate_average_serkan-ozal.sh              |  7 ++-
 prepare_serkan-ozal.sh                        | 42 +++++++++++++++
 .../onebrc/CalculateAverage_serkan_ozal.java  | 54 +++++++++++++------
 3 files changed, 85 insertions(+), 18 deletions(-)
 create mode 100755 prepare_serkan-ozal.sh

diff --git a/calculate_average_serkan-ozal.sh b/calculate_average_serkan-ozal.sh
index 857979b27..c075fc20b 100755
--- a/calculate_average_serkan-ozal.sh
+++ b/calculate_average_serkan-ozal.sh
@@ -15,10 +15,13 @@
 #  limitations under the License.
 #
 
-JAVA_OPTS="--enable-preview --enable-native-access=ALL-UNNAMED --add-modules=jdk.incubator.vector"
+JAVA_OPTS="--enable-preview --enable-native-access=ALL-UNNAMED --add-modules=jdk.incubator.vector "
+JAVA_OPTS="$JAVA_OPTS -XX:+UnlockExperimentalVMOptions -XX:+UnlockDiagnosticVMOptions"
 JAVA_OPTS="$JAVA_OPTS -XX:-TieredCompilation -XX:MaxInlineSize=10000 -XX:InlineSmallCode=10000 -XX:FreqInlineSize=10000"
+JAVA_OPTS="$JAVA_OPTS -XX:-UseCountedLoopSafepoints -XX:GuaranteedSafepointInterval=0"
+JAVA_OPTS="$JAVA_OPTS -XX:+TrustFinalNonStaticFields -da -dsa -XX:+UseNUMA -XX:-EnableJVMCI"
+JAVA_OPTS="$JAVA_OPTS -XX:SharedArchiveFile=target/CalculateAverage_serkan_ozal_cds.jsa"
 JAVA_OPTS="$JAVA_OPTS -Djdk.incubator.vector.VECTOR_ACCESS_OOB_CHECK=0"
-#JAVA_OPTS="$JAVA_OPTS -XX:+UnlockExperimentalVMOptions -XX:+UseEpsilonGC -Xms256m -Xmx256m -XX:+AlwaysPreTouch"
 if [[ ! "$(uname -s)" = "Darwin" ]]; then
   JAVA_OPTS="$JAVA_OPTS -XX:+UseTransparentHugePages"
 fi
diff --git a/prepare_serkan-ozal.sh b/prepare_serkan-ozal.sh
new file mode 100755
index 000000000..75df48a5b
--- /dev/null
+++ b/prepare_serkan-ozal.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+source "$HOME/.sdkman/bin/sdkman-init.sh"
+sdk use java 21.0.1-open 1>&2
+
+JAVA_OPTS="--enable-preview --enable-native-access=ALL-UNNAMED --add-modules=jdk.incubator.vector "
+JAVA_OPTS="$JAVA_OPTS -XX:+UnlockExperimentalVMOptions -XX:+UnlockDiagnosticVMOptions"
+JAVA_OPTS="$JAVA_OPTS -XX:-TieredCompilation -XX:MaxInlineSize=10000 -XX:InlineSmallCode=10000 -XX:FreqInlineSize=10000"
+JAVA_OPTS="$JAVA_OPTS -XX:-UseCountedLoopSafepoints -XX:GuaranteedSafepointInterval=0"
+JAVA_OPTS="$JAVA_OPTS -XX:+TrustFinalNonStaticFields -da -dsa -XX:+UseNUMA -XX:-EnableJVMCI"
+JAVA_OPTS="$JAVA_OPTS -Djdk.incubator.vector.VECTOR_ACCESS_OOB_CHECK=0"
+JAVA_OPTS="${JAVA_OPTS} -Dfile.path=src/test/resources/samples/measurements-10000-unique-keys.txt"
+if [[ ! "$(uname -s)" = "Darwin" ]]; then
+  JAVA_OPTS="$JAVA_OPTS -XX:+UseTransparentHugePages"
+fi
+
+# Set configs
+export USE_SHARED_ARENA=true
+export USE_SHARED_REGION=true
+export CLOSE_STDOUT_ON_RESULT=true
+
+CLASS_NAME="CalculateAverage_serkan_ozal"
+
+# Create CDS archive
+java ${JAVA_OPTS} -Xshare:off -XX:DumpLoadedClassList=target/${CLASS_NAME}.classlist --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.${CLASS_NAME}
+java ${JAVA_OPTS} -Xshare:dump -XX:SharedClassListFile=target/${CLASS_NAME}.classlist -XX:SharedArchiveFile=target/${CLASS_NAME}.jsa --class-path target/average-1.0.0-SNAPSHOT.jar
+java ${JAVA_OPTS} -Xshare:on -XX:SharedArchiveFile=target/${CLASS_NAME}.jsa -XX:ArchiveClassesAtExit=target/${CLASS_NAME}_cds.jsa --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.${CLASS_NAME}
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_serkan_ozal.java b/src/main/java/dev/morling/onebrc/CalculateAverage_serkan_ozal.java
index 8087919a6..0ca1fe7ee 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_serkan_ozal.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_serkan_ozal.java
@@ -30,6 +30,7 @@
 import java.nio.channels.FileChannel;
 import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.List;
 import java.util.Map;
 import java.util.TreeMap;
@@ -47,7 +48,7 @@
  */
 public class CalculateAverage_serkan_ozal {
 
-    private static final String FILE = "./measurements.txt";
+    private static final String FILE = System.getProperty("file.path", "./measurements.txt");
 
     private static final VectorSpecies<Byte> BYTE_SPECIES = ByteVector.SPECIES_PREFERRED.length() >= 16
             // Since majority (99%) of the city names <= 16 bytes, according to my experiments,
@@ -327,7 +328,7 @@ private void processRegion() throws Exception {
 
         private void doProcessRegion(MemorySegment region, long regionAddress, long regionStart, long regionEnd) {
             final int vectorSize = BYTE_SPECIES.vectorByteSize();
-            final long regionMainLimit = regionEnd - MAX_LINE_LENGTH;
+            final long regionMainLimit = regionEnd - BYTE_SPECIES_SIZE;
 
             long regionPtr;
 
@@ -515,7 +516,20 @@ private void mergeInto(OpenMap map) {
         }
 
         private void print() {
-            System.out.println(resultMap);
+            StringBuilder sb = new StringBuilder(1 << 14);
+            boolean firstEntryAppended = false;
+            sb.append("{");
+            for (Map.Entry<String, KeyResult> e : resultMap.entrySet()) {
+                if (firstEntryAppended) {
+                    sb.append(", ");
+                }
+                String key = e.getKey();
+                KeyResult value = e.getValue();
+                sb.append(key).append("=").append(value);
+                firstEntryAppended = true;
+            }
+            sb.append('}');
+            System.out.println(sb);
         }
 
     }
@@ -546,8 +560,12 @@ private static final class OpenMap {
         private static final int ENTRY_HASH_MASK = MAP_CAPACITY - 1;
         private static final int MAP_SIZE = ENTRY_SIZE * MAP_CAPACITY;
         private static final int ENTRY_MASK = MAP_SIZE - 1;
+        private static final int KEY_ARRAY_OFFSET = KEY_OFFSET - Unsafe.ARRAY_BYTE_BASE_OFFSET;
 
         private final byte[] data;
+        // Max number of unique keys are 10K, so 1 << 14 (16384) is long enough to hold offsets for all of them
+        private final long[] entryOffsets = new long[1 << 14];
+        private int entryOffsetIdx = 0;
 
         private OpenMap() {
             this.data = new byte[MAP_SIZE];
@@ -579,7 +597,6 @@ private long putKey(ByteVector keyVector, long keyStartAddress, int keyLength) {
             // and continue until find an available slot in case of hash collision
             // TODO Prevent infinite loop if all the slots are in use for other keys
             for (long entryOffset = Unsafe.ARRAY_BYTE_BASE_OFFSET + (idx * ENTRY_SIZE);; entryOffset = (entryOffset + ENTRY_SIZE) & ENTRY_MASK) {
-                int keyStartOffset = (int) entryOffset + KEY_OFFSET;
                 int keySize = U.getInt(data, entryOffset + KEY_SIZE_OFFSET);
                 // Check whether current index is empty (no another key is inserted yet)
                 if (keySize == 0) {
@@ -587,26 +604,28 @@ private long putKey(ByteVector keyVector, long keyStartAddress, int keyLength) {
                     U.putShort(data, entryOffset + MIN_VALUE_OFFSET, Short.MAX_VALUE);
                     U.putShort(data, entryOffset + MAX_VALUE_OFFSET, Short.MIN_VALUE);
                     U.putInt(data, entryOffset + KEY_SIZE_OFFSET, keyLength);
-                    U.copyMemory(null, keyStartAddress, data, keyStartOffset, keyLength);
+                    U.copyMemory(null, keyStartAddress, data, entryOffset + KEY_OFFSET, keyLength);
+                    entryOffsets[entryOffsetIdx++] = entryOffset;
                     return entryOffset;
                 }
+                int keyStartArrayOffset = (int) entryOffset + KEY_ARRAY_OFFSET;
                 // Check for hash collision (hashes are same, but keys are different).
                 // If there is no collision (both hashes and keys are equals), return current slot's offset.
                 // Otherwise, continue iterating until find an available slot.
-                if (keySize == keyLength && keysEqual(keyVector, keyStartAddress, keyLength, keyStartOffset)) {
+                if (keySize == keyLength && keysEqual(keyVector, keyStartAddress, keyLength, keyStartArrayOffset)) {
                     return entryOffset;
                 }
             }
         }
 
-        private boolean keysEqual(ByteVector keyVector, long keyStartAddress, int keyLength, int keyStartOffset) {
+        private boolean keysEqual(ByteVector keyVector, long keyStartAddress, int keyLength, int keyStartArrayOffset) {
             int keyCheckIdx = 0;
             if (keyVector != null) {
                 // Use vectorized search for the comparison of keys.
                 // Since majority of the city names >= 8 bytes and <= 16 bytes,
                 // this way is more efficient (according to my experiments) than any other comparisons (byte by byte or 2 longs).
                 int keyCheckLength = Math.min(BYTE_SPECIES_SIZE, keyLength);
-                ByteVector entryKeyVector = ByteVector.fromArray(BYTE_SPECIES, data, keyStartOffset - Unsafe.ARRAY_BYTE_BASE_OFFSET);
+                ByteVector entryKeyVector = ByteVector.fromArray(BYTE_SPECIES, data, keyStartArrayOffset);
                 long eqMask = keyVector.compare(VectorOperators.EQ, entryKeyVector).toLong();
                 int eqCount = Long.numberOfTrailingZeros(~eqMask);
                 if (eqCount < keyCheckLength) {
@@ -625,6 +644,7 @@ private boolean keysEqual(ByteVector keyVector, long keyStartAddress, int keyLen
                 normalizedKeyLength = Integer.reverseBytes(normalizedKeyLength);
             }
 
+            long keyStartOffset = keyStartArrayOffset + Unsafe.ARRAY_BYTE_BASE_OFFSET;
             int alignedKeyLength = normalizedKeyLength & 0xFFFFFFF8;
             int i;
             for (i = keyCheckIdx; i < alignedKeyLength; i += Long.BYTES) {
@@ -663,18 +683,20 @@ private void putValue(long entryOffset, int value) {
 
         private void merge(Map<String, KeyResult> resultMap) {
             // Merge this local map into global result map
-            for (int i = 0; i < MAP_SIZE; i += ENTRY_SIZE) {
-                int baseOffset = Unsafe.ARRAY_BYTE_BASE_OFFSET + i;
-                int keyLength = U.getInt(data, baseOffset + KEY_SIZE_OFFSET);
+            Arrays.sort(entryOffsets, 0, entryOffsetIdx);
+            for (int i = 0; i < entryOffsetIdx; i++) {
+                long entryOffset = entryOffsets[i];
+                int keyLength = U.getInt(data, entryOffset + KEY_SIZE_OFFSET);
                 if (keyLength == 0) {
                     // No entry is available for this index, so continue iterating
                     continue;
                 }
-                String key = new String(data, i + KEY_OFFSET, keyLength, StandardCharsets.UTF_8);
-                int count = U.getInt(data, baseOffset + COUNT_OFFSET);
-                short minValue = U.getShort(data, baseOffset + MIN_VALUE_OFFSET);
-                short maxValue = U.getShort(data, baseOffset + MAX_VALUE_OFFSET);
-                long sum = U.getLong(data, baseOffset + VALUE_SUM_OFFSET);
+                int entryArrayIdx = (int) (entryOffset + KEY_OFFSET - Unsafe.ARRAY_BYTE_BASE_OFFSET);
+                String key = new String(data, entryArrayIdx, keyLength, StandardCharsets.UTF_8);
+                int count = U.getInt(data, entryOffset + COUNT_OFFSET);
+                short minValue = U.getShort(data, entryOffset + MIN_VALUE_OFFSET);
+                short maxValue = U.getShort(data, entryOffset + MAX_VALUE_OFFSET);
+                long sum = U.getLong(data, entryOffset + VALUE_SUM_OFFSET);
                 KeyResult result = new KeyResult(count, minValue, maxValue, sum);
                 KeyResult existingResult = resultMap.get(key);
                 if (existingResult == null) {

From be5b3318b150243c0ea24def82c2b8d6f3f1e06d Mon Sep 17 00:00:00 2001
From: giovannicuccu <giovanni.cuccu@gmail.com>
Date: Sun, 28 Jan 2024 23:24:47 +0100
Subject: [PATCH 187/268] Solution without unsafe using vector API (#602)

* Solution without unsafe

* Solution without unsafe

* Solution without unsafe, remove the usage of bytebuffer, passes the create_measurements3 test

* Bug fix for the 10k test; also update CreateMeasurements3.java to use '\n' as the newline instead of the OS default (on Windows it writes CRLF, which "breaks" the file format)

---------

Co-authored-by: Giovanni Cuccu <gcuccu@imolainformatica.it>
---
 calculate_average_giovannicuccu.sh            |   2 +-
 github_users.txt                              |   1 +
 prepare_giovannicuccu.sh                      |   0
 .../CalculateAverage_giovannicuccu.java       | 397 ++++++++++--------
 .../morling/onebrc/CreateMeasurements3.java   |   2 +-
 5 files changed, 224 insertions(+), 178 deletions(-)
 mode change 100755 => 100644 calculate_average_giovannicuccu.sh
 mode change 100755 => 100644 prepare_giovannicuccu.sh

diff --git a/calculate_average_giovannicuccu.sh b/calculate_average_giovannicuccu.sh
old mode 100755
new mode 100644
index 314b5d8a7..218838559
--- a/calculate_average_giovannicuccu.sh
+++ b/calculate_average_giovannicuccu.sh
@@ -15,5 +15,5 @@
 #  limitations under the License.
 #
 
-JAVA_OPTS=""
+JAVA_OPTS="--enable-preview --add-modules=jdk.incubator.vector -XX:-TieredCompilation"
 java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_giovannicuccu
diff --git a/github_users.txt b/github_users.txt
index 497909c78..eb3ac2ca1 100644
--- a/github_users.txt
+++ b/github_users.txt
@@ -1,3 +1,4 @@
+giovannicuccu;Giovanni Cuccu
 Ujjwalbharti;Ujjwal Bharti
 abfrmblr;Abhilash
 ags313;ags
diff --git a/prepare_giovannicuccu.sh b/prepare_giovannicuccu.sh
old mode 100755
new mode 100644
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_giovannicuccu.java b/src/main/java/dev/morling/onebrc/CalculateAverage_giovannicuccu.java
index 7b549dc06..7123c2c14 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_giovannicuccu.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_giovannicuccu.java
@@ -15,10 +15,19 @@
  */
 package dev.morling.onebrc;
 
+import jdk.incubator.vector.ByteVector;
+import jdk.incubator.vector.IntVector;
+import jdk.incubator.vector.VectorOperators;
+import jdk.incubator.vector.VectorSpecies;
+
 import static java.util.stream.Collectors.*;
 
 import java.io.IOException;
 import java.io.RandomAccessFile;
+import java.lang.foreign.Arena;
+import java.lang.foreign.MemorySegment;
+import java.lang.foreign.ValueLayout;
+import java.nio.ByteBuffer;
 import java.nio.ByteOrder;
 import java.nio.MappedByteBuffer;
 import java.nio.channels.FileChannel;
@@ -31,34 +40,42 @@
 import java.util.concurrent.*;
 
 /*
- Solution without unsafe that borrows the ideas of splullara, thomasvue, royvanrijn
+ Solution without unsafe that borrows the ideas of splullara, thomasvue, royvanrijn and merykitty
  */
 
 public class CalculateAverage_giovannicuccu {
 
     private static final String FILE = "./measurements.txt";
 
-    public static record PartitionBoundary(long start, long end) {
+    private static final VectorSpecies<Byte> BYTE_SPECIES = ByteVector.SPECIES_256;
+    private static final int BYTE_SPECIES_LANES = BYTE_SPECIES.length();
+    private static final ByteOrder NATIVE_ORDER = ByteOrder.nativeOrder();
+    public static final VectorSpecies<Integer> INT_SPECIES = IntVector.SPECIES_256;
+    public static final int INT_SPECIES_LANES = INT_SPECIES.length();
+
+    public static final int KEY_SIZE = 128;
+
+    public static record PartitionBoundary(Path path, long start, long end) {
     }
 
     public static interface PartitionCalculator {
-        PartitionBoundary[] computePartitionsBoundaries(Path path);
+        List<PartitionBoundary> computePartitionsBoundaries(Path path);
     }
 
     public static class ProcessorPartitionCalculator implements PartitionCalculator {
 
-        public PartitionBoundary[] computePartitionsBoundaries(Path path) {
+        public List<PartitionBoundary> computePartitionsBoundaries(Path path) {
             try {
                 int numberOfSegments = Runtime.getRuntime().availableProcessors();
                 long fileSize = path.toFile().length();
                 long segmentSize = fileSize / numberOfSegments;
-                PartitionBoundary[] segmentBoundaries = new PartitionBoundary[numberOfSegments];
+                List<PartitionBoundary> segmentBoundaries = new ArrayList<>(numberOfSegments);
                 try (RandomAccessFile randomAccessFile = new RandomAccessFile(path.toFile(), "r")) {
                     long segStart = 0;
                     long segEnd = segmentSize;
                     for (int i = 0; i < numberOfSegments; i++) {
                         segEnd = findEndSegment(randomAccessFile, segEnd, fileSize);
-                        segmentBoundaries[i] = new PartitionBoundary(segStart, segEnd);
+                        segmentBoundaries.add(new PartitionBoundary(path, segStart, segEnd));
                         segStart = segEnd;
                         segEnd = Math.min(segEnd + segmentSize, fileSize);
                     }
@@ -81,51 +98,27 @@ private long findEndSegment(RandomAccessFile raf, long location, long fileSize)
         }
     }
 
-    public static class MeasurementAggregator {
-        private final int hash;
+    private static class MeasurementAggregatorVectorized {
+
         private int min;
         private int max;
         private double sum;
         private long count;
-        private final byte[] station;
-        private final int offset;
-        private final String name;
+        private final int len;
+        private final int hash;
 
-        private final long[] data;
-        private final int dataOffset;
+        private final int offset;
+        private byte[] data;
 
-        public MeasurementAggregator(byte[] station, int offset, int hash, int initialValue, long[] data, int dataOffset) {
+        public MeasurementAggregatorVectorized(byte[] data, int offset, int len, int hash, int initialValue) {
             min = initialValue;
             max = initialValue;
             sum = initialValue;
             count = 1;
-            this.station = station;
-            this.offset = offset;
+            this.len = len;
             this.hash = hash;
-            this.data = data;
-            this.dataOffset = dataOffset;
-            this.name = new String(station, 0, offset, StandardCharsets.UTF_8);
-        }
-
-        public MeasurementAggregator(byte[] station, int offset, int hash, int initialValue) {
-            min = initialValue;
-            max = initialValue;
-            sum = initialValue;
-            count = 1;
-            this.station = station;
             this.offset = offset;
-            this.hash = hash;
-            this.data = new long[0];
-            this.dataOffset = 0;
-            this.name = new String(station, 0, offset, StandardCharsets.UTF_8);
-        }
-
-        public boolean hasSameStation(byte[] stationIn, int offsetIn) {
-            return Arrays.equals(stationIn, 0, offsetIn, station, 0, offset);
-        }
-
-        public boolean hasSameStation(long[] dataIn, int offsetIn) {
-            return Arrays.equals(dataIn, 0, offsetIn, data, 0, dataOffset);
+            this.data = data;
         }
 
         public void add(int value) {
@@ -139,8 +132,7 @@ public void add(int value) {
             count++;
         }
 
-        public void merge(MeasurementAggregator other) {
-            // System.out.println("min=" +min + " other min=" +other.min);
+        public void merge(MeasurementAggregatorVectorized other) {
             min = Math.min(min, other.min);
             max = Math.max(max, other.max);
             sum += other.sum;
@@ -149,7 +141,7 @@ public void merge(MeasurementAggregator other) {
 
         @Override
         public String toString() {
-            return round((double) min / 10) + "/" + round((sum / (double) count) / 10) + "/" + round((double) max / 10);
+            return round(min / 10.) + "/" + round(sum / (double) (10 * count)) + "/" + round(max / 10.);
         }
 
         private double round(double value) {
@@ -164,116 +156,141 @@ public int getHash() {
             return hash;
         }
 
-        public String getName() {
-            return name;
+        public int getLen() {
+            return len;
+        }
+
+        public boolean dataEquals(byte[] data, int offset) {
+            return Arrays.equals(this.data, this.offset, this.offset + len, data, offset, offset + len);
+
         }
 
-        public byte[] getStation() {
-            return station;
+        public String getName() {
+            return new String(data, offset, len, StandardCharsets.UTF_8);
         }
 
         public int getOffset() {
             return offset;
         }
 
-        public long[] getData() {
+        public byte[] getData() {
             return data;
         }
-
     }
 
-    public static class MeasurementList {
-
+    private static class MeasurementListVectorized {
         private static final int SIZE = 1024 * 64;
-        private final MeasurementAggregator[] measurements = new MeasurementAggregator[SIZE];
+        private final MeasurementAggregatorVectorized[] measurements = new MeasurementAggregatorVectorized[SIZE];
+        private final byte[] keyData = new byte[SIZE * KEY_SIZE];
 
-        public void add(byte[] station, int offset, int hash, int value) {
+        private final MemorySegment dataSegment = MemorySegment.ofArray(keyData);
+
+        public void addWithByteVector(ByteVector chunk1, int len, int hash, int value, MemorySegment memorySegment, long offset) {
             int index = hash & (SIZE - 1);
-            if (measurements[index] == null) {
-                measurements[index] = new MeasurementAggregator(station.clone(), offset, hash, value);
-            }
-            else {
-                if (measurements[index].hasSameStation(station, offset)) {
-                    measurements[index].add(value);
-                }
-                else {
-                    while (measurements[index] != null && !measurements[index].hasSameStation(station, offset)) {
-                        index = (index + 1) & (SIZE - 1);
+            int i = 0;
+            while (measurements[index] != null) {
+                if (measurements[index].getLen() == len && measurements[index].getHash() == hash) {
+                    var nodeKey = ByteVector.fromArray(BYTE_SPECIES, keyData, index * KEY_SIZE);
+                    long eqMask = chunk1.compare(VectorOperators.EQ, nodeKey).toLong();
+                    long validMask = -1L >>> (64 - len);
+                    if ((eqMask & validMask) == validMask) {
+                        measurements[index].add(value);
+                        return;
                     }
-                    if (measurements[index] == null) {
-                        measurements[index] = new MeasurementAggregator(station.clone(), offset, hash, value);
+                }
+                index = (index + 1) & (SIZE - 1);
+            }
+            MemorySegment.copy(memorySegment, offset, dataSegment, (long) index * KEY_SIZE, len);
+            measurements[index] = new MeasurementAggregatorVectorized(keyData, index * KEY_SIZE, len, hash, value);
+        }
+
+        public void add(int len, int hash, int value, MemorySegment memorySegment, long offset) {
+            int index = hash & (SIZE - 1);
+            while (measurements[index] != null) {
+                if (measurements[index].getLen() == len && measurements[index].getHash() == hash) {
+                    int i = 0;
+                    while (i < len && keyData[index * KEY_SIZE + i] == memorySegment.get(ValueLayout.JAVA_BYTE, offset + i)) {
+                        i++;
                     }
-                    else {
+                    if (i == len) {
                         measurements[index].add(value);
+                        return;
                     }
                 }
+                index = (index + 1) & (SIZE - 1);
             }
+            MemorySegment.copy(memorySegment, offset, dataSegment, (long) index * KEY_SIZE, len);
+            measurements[index] = new MeasurementAggregatorVectorized(keyData, index * KEY_SIZE, len, hash, value);
         }
 
-        public void merge(MeasurementAggregator measurementAggregator) {
-            int index = (measurementAggregator.getHash() & (SIZE - 1));
-            if (measurements[index] == null) {
-                measurements[index] = measurementAggregator;
-            }
-            else {
-                while (measurements[index] != null && !measurements[index].hasSameStation(measurementAggregator.getStation(), measurementAggregator.getOffset())) {
-                    index = (index + 1) & (SIZE - 1);
-                }
-                if (measurements[index] == null) {
-                    measurements[index] = measurementAggregator;
-                }
-                else {
-                    measurements[index].merge(measurementAggregator);
+        public void merge(MeasurementAggregatorVectorized measurementAggregator) {
+            int index = measurementAggregator.getHash() & (SIZE - 1);
+            while (measurements[index] != null) {
+                if (measurements[index].getLen() == measurementAggregator.getLen() && measurements[index].getHash() == measurementAggregator.getHash()) {
+                    if (measurementAggregator.dataEquals(measurements[index].getData(), measurements[index].getOffset())) {
+                        measurements[index].merge(measurementAggregator);
+                        return;
+                    }
                 }
+                index = (index + 1) & (SIZE - 1);
             }
+            measurements[index] = measurementAggregator;
         }
 
-        public MeasurementAggregator[] getMeasurements() {
+        public MeasurementAggregatorVectorized[] getMeasurements() {
             return measurements;
         }
+
     }
 
-    public static class MMapReader {
-        private final Path path;
-        private final PartitionBoundary[] boundaries;
+    private static class MMapReaderMemorySegment {
 
+        private final Path path;
+        private final List<PartitionBoundary> boundaries;
         private final boolean serial;
+        private static final byte SEPARATOR = ';';
+        ByteVector separators = ByteVector.broadcast(BYTE_SPECIES, SEPARATOR);
+        private static final ValueLayout.OfLong JAVA_LONG_LT = ValueLayout.JAVA_LONG_UNALIGNED.withOrder(ByteOrder.LITTLE_ENDIAN);
 
-        public MMapReader(Path path, PartitionCalculator partitionCalculator, boolean serial) {
+        public MMapReaderMemorySegment(Path path, PartitionCalculator partitionCalculator, boolean serial) {
             this.path = path;
             this.serial = serial;
             boundaries = partitionCalculator.computePartitionsBoundaries(path);
         }
 
-        public TreeMap<String, MeasurementAggregator> elaborate() {
-            try (ExecutorService executor = Executors.newFixedThreadPool(boundaries.length)) {
-                List<Future<MeasurementList>> futures = new ArrayList<>();
+        public TreeMap<String, MeasurementAggregatorVectorized> elaborate() throws IOException {
+            try (ExecutorService executor = Executors.newFixedThreadPool(boundaries.size());
+                    FileChannel fileChannel = (FileChannel) Files.newByteChannel((path), StandardOpenOption.READ);
+                    var arena = Arena.ofShared()) {
+
+                List<Future<MeasurementListVectorized>> futures = new ArrayList<>();
                 for (PartitionBoundary boundary : boundaries) {
                     if (serial) {
-                        FutureTask<MeasurementList> future = new FutureTask<>(() -> computeListForPartition(boundary.start(), boundary.end()));
+                        FutureTask<MeasurementListVectorized> future = new FutureTask<>(() -> computeListForPartition(
+                                fileChannel, boundary));
                         future.run();
-                        // System.out.println("done with partition " + boundary);
                         futures.add(future);
                     }
                     else {
-                        Future<MeasurementList> future = executor.submit(() -> computeListForPartition(boundary.start(), boundary.end()));
+                        Future<MeasurementListVectorized> future = executor.submit(() -> computeListForPartition(
+                                fileChannel, boundary));
                         futures.add(future);
                     }
                 }
-                TreeMap<String, MeasurementAggregator> ris = reduce(futures);
+                TreeMap<String, MeasurementAggregatorVectorized> ris = reduce(futures);
                 return ris;
             }
         }
 
-        private TreeMap<String, MeasurementAggregator> reduce(List<Future<MeasurementList>> futures) {
+        private TreeMap<String, MeasurementAggregatorVectorized> reduce(List<Future<MeasurementListVectorized>> futures) {
             try {
-                TreeMap<String, MeasurementAggregator> risMap = new TreeMap<>();
-                MeasurementList ris = new MeasurementList();
-                for (Future<MeasurementList> future : futures) {
-                    MeasurementList results = future.get();
+                TreeMap<String, MeasurementAggregatorVectorized> risMap = new TreeMap<>();
+                MeasurementListVectorized ris = new MeasurementListVectorized();
+                for (Future<MeasurementListVectorized> future : futures) {
+                    MeasurementListVectorized results = future.get();
                     merge(ris, results);
                 }
-                for (MeasurementAggregator m : ris.getMeasurements()) {
+                for (MeasurementAggregatorVectorized m : ris.getMeasurements()) {
                     if (m != null) {
                         risMap.put(m.getName(), m);
                     }
@@ -286,101 +303,134 @@ private TreeMap<String, MeasurementAggregator> reduce(List<Future<MeasurementLis
             }
         }
 
-        private void merge(MeasurementList result, MeasurementList partial) {
-            for (MeasurementAggregator m : partial.getMeasurements()) {
+        private void merge(MeasurementListVectorized result, MeasurementListVectorized partial) {
+            for (MeasurementAggregatorVectorized m : partial.getMeasurements()) {
                 if (m != null) {
                     result.merge(m);
                 }
             }
         }
 
-        private MeasurementList computeListForPartition(long start, long end) {
-            MeasurementList list = new MeasurementList();
-            try {
-                try (FileChannel fileChannel = (FileChannel) Files.newByteChannel((path), StandardOpenOption.READ)) {
-                    MappedByteBuffer mappedByteBuffer = fileChannel.map(FileChannel.MapMode.READ_ONLY, start, end - start);
-                    mappedByteBuffer.order(BYTE_ORDER.LITTLE_ENDIAN);
-                    int limit = mappedByteBuffer.limit();
-                    int startLine;
-                    byte[] stationb = new byte[100];
-                    while ((startLine = mappedByteBuffer.position()) < limit - 110) {
-                        int currentPosition = startLine;
-                        byte b = 0;
-                        int i = 0;
-                        int hash = 0;
-
-                        while ((b = mappedByteBuffer.get(currentPosition++)) != ';') {
-                            stationb[i++] = b;
-                            hash = 31 * hash + b;
+        private MeasurementListVectorized computeListForPartition(FileChannel fileChannel, PartitionBoundary boundary) {
+            try (var arena = Arena.ofConfined()) {
+                var memorySegment = fileChannel.map(FileChannel.MapMode.READ_ONLY, boundary.start(), boundary.end() - boundary.start(), arena);
+                MeasurementListVectorized list = new MeasurementListVectorized();
+                long size = memorySegment.byteSize();
+                long offset = 0;
+                long safe = size - KEY_SIZE;
+                // ByteBuffer byteBuffer = memorySegment.asByteBuffer();
+                // byteBuffer.order(ByteOrder.LITTLE_ENDIAN);
+                ByteVector chunk1 = ByteVector.zero(BYTE_SPECIES);
+                ByteVector chunk2 = ByteVector.zero(BYTE_SPECIES);
+                while (offset < safe) {
+                    int len = 0;
+                    chunk1 = ByteVector.fromMemorySegment(BYTE_SPECIES, memorySegment, offset, NATIVE_ORDER);
+                    int equals = chunk1.compare(VectorOperators.EQ, separators).firstTrue();
+                    len += equals;
+                    if (equals == BYTE_SPECIES_LANES) {
+                        while (memorySegment.get(ValueLayout.JAVA_BYTE, offset + len) != ';') {
+                            len++;
                         }
-                        if (hash < 0) {
-                            hash = -hash;
-                        }
-
-                        long numberWord = mappedByteBuffer.getLong(currentPosition);
-                        int decimalSepPos = Long.numberOfTrailingZeros(~numberWord & 0x10101000);
-                        int value = convertIntoNumber(decimalSepPos, numberWord);
-                        mappedByteBuffer.position(currentPosition + (decimalSepPos >>> 3) + 3);
-
-                        list.add(stationb, i, hash, value);
+                    }
 
+                    int hash = hash(memorySegment, offset, len);
+                    long prevOffset = offset;
+                    offset += len + 1;
+
+                    long numberWord = memorySegment.get(JAVA_LONG_LT, offset);
+                    int decimalSepPos = Long.numberOfTrailingZeros(~numberWord & 0x10101000);
+                    int value = convertIntoNumber(decimalSepPos, numberWord);
+                    offset += (decimalSepPos >>> 3) + 3;
+                    // System.out.println("Value=" + value);
+                    if (len < BYTE_SPECIES_LANES) {
+                        list.addWithByteVector(chunk1, len, hash, value, memorySegment, prevOffset);
                     }
-                    while ((startLine = mappedByteBuffer.position()) < limit) {
-                        int currentPosition = startLine;
-                        byte b = 0;
-                        int i = 0;
-                        int hash = 0;
-                        while ((b = mappedByteBuffer.get(currentPosition++)) != ';') {
-                            stationb[i++] = b;
-                            hash = 31 * hash + b;
-                        }
-                        if (hash < 0) {
-                            hash = -hash;
+                    else {
+                        list.add(len, hash, value, memorySegment, prevOffset);
+                    }
+                }
+
+                while (offset < size) {
+                    int len = 0;
+                    int equals = BYTE_SPECIES_LANES;
+                    if (offset + BYTE_SPECIES_LANES < size) {
+                        chunk1 = ByteVector.fromMemorySegment(BYTE_SPECIES, memorySegment, offset, NATIVE_ORDER);
+                        equals = chunk1.compare(VectorOperators.EQ, separators).firstTrue();
+                        len += equals;
+                        if (equals == BYTE_SPECIES_LANES) {
+                            while (memorySegment.get(ValueLayout.JAVA_BYTE, offset + len) != ';') {
+                                len++;
+                            }
                         }
+                    }
+                    else {
+                        byte[] bytes = new byte[BYTE_SPECIES_LANES];
+                        MemorySegment.copy(memorySegment, offset + len, MemorySegment.ofArray(bytes), 0, (size - offset - len));
+                        // byteBuffer.get(offset + len, bytes, 0, (int) (size - offset - len));
+                        chunk1 = ByteVector.fromArray(BYTE_SPECIES, bytes, 0);
+                        equals = chunk1.compare(VectorOperators.EQ, separators).firstTrue();
+                        len += equals;
+                    }
+                    int hash = hash(memorySegment, offset, len);
+                    long prevOffset = offset;
+                    offset += len + 1;
 
-                        int value = 0;
-                        if (currentPosition <= limit - 8) {
-                            long numberWord = mappedByteBuffer.getLong(currentPosition);
-                            int decimalSepPos = Long.numberOfTrailingZeros(~numberWord & 0x10101000);
-                            value = convertIntoNumber(decimalSepPos, numberWord);
-                            mappedByteBuffer.position(currentPosition + (decimalSepPos >>> 3) + 3);
+                    int value = 0;
+                    if (offset < size - 8) {
+                        long numberWord = memorySegment.get(JAVA_LONG_LT, offset);
+                        int decimalSepPos = Long.numberOfTrailingZeros(~numberWord & 0x10101000);
+                        value = convertIntoNumber(decimalSepPos, numberWord);
+                        offset += (decimalSepPos >>> 3) + 3;
+                    }
+                    else {
+                        long currentPosition = offset;
+                        int sign = 1;
+                        byte b = memorySegment.get(ValueLayout.JAVA_BYTE, currentPosition++);
+                        if (b == '-') {
+                            sign = -1;
                         }
                         else {
-                            int sign = 1;
-                            b = mappedByteBuffer.get(currentPosition++);
-                            if (b == '-') {
-                                sign = -1;
-                            }
-                            else {
-                                value = b - '0';
-                            }
-                            while ((b = mappedByteBuffer.get(currentPosition++)) != '.') {
-                                value = value * 10 + (b - '0');
-                            }
-                            b = mappedByteBuffer.get(currentPosition);
+                            value = b - '0';
+                        }
+                        while ((b = memorySegment.get(ValueLayout.JAVA_BYTE, currentPosition++)) != '.') {
                             value = value * 10 + (b - '0');
-                            if (sign == -1) {
-                                value = -value;
-                            }
-                            mappedByteBuffer.position(currentPosition + 2);
                         }
-
-                        list.add(stationb, i, hash, value);
+                        b = memorySegment.get(ValueLayout.JAVA_BYTE, currentPosition);
+                        value = value * 10 + (b - '0');
+                        if (sign == -1) {
+                            value = -value;
+                        }
+                        offset = currentPosition + 2;
+                    }
+                    if (len < BYTE_SPECIES_LANES) {
+                        list.addWithByteVector(chunk1, len, hash, value, memorySegment, prevOffset);
+                    }
+                    else {
+                        list.add(len, hash, value, memorySegment, prevOffset);
                     }
                 }
+                return list;
             }
             catch (IOException e) {
-                System.out.println("Error");
-                System.err.println(e);
+                throw new RuntimeException(e);
             }
-            return list;
         }
 
-        private static final ByteOrder BYTE_ORDER = ByteOrder.nativeOrder();
+        private static final int GOLDEN_RATIO = 0x9E3779B9;
+        private static final int HASH_LROTATE = 5;
 
-        private static long getLongLittleEndian(long value) {
-            value = Long.reverseBytes(value);
-            return value;
+        private static int hash(MemorySegment memorySegment, long start, int len) {
+            int x;
+            int y;
+            if (len >= Integer.BYTES) {
+                x = memorySegment.get(ValueLayout.JAVA_INT_UNALIGNED, start);
+                y = memorySegment.get(ValueLayout.JAVA_INT_UNALIGNED, start + len - Integer.BYTES);
+            }
+            else {
+                x = memorySegment.get(ValueLayout.JAVA_BYTE, start);
+                y = memorySegment.get(ValueLayout.JAVA_BYTE, start + len - Byte.BYTES);
+            }
+            return (Integer.rotateLeft(x * GOLDEN_RATIO, HASH_LROTATE) ^ y) * GOLDEN_RATIO;
         }
 
         private static int convertIntoNumber(int decimalSepPos, long numberWord) {
@@ -405,16 +455,11 @@ private static int convertIntoNumber(int decimalSepPos, long numberWord) {
             return (int) value;
         }
 
-        private static long[] masks = new long[]{ 0x0000000000000000, 0xFF00000000000000L, 0xFFFF000000000000L,
-                0xFFFFFF0000000000L, 0xFFFFFFFF00000000L, 0xFFFFFFFFFF000000L, 0xFFFFFFFFFF0000L, 0xFFFFFFFFFFFF00L };
-
     }
 
     public static void main(String[] args) throws IOException {
-        long start = System.currentTimeMillis();
-        MMapReader reader = new MMapReader(Paths.get(FILE), new ProcessorPartitionCalculator(), false);
-        Map<String, MeasurementAggregator> measurements = reader.elaborate();
-        // System.out.println("ela=" + (System.currentTimeMillis() - start));
+        MMapReaderMemorySegment reader = new MMapReaderMemorySegment(Paths.get(FILE), new ProcessorPartitionCalculator(), false);
+        Map<String, MeasurementAggregatorVectorized> measurements = reader.elaborate();
         System.out.println(measurements);
 
     }
diff --git a/src/main/java/dev/morling/onebrc/CreateMeasurements3.java b/src/main/java/dev/morling/onebrc/CreateMeasurements3.java
index 804b83ca9..9bcc16dfb 100644
--- a/src/main/java/dev/morling/onebrc/CreateMeasurements3.java
+++ b/src/main/java/dev/morling/onebrc/CreateMeasurements3.java
@@ -55,7 +55,7 @@ public static void main(String[] args) throws Exception {
                 out.write(station.name);
                 out.write(';');
                 out.write(Double.toString(Math.round(temp * 10.0) / 10.0));
-                out.newLine();
+                out.write('\n');
                 if (i % 50_000_000 == 0) {
                     System.out.printf("Wrote %,d measurements in %,d ms%n", i, System.currentTimeMillis() - start);
                 }

From 3ed80c4b5f98b90a9ad871233638ef25a1c84442 Mon Sep 17 00:00:00 2001
From: Jaime Polidura <73758994+JaimePolidura@users.noreply.github.com>
Date: Sun, 28 Jan 2024 23:31:03 +0100
Subject: [PATCH 188/268] Thank you for the challenge!! (#599)

* added code

* Fixed pointers bugs

* removed my own benchmark

* added comment on how I handle hash collisions

* executed mvn clean verify

* made scripts executable & fixed rounding issues

* Fixed way of dealing with hash collisions

* changed method name sameNameBytes to isSameNameBytes

* changes script from sh to bash

* fixed chunking bug

* Fixed bug in chunking when file size is too small

* added Runtime.getRuntime().availableProcessors
---
 calculate_average_JaimePolidura.sh            |  26 ++
 prepare_JaimePolidura.sh                      |  24 ++
 .../CalculateAverage_JaimePolidura.java       | 398 ++++++++++++++++++
 3 files changed, 448 insertions(+)
 create mode 100755 calculate_average_JaimePolidura.sh
 create mode 100755 prepare_JaimePolidura.sh
 create mode 100644 src/main/java/dev/morling/onebrc/CalculateAverage_JaimePolidura.java

diff --git a/calculate_average_JaimePolidura.sh b/calculate_average_JaimePolidura.sh
new file mode 100755
index 000000000..dfd890848
--- /dev/null
+++ b/calculate_average_JaimePolidura.sh
@@ -0,0 +1,26 @@
+#!/bin/sh
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+# Run the native image when it has been built; otherwise fall back to running on the JVM.
+if [ -f target/CalculateAverage_JaimePolidura_image ]; then
+	target/CalculateAverage_JaimePolidura_image
+else
+	echo "Native image not found. Running in JVM mode"
+	# JAVA_OPTS is expanded unquoted below, so the shell splits it into one argument per flag.
+	JAVA_OPTS="--enable-preview -XX:+UnlockExperimentalVMOptions -XX:+UseEpsilonGC -XX:+UseTransparentHugePages -XX:+TrustFinalNonStaticFields"
+	java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_JaimePolidura
+fi
diff --git a/prepare_JaimePolidura.sh b/prepare_JaimePolidura.sh
new file mode 100755
index 000000000..8c4e0e040
--- /dev/null
+++ b/prepare_JaimePolidura.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+source "$HOME/.sdkman/bin/sdkman-init.sh"
+sdk use java 21.0.2-graal 1>&2
+# Build the native image only when it does not exist yet.
+if [ ! -f target/CalculateAverage_JaimePolidura_image ]; then
+	OPTS="--gc=epsilon -O3 --enable-preview --initialize-at-build-time=dev.morling.onebrc.CalculateAverage_JaimePolidura"
+	native-image $OPTS -cp target/average-1.0.0-SNAPSHOT.jar -o target/CalculateAverage_JaimePolidura_image dev.morling.onebrc.CalculateAverage_JaimePolidura
+fi	
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_JaimePolidura.java b/src/main/java/dev/morling/onebrc/CalculateAverage_JaimePolidura.java
new file mode 100644
index 000000000..3980a2c38
--- /dev/null
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_JaimePolidura.java
@@ -0,0 +1,398 @@
+/*
+ *  Copyright 2023 The original authors
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package dev.morling.onebrc;
+
+import sun.misc.Unsafe;
+
+import java.io.RandomAccessFile;
+import java.lang.foreign.Arena;
+import java.lang.foreign.MemorySegment;
+import java.lang.reflect.Field;
+import java.nio.channels.FileChannel;
+import java.util.Map;
+import java.util.TreeMap;
+
+public final class CalculateAverage_JaimePolidura {
+    private static final String FILE = "./measurements.txt";
+    private static final Unsafe UNSAFE = initUnsafe();
+    private static final long SEMICOLON_PATTERN = 0X3B3B3B3B3B3B3B3BL;
+
+    private static Unsafe initUnsafe() {
+        try {
+            Field singleton = Unsafe.class.getDeclaredField("theUnsafe");
+            singleton.setAccessible(true); // the field is not public
+            return (Unsafe) singleton.get(null); // static field, so no receiver instance is needed
+        }
+        catch (Exception e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    // Entry point: splits the input across worker threads, waits for them, then merges and prints.
+    public static void main(String[] args) throws Exception {
+        final Worker[] pool = createWorkers();
+        startWorkers(pool);
+        joinWorkers(pool);
+
+        // The per-worker tables are folded into a single map before printing.
+        printResults(mergeWorkersResults(pool));
+    }
+
+    private static void joinWorkers(Worker[] workers) throws InterruptedException {
+        for (Worker worker : workers) {
+            worker.join(); // blocks until this worker thread has terminated
+        }
+    }
+
+    private static void startWorkers(Worker[] workers) {
+        for (Worker worker : workers) {
+            worker.start(); // runs Worker.run() on the worker's own thread
+        }
+    }
+
+    // Maps the input file and creates one worker per contiguous byte range of the mapping.
+    private static Worker[] createWorkers() throws Exception {
+        // Only the file and channel are closed on exit; the mapping is tied to Arena.global() and stays valid.
+        try (RandomAccessFile file = new RandomAccessFile(FILE, "r"); FileChannel channel = file.getChannel()) {
+            final long fileSize = channel.size();
+            MemorySegment mmappedFile = channel.map(FileChannel.MapMode.READ_ONLY, 0, fileSize, Arena.global());
+
+            int nWorkers = fileSize > 1024 * 1024 ? Runtime.getRuntime().availableProcessors() : 1;
+            Worker[] workers = new Worker[nWorkers];
+            long quantityPerWorker = Math.floorDiv(fileSize, nWorkers);
+            for (int i = 0; i < nWorkers; i++) {
+                long startAddr = mmappedFile.address() + quantityPerWorker * i;
+                // The last worker also takes the remainder bytes, i.e. its range runs to the end of the file.
+                long endAddr = i == nWorkers - 1 ? mmappedFile.address() + fileSize : startAddr + quantityPerWorker;
+                workers[i] = new Worker(mmappedFile, fileSize, startAddr, endAddr);
+                workers[i].setPriority(Thread.MAX_PRIORITY);
+            }
+            return workers;
+        }
+    }
+
+    // Folds every worker's table into one map keyed by station name (a TreeMap, so iteration is name-ordered).
+    private static Map<String, Result> mergeWorkersResults(Worker[] workers) {
+        Map<String, Result> mergedResults = new TreeMap<>();
+
+        for (Worker worker : workers) {
+            for (Result entry : worker.results.entries) {
+                if (entry == null) {
+                    continue; // empty table slot
+                }
+                // Decode explicitly as UTF-8 so the station names do not depend on the JVM's default charset.
+                String name = new String(entry.name, 0, entry.nameLength, java.nio.charset.StandardCharsets.UTF_8);
+                Result alreadyExistingResult = mergedResults.get(name);
+                if (alreadyExistingResult == null) {
+                    // First occurrence: the worker's entry itself becomes the accumulator for this name.
+                    mergedResults.put(name, entry);
+                    continue;
+                }
+                alreadyExistingResult.min = Math.min(alreadyExistingResult.min, entry.min);
+                alreadyExistingResult.max = Math.max(alreadyExistingResult.max, entry.max);
+                alreadyExistingResult.count = alreadyExistingResult.count + entry.count;
+                alreadyExistingResult.sum = alreadyExistingResult.sum + entry.sum;
+            }
+        }
+        return mergedResults;
+    }
+
+    // Prints all stations as {name=min/mean/max, ...}; the stored integers are divided by 10 for display.
+    private static void printResults(Map<String, Result> results) {
+        StringBuilder stringBuilder = new StringBuilder(results.size() * 32);
+        stringBuilder.append('{');
+
+        for (Map.Entry<String, Result> entry : results.entrySet()) {
+            if (stringBuilder.length() > 1) {
+                stringBuilder.append(", ");
+            }
+
+            Result result = entry.getValue();
+            stringBuilder.append(entry.getKey())
+                    .append('=')
+                    .append(round(((double) result.min) / 10.0))
+                    .append('/')
+                    // count * 10.0 is evaluated in double: the int product count * 10 overflows above ~214M rows
+                    .append(round((double) result.sum / (result.count * 10.0)))
+                    .append('/')
+                    .append(round(((double) result.max) / 10.0d));
+        }
+
+        stringBuilder.append('}');
+        System.out.println(stringBuilder);
+    }
+
+    // Thread that parses one byte range of the mapped file and aggregates it into its own SimpleMap.
+    static class Worker extends Thread {
+        // 128 bytes rather than 100: parseName() stores whole 8-byte words, so a store may begin at offset 96.
+        private final byte[] lastParsedNameBytes = new byte[128];
+        private int lastParsedNameLength;
+        private long lastParsedNameHash;
+        private int lastParsedTemperature;
+        private final SimpleMap results;
+        private final MemorySegment mmappedFile;
+        private final long mmappedFileSize;
+        private long currentAddr; // Will point to beginning of string
+        private long endAddr; // Will point to \n
+
+        public Worker(MemorySegment mmappedFile, long mmappedFileSize, long startAddr, long endAddr) {
+            super("Worker[" + startAddr + ", " + endAddr + "]");
+
+            this.mmappedFileSize = mmappedFileSize;
+            this.mmappedFile = mmappedFile;
+            this.currentAddr = startAddr;
+            this.endAddr = endAddr;
+            this.results = new SimpleMap(roundUpToPowerOfTwo(1 << 16)); // 2^16
+        }
+
+        @Override
+        public void run() {
+            // Snap the assigned range to whole lines first; the range may turn out to be empty.
+            adjustStartAddr();
+            adjustEndAddr();
+            if (this.currentAddr >= endAddr) {
+                return;
+            }
+
+            // Each iteration consumes exactly one "name;temperature\n" record.
+            while (currentAddr < endAddr) {
+                parseName();
+                parseTemperature();
+                this.currentAddr++; // We don't want it to point to \n
+
+                results.put(this.lastParsedNameHash, this.lastParsedNameBytes, this.lastParsedNameLength, this.lastParsedTemperature);
+            }
+        }
+
+        // Branchless parse of the temperature at currentAddr into tenths of a degree; leaves currentAddr on the
+        // terminating \n. Idea from Quan Anh Mai's implementation
+        private void parseTemperature() {
+            long numberWord = UNSAFE.getLong(currentAddr);
+            // Bit 4 (0x10) is set in every ASCII digit (0x30-0x39) but clear in '.' (0x2E)
+            int decimalSepPos = Long.numberOfTrailingZeros(~numberWord & 0x10101000);
+            // 28 = 4 + 8 * 3 (4 is the number of trailing zero bits below bit 4 of the '.' byte)
+            // xxxn.nn- shift: 28 - 28 = 0
+            // xxxxxn.n shift: 28 - 12 = 16
+            // xxxxn.nn shift: 28 - 20 = 8
+            int shift = 28 - decimalSepPos;
+
+            // '-' is 0x2D (0010 1101) while every digit is 0x3N, so bit 4 of the first byte is clear only
+            // for a leading minus sign: shifting its complement to bit 63 and sign-extending yields
+            // signed == 0 for a positive number and signed == -1 (all ones) for a negative one.
+            long signed = (~numberWord << 59) >> 63;
+
+            // If signed is 0 (positive), designMask will be 0xFFFFFFFFFFFFFFFF (-1): nothing is cleared
+            // If signed is -1 (negative), designMask will be 0xFFFFFFFFFFFFFF00 (-256): the '-' byte is cleared
+            long designMask = ~(signed & 0xFF);
+
+            // Align the number to a fixed position
+            // (x represents any non-related character, _ represents 0x00, n represents the actual digit and - negative)
+            // xxxn.nn- -> xxxn.nn-
+            // xxxxxn.n -> xxxn.n__
+            // xxxxn.nn -> xxxn.nn_
+            long numberAligned = (numberWord & designMask) << shift;
+
+            // We convert ascii representation to number value
+            long numberConvertedFromAscii = numberAligned & 0x0F000F0F00L;
+
+            // Now digits is in the form 0xUU00TTHH00 (UU: units digit, TT: tens digit, HH: hundreds digit)
+            // 0xUU00TTHH00 * (100 * 0x1000000 + 10 * 0x10000 + 1) =
+            // 0x000000UU00TTHH00 +
+            // 0x00UU00TTHH000000 * 10 +
+            // 0xUU00TTHH00000000 * 100
+            // Now TT * 100 has 2 trailing zeroes and HH * 100 + TT * 10 + UU < 0x400
+            // As a result, our value lies in bits 32 to 41 of this product
+            // That was close :)
+            long absValue = ((numberConvertedFromAscii * 0x640a0001) >>> 32) & 0x3FF;
+
+            long signedValue = (absValue ^ signed) - signed;
+
+            this.currentAddr += (((decimalSepPos - 4) / 8) + 2);
+
+            this.lastParsedTemperature = (int) signedValue;
+        }
+
+        // Scans the name 8 bytes at a time up to ';', copying the words into lastParsedNameBytes and XOR-ing
+        // them into the hash. I first saw this idea in Artsiom Korzun's implementation
+        private void parseName() {
+            this.lastParsedNameHash = 0;
+            long totalWordHash = 0;
+            int totalWordLength = 0;
+
+            for (;;) {
+                long actualWord = UNSAFE.getLong(currentAddr + totalWordLength);
+                long hasSemicolon = hasByte(actualWord, SEMICOLON_PATTERN);
+
+                if (hasSemicolon != 0) {
+                    int actualLength = Long.numberOfTrailingZeros(hasSemicolon) >> 3; // bytes before the first ';'
+                    if (actualLength == 0) {
+                        actualWord = 0;
+                    }
+                    // The explicit zeroing above is needed because mask() would shift by 64, i.e. not at all.
+                    actualWord = mask(actualWord, actualLength);
+
+                    UNSAFE.putLong(this.lastParsedNameBytes, Unsafe.ARRAY_BYTE_BASE_OFFSET + totalWordLength, actualWord);
+
+                    totalWordHash ^= actualWord;
+                    totalWordLength += actualLength;
+
+                    this.lastParsedNameLength = totalWordLength;
+                    this.lastParsedNameHash = totalWordHash;
+                    this.currentAddr += totalWordLength + 1; // +1 Because we don't want to point to ';'
+
+                    break;
+                }
+                else {
+                    UNSAFE.putLong(this.lastParsedNameBytes, Unsafe.ARRAY_BYTE_BASE_OFFSET + totalWordLength, actualWord);
+
+                    totalWordLength += 8;
+                    totalWordHash ^= actualWord;
+                }
+            }
+        }
+
+        // Keeps the low `length` bytes of word; the arithmetic shift sign-extends the last kept byte upwards
+        private long mask(long word, int length) {
+            int shift = (8 - length) * 8;
+            return (word << shift) >> shift;
+        }
+
+        private long hasByte(long word, long pattern) {
+            long patternMatch = word ^ pattern; // bytes equal to the pattern byte become 0x00
+            return (patternMatch - 0x0101010101010101L) & (~patternMatch & 0x8080808080808080L);
+        }
+
+        private void adjustStartAddr() {
+            if (currentAddr == this.mmappedFile.address()) {
+                return;
+            }
+            // Skip the line that begins before this range; the bound is checked before the byte is read.
+            while (currentAddr != endAddr && UNSAFE.getByte(currentAddr) != '\n') {
+                currentAddr++;
+            }
+
+            currentAddr++; // We want it to point to the first character instead of \n
+        }
+
+        private void adjustEndAddr() {
+            long endAddressMmappedFile = mmappedFile.address() + mmappedFileSize;
+            if (endAddr >= endAddressMmappedFile) {
+                return;
+            }
+            // Extend the range to the next \n without ever reading at or beyond the end of the mapping.
+            while (endAddr != endAddressMmappedFile && UNSAFE.getByte(endAddr) != '\n') {
+                endAddr++;
+            }
+        }
+    }
+
+    // Fixed-capacity hash table with linear probing that maps a station name to its running aggregate.
+    // It never grows: put() cannot terminate once every slot is occupied by a different name.
+    static class SimpleMap {
+        private final Result[] entries;
+        private final long size;
+
+        public SimpleMap(int size) {
+            this.size = size;
+            this.entries = new Result[size];
+        }
+
+        // Records one measurement for the given name, creating the entry the first time the name is seen.
+        // hashToPut only selects the first slot to probe; names are always compared byte by byte.
+        public void put(long hashToPut, byte[] nameToPut, int nameLength, int valueToPut) {
+            int slot = hashToIndex(hashToPut);
+            Result candidate;
+            // Walk consecutive slots, wrapping around, until a free one or the matching name is found.
+            while ((candidate = entries[slot]) != null && !candidate.isSameName(nameToPut, nameLength)) {
+                if (++slot >= this.size) {
+                    slot = 0;
+                }
+            }
+
+            if (candidate == null) {
+                // Copy the name so the stored entry does not alias the caller's array.
+                byte[] ownedName = new byte[nameLength];
+                System.arraycopy(nameToPut, 0, ownedName, 0, nameLength);
+                entries[slot] = new Result(hashToPut, ownedName, nameLength, valueToPut,
+                        valueToPut, valueToPut, 1);
+                return;
+            }
+            candidate.min = Math.min(candidate.min, valueToPut);
+            candidate.max = Math.max(candidate.max, valueToPut);
+            candidate.count++;
+            candidate.sum = candidate.sum + valueToPut;
+        }
+
+        // Folds the upper half of the hash into the lower half and masks the result with size - 1,
+        // which only addresses every slot when size is a power of two.
+        private int hashToIndex(long hash) {
+            return (int) (((hash >> 32) ^ ((int) hash)) & (this.size - 1));
+        }
+    }
+
+    // Running min/max/sum/count of one station; printResults() divides the stored integers by 10.
+    static class Result {
+        public byte[] name;
+        public int nameLength;
+        public int max;
+        public int min;
+        public long sum; // long: the sum of many measurements can exceed the int range on large inputs
+        public int count;
+        public long hash;
+
+        public Result(long hash, byte[] name, int nameLength, int max, int min, int sum, int occ) {
+            this.nameLength = nameLength;
+            this.count = occ;
+            this.hash = hash;
+            this.name = name;
+            this.max = max;
+            this.min = min;
+            this.sum = sum;
+        }
+
+        public boolean isSameName(byte[] otherNameBytes, int otherNameLength) {
+            return this.nameLength == otherNameLength && isSameNameBytes(otherNameBytes);
+        }
+
+        private boolean isSameNameBytes(byte[] otherNameBytes) {
+            for (int i = 0; i < this.nameLength; i++) {
+                if (this.name[i] != otherNameBytes[i]) {
+                    return false;
+                }
+            }
+            return true;
+        }
+    }
+
+    private static double round(double value) {
+        return Math.round(value * 10.0) / 10.0; // one decimal place; Math.round resolves ties toward positive infinity
+    }
+
+    // Returns the smallest power of two that is >= number; non-positive input yields 1.
+    // Values above 2^30 wrap around to Integer.MIN_VALUE because the result no longer fits in an int.
+    private static int roundUpToPowerOfTwo(int number) {
+        if (number <= 1) {
+            return 1;
+        }
+
+        // highestOneBit(number - 1) is the largest power of two strictly below number,
+        // so doubling it gives the smallest power of two that reaches number.
+        final int largestBelow = Integer.highestOneBit(number - 1);
+        final int rounded = largestBelow << 1;
+
+        return rounded;
+    }
+}

From 2b55f0dd6b7ea0a40d8666c7e52691ef23f612ce Mon Sep 17 00:00:00 2001
From: Gunnar Morling <gunnar.morling@googlemail.com>
Date: Sun, 28 Jan 2024 23:43:07 +0100
Subject: [PATCH 189/268] Leaderboard update

---
 README.md                          | 17 +++++++++--------
 calculate_average_giovannicuccu.sh |  0
 prepare_giovannicuccu.sh           |  0
 3 files changed, 9 insertions(+), 8 deletions(-)
 mode change 100644 => 100755 calculate_average_giovannicuccu.sh
 mode change 100644 => 100755 prepare_giovannicuccu.sh

diff --git a/README.md b/README.md
index 4b1113776..2636b80c7 100644
--- a/README.md
+++ b/README.md
@@ -42,11 +42,11 @@ These are the results from running all entries into the challenge on eight cores
 | # | Result (m:s.ms) | Implementation     | JDK | Submitter     | Notes     |
 |---|-----------------|--------------------|-----|---------------|-----------|
 | 1 | 00:01.893 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_thomaswue.java)| 21.0.2-graal | [Thomas Wuerthinger](https://github.com/thomaswue), [Quan Anh Mai](https://github.com/merykitty), [Alfonso² Peterssen](https://github.com/mukel) | GraalVM native binary, uses Unsafe |
-| 2 | 00:02.019 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java)| 21.0.2-graal | [Artsiom Korzun](https://github.com/artsiomkorzun) | GraalVM native binary, uses Unsafe |
-| 3 | 00:02.091 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java)| 21.0.2-graal | [Van Phu DO](https://github.com/abeobk) | GraalVM native binary, uses Unsafe |
-|   | 00:02.149 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jerrinot.java)| 21.0.2-graal | [Jaromir Hamala](https://github.com/jerrinot) | GraalVM native binary, uses Unsafe |
+| 2 | 00:01.990 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java)| 21.0.2-graal | [Artsiom Korzun](https://github.com/artsiomkorzun) | GraalVM native binary, uses Unsafe |
+| 3* | 00:02.081 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jerrinot.java)| 21.0.2-graal | [Jaromir Hamala](https://github.com/jerrinot) | GraalVM native binary, uses Unsafe |
+| 3* | 00:02.091 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java)| 21.0.2-graal | [Van Phu DO](https://github.com/abeobk) | GraalVM native binary, uses Unsafe |
 |   | 00:02.157 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_royvanrijn.java)| 21.0.2-graal | [Roy van Rijn](https://github.com/royvanrijn) | GraalVM native binary, uses Unsafe |
-|   | 00:02.512 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_serkan_ozal.java)| 21.0.1-open | [Serkan ÖZAL](https://github.com/serkan-ozal) | uses Unsafe |
+|   | 00:02.440 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_serkan_ozal.java)| 21.0.1-open | [Serkan ÖZAL](https://github.com/serkan-ozal) | uses Unsafe |
 |   | 00:02.575 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_merykittyunsafe.java)| 21.0.1-open | [Quan Anh Mai](https://github.com/merykitty) | uses Unsafe |
 |   | 00:02.984 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_yourwass.java)| 21.0.1-open | [yourwass](https://github.com/yourwass) | uses Unsafe |
 |   | 00:03.258 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_merykitty.java)| 21.0.1-open | [Quan Anh Mai](https://github.com/merykitty) |  |
@@ -65,18 +65,20 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:04.230 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_iziamos.java)| 21.0.1-open | [John Ziamos](https://github.com/iziamos) | uses Unsafe |
 |   | 00:04.255 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_tivrfoa.java)| 21.0.2-graal | [tivrfoa](https://github.com/tivrfoa) | GraalVM native binary, uses Unsafe |
 |   | 00:04.684 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_gigiblender.java)| 21.0.1-open | [Florin Blanaru](https://github.com/gigiblender) | uses Unsafe |
+|   | 00:04.719 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_giovannicuccu.java)| java | [Giovanni Cuccu](https://github.com/giovannicuccu) |  |
 |   | 00:04.741 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_cliffclick.java)| 21.0.1-open | [Cliff Click](https://github.com/cliffclick) | uses Unsafe |
 |   | 00:04.800 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_parkertimmins.java)| 21.0.1-open | [Parker Timmins](https://github.com/parkertimmins) |  |
+|   | 00:04.884 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_shipilev.java)| 21.0.1-open | [Aleksey Shipilëv](https://github.com/shipilev) |  |
 |   | 00:04.920 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_vemana.java)| 21.0.1-graal | [Subrahmanyam](https://github.com/vemana) |  |
+|   | 00:05.069 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_JaimePolidura.java)| 21.0.2-graal | [Jaime Polidura](https://github.com/JaimePolidura) | GraalVM native binary, uses Unsafe |
 |   | 00:05.077 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jonathanaotearoa.java)| 21.0.2-graal | [Jonathan Wright](https://github.com/jonathan-aotearoa) | GraalVM native binary, uses Unsafe |
 |   | 00:05.142 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_arjenw.java)| 21.0.1-open | [Arjen Wisse](https://github.com/arjenw) |  |
 |   | 00:05.235 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_unbounded.java)| 21.0.1-open | [unbounded](https://github.com/unbounded) |  |
 |   | 00:05.336 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_plevart.java)| 21.0.1-tem | [Peter Levart](https://github.com/plevart) |  |
-|   | 00:05.387 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ianopolousfast.java)| 21.0.1-open | [Dr Ian Preston](https://github.com/ianopolousfast) |  |
+|   | 00:05.180 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ianopolousfast.java)| 21.0.1-open | [Dr Ian Preston](https://github.com/ianopolousfast) |  |
 |   | 00:05.478 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_obourgain.java)| 21.0.1-open | [Olivier Bourgain](https://github.com/obourgain) | uses Unsafe |
 |   | 00:05.705 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_gamlerhart.java)| 21.0.1-open | [Roman Stoffel](https://github.com/gamlerhart) |  |
 |   | 00:05.709 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_armandino.java)| 21.0.2-graal | [Arman Sharif](https://github.com/armandino) | GraalVM native binary, uses Unsafe |
-|   | 00:05.850 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_shipilev.java)| 21.0.1-open | [Aleksey Shipilëv](https://github.com/shipilev) |  |
 |   | 00:05.887 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_charlibot.java)| 21.0.1-graal | [Charlie Evans](https://github.com/charlibot) | uses Unsafe |
 |   | 00:05.960 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_vaidhy.java)| 21.0.1-graal | [Vaidhy Mayilrangam](https://github.com/vaidhy) | uses Unsafe |
 |   | 00:05.971 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_melgenek.java)| 21.0.2-open | [Yevhenii Melnyk](https://github.com/melgenek) |  |
@@ -88,7 +90,6 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:06.654 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jbachorik.java)| 21.0.1-graal | [Jaroslav Bachorik](https://github.com/jbachorik) |  |
 |   | 00:06.715 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_algirdasrascius.java)| 21.0.1-open | [Algirdas Raščius](https://github.com/algirdasrascius) |  |
 |   | 00:06.884 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_rcasteltrione.java)| 21.0.1-graal | [rcasteltrione](https://github.com/rcasteltrione) |  |
-|   | 00:07.240 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_giovannicuccu.java)| java | [giovannicuccu](https://github.com/giovannicuccu) |  |
 |   | 00:07.563 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_3j5a.java)| 21.0.1-graal | [3j5a](https://github.com/3j5a) |  |
 |   | 00:07.680 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_C5H12O5.java)| 21.0.1-graal | [Xylitol](https://github.com/C5H12O5) | uses Unsafe |
 |   | 00:07.730 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jotschi.java)| 21.0.1-open | [Johannes Schüth](https://github.com/jotschi) |  |
@@ -126,9 +127,9 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:12.220 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_richardstartin.java)| 21.0.1-open | [Richard Startin](https://github.com/richardstartin) |  |
 |   | 00:12.495 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_SamuelYvon.java)| 21.0.1-graal | [Samuel Yvon](https://github.com/SamuelYvon) | GraalVM native binary |
 |   | 00:12.568 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_MeanderingProgrammer.java)| 21.0.1-graal | [Vlad](https://github.com/MeanderingProgrammer) |  |
+|   | 00:12.736 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_anestoruk.java)| 21.0.1-open | [Andrzej Nestoruk](https://github.com/anestoruk) |  |
 |   | 00:12.800 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_yonatang.java)| java | [Yonatan Graber](https://github.com/yonatang) |  |
 |   | 00:13.013 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_thanhtrinity.java)| 21.0.1-graal | [Thanh Duong](https://github.com/thanhtrinity) |  |
-|   | 00:13.029 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_anestoruk.java)| 21.0.1-open | [Andrzej Nestoruk](https://github.com/anestoruk) |  |
 |   | 00:13.071 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ianopolous.java)| 21.0.1-open | [Dr Ian Preston](https://github.com/ianopolous) |  |
 |   | 00:13.817 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_entangled90.java)| 21.0.1-open | [Carlo](https://github.com/entangled90) |  |
 |   | 00:14.502 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_eriklumme.java)| 21.0.1-graal | [eriklumme](https://github.com/eriklumme) |  |
diff --git a/calculate_average_giovannicuccu.sh b/calculate_average_giovannicuccu.sh
old mode 100644
new mode 100755
diff --git a/prepare_giovannicuccu.sh b/prepare_giovannicuccu.sh
old mode 100644
new mode 100755

From eeed048466a0d917ae2b1c684f11d65d6ea6ad37 Mon Sep 17 00:00:00 2001
From: Gunnar Morling <gunnar.morling@googlemail.com>
Date: Mon, 29 Jan 2024 08:55:20 +0100
Subject: [PATCH 190/268] Leaderboard update

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 2636b80c7..ae621bf89 100644
--- a/README.md
+++ b/README.md
@@ -49,6 +49,7 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:02.440 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_serkan_ozal.java)| 21.0.1-open | [Serkan ÖZAL](https://github.com/serkan-ozal) | uses Unsafe |
 |   | 00:02.575 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_merykittyunsafe.java)| 21.0.1-open | [Quan Anh Mai](https://github.com/merykitty) | uses Unsafe |
 |   | 00:02.984 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_yourwass.java)| 21.0.1-open | [yourwass](https://github.com/yourwass) | uses Unsafe |
+|   | 00:03.013 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_linl33.java)| 22.ea.31-open | [Li Lin](https://github.com/linl33) | uses Unsafe |
 |   | 00:03.258 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_merykitty.java)| 21.0.1-open | [Quan Anh Mai](https://github.com/merykitty) |  |
 |   | 00:03.298 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_vemanaNonIdiomatic.java)| 21.0.1-graal | [Subrahmanyam (non-idiomatic)](https://github.com/vemana) | uses Unsafe |
 |   | 00:03.376 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_mtopolnik.java)| 21.0.1-graal | [Marko Topolnik](https://github.com/mtopolnik) | uses Unsafe |

From 1eaf8791c10438b7005f37fa515c7d9fec8d1c78 Mon Sep 17 00:00:00 2001
From: Gunnar Morling <gunnar.morling@googlemail.com>
Date: Mon, 29 Jan 2024 10:00:39 +0100
Subject: [PATCH 191/268] Adding some articles

---
 README.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/README.md b/README.md
index ae621bf89..be2ce00a9 100644
--- a/README.md
+++ b/README.md
@@ -428,6 +428,7 @@ A: It's the abbreviation of the project name: **One** **B**illion **R**ow **C**h
 
 A list of external resources such as blog posts and videos, discussing 1BRC and specific implementations:
 
+* [The One Billion Row Challenge Shows That Java Can Process a One Billion Rows File in Two Seconds ](https://www.infoq.com/news/2024/01/1brc-fast-java-processing), by Olimpiu Pop (interview)
 * [Cliff Click discussing his 1BRC solution on the Coffee Compiler Club](https://www.youtube.com/watch?v=NJNIbgV6j-Y) (video)
 * [1️⃣🐝🏎️🦆 (1BRC in SQL with DuckDB)](https://rmoff.net/2024/01/03/1%EF%B8%8F%E2%83%A3%EF%B8%8F-1brc-in-sql-with-duckdb/), by Robin Moffatt (blog post)
 * [1 billion rows challenge in PostgreSQL and ClickHouse](https://ftisiot.net/posts/1brows/), by Francesco Tisiot (blog post)
@@ -439,6 +440,8 @@ A list of external resources such as blog posts and videos, discussing 1BRC and
 * [The One Billion Row Challenge - .NET Edition](https://dev.to/mergeconflict/392-the-one-billion-row-challenge-net-edition), by Frank A. Krueger (podcast)
 * [One Billion Row Challenge](https://curiouscoding.nl/posts/1brc/), by Ragnar Groot Koerkamp (blog post)
 * [ClickHouse and The One Billion Row Challenge](https://clickhouse.com/blog/clickhouse-one-billion-row-challenge), by Dale McDiarmid (blog post)
+* [One Billion Row Challenge & Azure Data Explorer](https://nielsberglund.com/post/2024-01-28-one-billion-row-challenge--azure-data-explorer/), by Niels Berglund (blog post)
+* [One Billion Row Challenge - view from sidelines](https://www.chashnikov.dev/post/one-billion-row-challenge-view-from-sidelines), by Leo Chashnikov (blog post)
 
 ## Sponsorship
 

From 5ba094c8fded54677e787220e352a1baf74cacec Mon Sep 17 00:00:00 2001
From: Artsiom Korzun <72259616+artsiomkorzun@users.noreply.github.com>
Date: Mon, 29 Jan 2024 20:36:25 +0100
Subject: [PATCH 192/268] loop similar to thomas (#634)

---
 .../CalculateAverage_artsiomkorzun.java       | 302 ++++++++++--------
 1 file changed, 162 insertions(+), 140 deletions(-)

diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java b/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java
index 2a1a387d5..c0cc8f99e 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java
@@ -33,8 +33,9 @@
 public class CalculateAverage_artsiomkorzun {
 
     private static final Path FILE = Path.of("./measurements.txt");
-    private static final long SEGMENT_SIZE = 4 * 1024 * 1024;
+    private static final long SEGMENT_SIZE = 2 * 1024 * 1024;
     private static final long COMMA_PATTERN = 0x3B3B3B3B3B3B3B3BL;
+    private static final long LINE_PATTERN = 0x0A0A0A0A0A0A0A0AL;
     private static final long DOT_BITS = 0x10101000;
     private static final long MAGIC_MULTIPLIER = (100 * 0x1000000 + 10 * 0x10000 + 1);
 
@@ -162,14 +163,14 @@ private static double round(double v) {
         return Math.round(v) / 10.0;
     }
 
-    private record Aggregate(int min, int max, long sum, int cnt) {
+    private record Aggregate(long min, long max, long sum, long cnt) {
     }
 
     private static class Aggregates {
 
-        private static final int ENTRIES = 64 * 1024;
-        private static final int SIZE = 128 * ENTRIES;
-        private static final int MASK = (ENTRIES - 1) << 7;
+        private static final long ENTRIES = 64 * 1024;
+        private static final long SIZE = 256 * ENTRIES;
+        private static final long MASK = (ENTRIES - 1) << 8;
 
         private final long pointer;
 
@@ -179,27 +180,27 @@ public Aggregates() {
             UNSAFE.setMemory(pointer, SIZE, (byte) 0);
         }
 
-        public long find(long word, int hash) {
+        public long find(long word, long hash) {
             long address = pointer + offset(hash);
-            long w = word(address + 24);
+            long w = word(address + 48);
             return (w == word) ? address : 0;
         }
 
-        public long find(long word1, long word2, int hash) {
+        public long find(long word1, long word2, long hash) {
             long address = pointer + offset(hash);
-            long w1 = word(address + 24);
-            long w2 = word(address + 32);
+            long w1 = word(address + 48);
+            long w2 = word(address + 56);
             return (word1 == w1) && (word2 == w2) ? address : 0;
         }
 
-        public long put(long reference, long word, int length, int hash) {
-            for (int offset = offset(hash);; offset = next(offset)) {
+        public long put(long reference, long word, long length, long hash) {
+            for (long offset = offset(hash);; offset = next(offset)) {
                 long address = pointer + offset;
-                if (equal(reference, word, address + 24, length)) {
+                if (equal(reference, word, address + 48, length)) {
                     return address;
                 }
 
-                int len = UNSAFE.getInt(address);
+                long len = UNSAFE.getLong(address);
                 if (len == 0) {
                     alloc(reference, length, hash, address);
                     return address;
@@ -207,55 +208,55 @@ public long put(long reference, long word, int length, int hash) {
             }
         }
 
-        public static void update(long address, int value) {
-            long sum = UNSAFE.getLong(address + 8) + value;
-            int cnt = UNSAFE.getInt(address + 16) + 1;
-            short min = UNSAFE.getShort(address + 20);
-            short max = UNSAFE.getShort(address + 22);
+        public static void update(long address, long value) {
+            long sum = UNSAFE.getLong(address + 16) + value;
+            long cnt = UNSAFE.getLong(address + 24) + 1;
+            long min = UNSAFE.getLong(address + 32);
+            long max = UNSAFE.getLong(address + 40);
 
-            UNSAFE.putLong(address + 8, sum);
-            UNSAFE.putInt(address + 16, cnt);
+            UNSAFE.putLong(address + 16, sum);
+            UNSAFE.putLong(address + 24, cnt);
 
             if (value < min) {
-                UNSAFE.putShort(address + 20, (short) value);
+                UNSAFE.putLong(address + 32, value);
             }
 
             if (value > max) {
-                UNSAFE.putShort(address + 22, (short) value);
+                UNSAFE.putLong(address + 40, value);
             }
         }
 
         public void merge(Aggregates rights) {
-            for (int rightOffset = 0; rightOffset < SIZE; rightOffset += 128) {
+            for (int rightOffset = 0; rightOffset < SIZE; rightOffset += 256) {
                 long rightAddress = rights.pointer + rightOffset;
-                int length = UNSAFE.getInt(rightAddress);
+                long length = UNSAFE.getLong(rightAddress);
 
                 if (length == 0) {
                     continue;
                 }
 
-                int hash = UNSAFE.getInt(rightAddress + 4);
+                long hash = UNSAFE.getLong(rightAddress + 8);
 
-                for (int offset = offset(hash);; offset = next(offset)) {
+                for (long offset = offset(hash);; offset = next(offset)) {
                     long address = pointer + offset;
 
-                    if (equal(address + 24, rightAddress + 24, length)) {
-                        long sum = UNSAFE.getLong(address + 8) + UNSAFE.getLong(rightAddress + 8);
-                        int cnt = UNSAFE.getInt(address + 16) + UNSAFE.getInt(rightAddress + 16);
-                        short min = (short) Math.min(UNSAFE.getShort(address + 20), UNSAFE.getShort(rightAddress + 20));
-                        short max = (short) Math.max(UNSAFE.getShort(address + 22), UNSAFE.getShort(rightAddress + 22));
+                    if (equal(address + 48, rightAddress + 48, length)) {
+                        long sum = UNSAFE.getLong(address + 16) + UNSAFE.getLong(rightAddress + 16);
+                        long cnt = UNSAFE.getLong(address + 24) + UNSAFE.getLong(rightAddress + 24);
+                        long min = Math.min(UNSAFE.getLong(address + 32), UNSAFE.getLong(rightAddress + 32));
+                        long max = Math.max(UNSAFE.getLong(address + 40), UNSAFE.getLong(rightAddress + 40));
 
-                        UNSAFE.putLong(address + 8, sum);
-                        UNSAFE.putInt(address + 16, cnt);
-                        UNSAFE.putShort(address + 20, min);
-                        UNSAFE.putShort(address + 22, max);
+                        UNSAFE.putLong(address + 16, sum);
+                        UNSAFE.putLong(address + 24, cnt);
+                        UNSAFE.putLong(address + 32, min);
+                        UNSAFE.putLong(address + 40, max);
                         break;
                     }
 
-                    int len = UNSAFE.getInt(address);
+                    long len = UNSAFE.getLong(address);
 
                     if (len == 0) {
-                        UNSAFE.copyMemory(rightAddress, address, length + 24);
+                        UNSAFE.copyMemory(rightAddress, address, length + 48);
                         break;
                     }
                 }
@@ -265,19 +266,19 @@ public void merge(Aggregates rights) {
         public Map<String, Aggregate> aggregate() {
             TreeMap<String, Aggregate> set = new TreeMap<>();
 
-            for (int offset = 0; offset < SIZE; offset += 128) {
+            for (long offset = 0; offset < SIZE; offset += 256) {
                 long address = pointer + offset;
-                int length = UNSAFE.getInt(address);
+                long length = UNSAFE.getLong(address);
 
                 if (length != 0) {
-                    byte[] array = new byte[length - 1];
-                    UNSAFE.copyMemory(null, address + 24, array, Unsafe.ARRAY_BYTE_BASE_OFFSET, array.length);
+                    byte[] array = new byte[(int) length - 1];
+                    UNSAFE.copyMemory(null, address + 48, array, Unsafe.ARRAY_BYTE_BASE_OFFSET, array.length);
                     String key = new String(array);
 
-                    long sum = UNSAFE.getLong(address + 8);
-                    int cnt = UNSAFE.getInt(address + 16);
-                    short min = UNSAFE.getShort(address + 20);
-                    short max = UNSAFE.getShort(address + 22);
+                    long sum = UNSAFE.getLong(address + 16);
+                    long cnt = UNSAFE.getLong(address + 24);
+                    long min = UNSAFE.getLong(address + 32);
+                    long max = UNSAFE.getLong(address + 40);
 
                     Aggregate aggregate = new Aggregate(min, max, sum, cnt);
                     set.put(key, aggregate);
@@ -287,23 +288,23 @@ public Map<String, Aggregate> aggregate() {
             return set;
         }
 
-        private static void alloc(long reference, int length, int hash, long address) {
-            UNSAFE.putInt(address, length);
-            UNSAFE.putInt(address + 4, hash);
-            UNSAFE.putShort(address + 20, Short.MAX_VALUE);
-            UNSAFE.putShort(address + 22, Short.MIN_VALUE);
-            UNSAFE.copyMemory(reference, address + 24, length);
+        private static void alloc(long reference, long length, long hash, long address) {
+            UNSAFE.putLong(address, length);
+            UNSAFE.putLong(address + 8, hash);
+            UNSAFE.putLong(address + 32, Long.MAX_VALUE);
+            UNSAFE.putLong(address + 40, Long.MIN_VALUE);
+            UNSAFE.copyMemory(reference, address + 48, length);
         }
 
-        private static int offset(int hash) {
+        private static long offset(long hash) {
             return hash & MASK;
         }
 
-        private static int next(int prev) {
-            return (prev + 128) & (SIZE - 1);
+        private static long next(long prev) {
+            return (prev + 256) & (SIZE - 1);
         }
 
-        private static boolean equal(long leftAddress, long leftWord, long rightAddress, int length) {
+        private static boolean equal(long leftAddress, long leftWord, long rightAddress, long length) {
             while (length > 8) {
                 long left = UNSAFE.getLong(leftAddress);
                 long right = UNSAFE.getLong(rightAddress);
@@ -320,7 +321,7 @@ private static boolean equal(long leftAddress, long leftWord, long rightAddress,
             return leftWord == word(rightAddress);
         }
 
-        private static boolean equal(long leftAddress, long rightAddress, int length) {
+        private static boolean equal(long leftAddress, long rightAddress, long length) {
             do {
                 long left = UNSAFE.getLong(leftAddress);
                 long right = UNSAFE.getLong(rightAddress);
@@ -362,7 +363,7 @@ public void run() {
 
             for (int segment; (segment = counter.getAndIncrement()) < segmentCount;) {
                 long position = SEGMENT_SIZE * segment;
-                long size = Math.min(SEGMENT_SIZE, fileSize - position - 1);
+                long size = Math.min(SEGMENT_SIZE + 1, fileSize - position);
                 long start = fileAddress + position;
                 long end = start + size;
 
@@ -374,7 +375,55 @@ public void run() {
                 long left = next(start + chunk);
                 long right = next(start + chunk + chunk);
 
-                aggregate(aggregates, start, left - 1, left, right - 1, right, end);
+                Chunk chunk1 = new Chunk(start, left);
+                Chunk chunk2 = new Chunk(left, right);
+                Chunk chunk3 = new Chunk(right, end);
+
+                while (chunk1.has() && chunk2.has() && chunk3.has()) {
+                    long word1 = word(chunk1.position);
+                    long word2 = word(chunk2.position);
+                    long word3 = word(chunk3.position);
+
+                    long separator1 = separator(word1);
+                    long separator2 = separator(word2);
+                    long separator3 = separator(word3);
+
+                    long pointer1 = find(aggregates, chunk1, word1, separator1);
+                    long pointer2 = find(aggregates, chunk2, word2, separator2);
+                    long pointer3 = find(aggregates, chunk3, word3, separator3);
+
+                    long value1 = value(chunk1);
+                    long value2 = value(chunk2);
+                    long value3 = value(chunk3);
+
+                    Aggregates.update(pointer1, value1);
+                    Aggregates.update(pointer2, value2);
+                    Aggregates.update(pointer3, value3);
+                }
+
+                while (chunk1.has()) {
+                    long word1 = word(chunk1.position);
+                    long separator1 = separator(word1);
+                    long pointer1 = find(aggregates, chunk1, word1, separator1);
+                    long value1 = value(chunk1);
+                    Aggregates.update(pointer1, value1);
+                }
+
+                while (chunk2.has()) {
+                    long word2 = word(chunk2.position);
+                    long separator2 = separator(word2);
+                    long pointer2 = find(aggregates, chunk2, word2, separator2);
+                    long value2 = value(chunk2);
+                    Aggregates.update(pointer2, value2);
+                }
+
+                while (chunk3.has()) {
+                    long word3 = word(chunk3.position);
+                    long separator3 = separator(word3);
+                    long pointer3 = find(aggregates, chunk3, word3, separator3);
+                    long value3 = value(chunk3);
+                    Aggregates.update(pointer3, value3);
+                }
             }
 
             while (!result.compareAndSet(null, aggregates)) {
@@ -387,123 +436,82 @@ public void run() {
         }
 
         private static long next(long position) {
-            while (UNSAFE.getByte(position++) != '\n') {
-                // continue
-            }
-            return position;
-        }
-
-        private static void aggregate(Aggregates aggregates, long position1, long limit1, long position2, long limit2, long position3, long limit3) {
-            while (position1 <= limit1 && position2 <= limit2 && position3 <= limit3) {
-                long word1 = word(position1);
-                long word2 = word(position2);
-                long word3 = word(position3);
+            while (true) {
+                long word = word(position);
+                long match = word ^ LINE_PATTERN;
+                long line = (match - 0x0101010101010101L) & (~match & 0x8080808080808080L);
 
-                long separator1 = separator(word1);
-                long separator2 = separator(word2);
-                long separator3 = separator(word3);
-
-                position1 = process(aggregates, position1, word1, separator1);
-                position2 = process(aggregates, position2, word2, separator2);
-                position3 = process(aggregates, position3, word3, separator3);
-            }
-
-            while (position1 <= limit1) {
-                long word1 = word(position1);
-                long separator1 = separator(word1);
-                position1 = process(aggregates, position1, word1, separator1);
-            }
-
-            while (position2 <= limit2) {
-                long word2 = word(position2);
-                long separator2 = separator(word2);
-                position2 = process(aggregates, position2, word2, separator2);
-            }
+                if (line == 0) {
+                    position += 8;
+                    continue;
+                }
 
-            while (position3 <= limit3) {
-                long word3 = word(position3);
-                long separator3 = separator(word3);
-                position3 = process(aggregates, position3, word3, separator3);
+                return position + (Long.numberOfTrailingZeros(line) >>> 3) + 1;
             }
         }
 
-        private static long process(Aggregates aggregates, long position, long word, long separator) {
-            long end = position;
-
-            int length;
-            int hash;
-            int value;
+        private static long find(Aggregates aggregates, Chunk chunk, long word, long separator) {
+            long start = chunk.position;
+            long hash;
 
             if (separator != 0) {
-                length = length(separator);
                 word = mask(word, separator);
                 hash = mix(word);
-                end += length;
 
-                long num = word(end);
-                int dot = dot(num);
-                value = value(num, dot);
-                end += (dot >> 3) + 3;
+                chunk.position += length(separator);
                 long pointer = aggregates.find(word, hash);
 
                 if (pointer != 0) {
-                    Aggregates.update(pointer, value);
-                    return end;
+                    return pointer;
                 }
             }
             else {
                 long word0 = word;
-                word = word(end + 8);
+                word = word(start + 8);
                 separator = separator(word);
 
                 if (separator != 0) {
-                    length = length(separator) + 8;
                     word = mask(word, separator);
                     hash = mix(word ^ word0);
-                    end += length;
 
-                    long num = word(end);
-                    int dot = dot(num);
-                    value = value(num, dot);
-                    end += (dot >> 3) + 3;
+                    chunk.position += length(separator) + 8;
                     long pointer = aggregates.find(word0, word, hash);
 
                     if (pointer != 0) {
-                        Aggregates.update(pointer, value);
-                        return end;
+                        return pointer;
                     }
                 }
                 else {
-                    length = 16;
-                    long h = word ^ word0;
+                    chunk.position += 16;
+                    hash = word ^ word0;
 
                     while (true) {
-                        word = word(end + length);
+                        word = word(chunk.position);
                         separator = separator(word);
 
                         if (separator == 0) {
-                            length += 8;
-                            h ^= word;
+                            chunk.position += 8;
+                            hash ^= word;
                             continue;
                         }
 
-                        length += length(separator);
                         word = mask(word, separator);
-                        hash = mix(h ^ word);
-                        end += length;
-
-                        long num = word(end);
-                        int dot = dot(num);
-                        value = value(num, dot);
-                        end += (dot >> 3) + 3;
+                        hash = mix(hash ^ word);
+                        chunk.position += length(separator);
                         break;
                     }
                 }
             }
 
-            long pointer = aggregates.put(position, word, length, hash);
-            Aggregates.update(pointer, value);
-            return end;
+            long length = chunk.position - start;
+            return aggregates.put(start, word, length, hash);
+        }
+
+        private static long value(Chunk chunk) {
+            long num = word(chunk.position);
+            long dot = dot(num);
+            chunk.position += (dot >> 3) + 3;
+            return value(num, dot);
         }
 
         private static long separator(long word) {
@@ -516,28 +524,42 @@ private static long mask(long word, long separator) {
             return word & mask;
         }
 
-        private static int length(long separator) {
+        private static long length(long separator) {
             return (Long.numberOfTrailingZeros(separator) >>> 3) + 1;
         }
 
-        private static int mix(long x) {
+        private static long mix(long x) {
             long h = x * -7046029254386353131L;
             h ^= h >>> 35;
-            return (int) h;
+            return h;
             // h ^= h >>> 32;
             // return (int) (h ^ h >>> 16);
         }
 
-        private static int dot(long num) {
+        private static long dot(long num) {
             return Long.numberOfTrailingZeros(~num & DOT_BITS);
         }
 
-        private static int value(long w, int dot) {
+        private static long value(long w, long dot) {
             long signed = (~w << 59) >> 63;
             long mask = ~(signed & 0xFF);
             long digits = ((w & mask) << (28 - dot)) & 0x0F000F0F00L;
             long abs = ((digits * MAGIC_MULTIPLIER) >>> 32) & 0x3FF;
-            return (int) ((abs ^ signed) - signed);
+            return (abs ^ signed) - signed;
+        }
+    }
+
+    private static class Chunk {
+        final long limit;
+        long position;
+
+        public Chunk(long position, long limit) {
+            this.position = position;
+            this.limit = limit;
+        }
+
+        boolean has() {
+            return position < limit;
         }
     }
-}
+}
\ No newline at end of file

From 886f0cdb4df4fdbee7c99fb1fdd5c72d9608d635 Mon Sep 17 00:00:00 2001
From: Marko Topolnik <marko.topolnik@gmail.com>
Date: Mon, 29 Jan 2024 20:51:52 +0100
Subject: [PATCH 193/268] mtopolnik submission 3 (#637)

* calculate_average_mtopolnik

* short hash (just first 8 bytes of name)

* Remove unneeded checks

* Remove archiving classes

* 2x larger hashtable

* Add "set" to setters

* Simplify parsing temperature, remove newline search

* Reduce the size of the name slot

* Store name length and use to detect collision

* Reduce memory loads in parseTemperature

* Use short for min/max

* Extract constant for semicolon

* Fix script header

* Explicit bash shell in shebang

* Inline usage of broadcast semicolon

* Try vectorization

* Remove vectorization

* Go Unsafe

* Use SWAR temperature parsing by merykitty

* Inline some things

* Remove commented-out MemorySegment usage

* Inline namesMem.asSlice() invocation

* Try out JVM JIT flags

* Implement strcmp

* Remove unused instance variables

* Optimize hashing

* Put station name into hashtable

* Reorder method

* Remove usage of MemorySegment.getUtf8String

Replace with UNSAFE.copyMemory() and new String()

* Fix hashing bug

* Remove outdated comments

* Fix informative constants

* Use broadcastByte() more

* Improve method naming

* More hashing

* Revert more hashing

* Add commented-out code to hash 16 bytes

* Slight cleanup

* Align hashtable at cacheline boundary

* Add Graal Native image

* Revert Graal Native image

This reverts commit d916a42326d89bd1a841bbbecfae185adb8679d7.

* Simplify shell script (no SDK selection)

* Move a constant, zero out hashtable on start

* Better name comparison

* Add prepare_mtopolnik.sh

* Cleaner idiom in name comparison

* AND instead of MOD for hashtable indexing

* Improve word masking code

* Fix formatting

* Reduce memory loads

* Remove endianness checks

* Avoid hash == 0 problem

* Fix subtle bug

* MergeSort of parallel results

* Touch up perf

* Touch up perf

* Remove -Xmx256m

* Extract result printing method

* Print allocation details on OOME

* Single mmap

* Use global allocation arena

* Add commented-out Xmx64m XXMaxDirectMemorySize=1g

* withinSafeZone

* Update cursor earlier

* Better assert

* Fix bug in addrOfSemicolonSafe

* Move declaration lower

* Simplify code

* Add rounding error test case

* Fix DANGER_ZONE_LEN

* Deoptimize parseTemperatureSimple()

* Inline parseTemperatureAndAdvanceCursor()

* Skip masking until the last load

* Conditionally fetch name words

* Cleanup

* Use native image

* Use supbrocess

* Simpler code

* Cleanup

* Avoid extra condition on hot path
---
 calculate_average_mtopolnik.sh                |  10 +-
 prepare_mtopolnik.sh                          |   7 +-
 .../onebrc/CalculateAverage_mtopolnik.java    | 466 ++++++++----------
 3 files changed, 222 insertions(+), 261 deletions(-)

diff --git a/calculate_average_mtopolnik.sh b/calculate_average_mtopolnik.sh
index 24b5a1cb4..acd102459 100755
--- a/calculate_average_mtopolnik.sh
+++ b/calculate_average_mtopolnik.sh
@@ -15,5 +15,11 @@
 #  limitations under the License.
 #
 
-java --enable-preview \
-  --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_mtopolnik
+if [ -f target/CalculateAverage_mtopolnik_image ]; then
+    echo "Using native image 'target/CalculateAverage_mtopolnik_image'" 1>&2
+    target/CalculateAverage_mtopolnik_image
+else
+    JAVA_OPTS="--enable-preview"
+    echo "Native image not found, using JVM mode." 1>&2
+    java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_mtopolnik
+fi
diff --git a/prepare_mtopolnik.sh b/prepare_mtopolnik.sh
index f83a3ff69..d84f20dd8 100755
--- a/prepare_mtopolnik.sh
+++ b/prepare_mtopolnik.sh
@@ -16,4 +16,9 @@
 #
 
 source "$HOME/.sdkman/bin/sdkman-init.sh"
-sdk use java 21.0.1-graal 1>&2
+sdk use java 21.0.2-graal 1>&2
+
+if [ ! -f target/CalculateAverage_mtopolnik_image ]; then
+    NATIVE_IMAGE_OPTS="--gc=epsilon -O3 -H:+UnlockExperimentalVMOptions -H:-GenLoopSafepoints -march=native --enable-preview -H:InlineAllBonus=10 -H:-ParseRuntimeOptions --initialize-at-build-time=dev.morling.onebrc.CalculateAverage_mtopolnik"
+    native-image $NATIVE_IMAGE_OPTS -cp target/average-1.0.0-SNAPSHOT.jar -o target/CalculateAverage_mtopolnik_image dev.morling.onebrc.CalculateAverage_mtopolnik
+fi
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_mtopolnik.java b/src/main/java/dev/morling/onebrc/CalculateAverage_mtopolnik.java
index 51ea41516..61294a4f9 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_mtopolnik.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_mtopolnik.java
@@ -29,18 +29,15 @@
 import java.util.ArrayList;
 import java.util.Arrays;
 
+import static java.lang.ProcessBuilder.Redirect.PIPE;
+import static java.util.Arrays.asList;
+
 public class CalculateAverage_mtopolnik {
     private static final Unsafe UNSAFE = unsafe();
     private static final int MAX_NAME_LEN = 100;
     private static final int STATS_TABLE_SIZE = 1 << 16;
     private static final int TABLE_INDEX_MASK = STATS_TABLE_SIZE - 1;
     private static final String MEASUREMENTS_TXT = "measurements.txt";
-    private static final byte SEMICOLON = ';';
-    private static final long BROADCAST_SEMICOLON = broadcastByte(SEMICOLON);
-
-    // These two are just informative, I let the IDE calculate them for me
-    private static final long NATIVE_MEM_PER_THREAD = StatsAccessor.SIZEOF * STATS_TABLE_SIZE;
-    private static final long NATIVE_MEM_ON_8_THREADS = 8 * NATIVE_MEM_PER_THREAD;
 
     private static Unsafe unsafe() {
         try {
@@ -53,31 +50,23 @@ private static Unsafe unsafe() {
         }
     }
 
-    static class StationStats implements Comparable<StationStats> {
-        String name;
-        long sum;
-        int count;
-        int min;
-        int max;
-
-        @Override
-        public String toString() {
-            return String.format("%s=%.1f/%.1f/%.1f", name, min / 10.0, Math.round((double) sum / count) / 10.0, max / 10.0);
-        }
-
-        @Override
-        public boolean equals(Object that) {
-            return that.getClass() == StationStats.class && ((StationStats) that).name.equals(this.name);
-        }
-
-        @Override
-        public int compareTo(StationStats that) {
-            return name.compareTo(that.name);
-        }
-    }
-
     public static void main(String[] args) throws Exception {
-        calculate();
+        if (args.length >= 1 && args[0].equals("--worker")) {
+            calculate();
+            System.out.close();
+            return;
+        }
+        var curProcInfo = ProcessHandle.current().info();
+        var cmdLine = new ArrayList<String>();
+        cmdLine.add(curProcInfo.command().get());
+        cmdLine.addAll(asList(curProcInfo.arguments().get()));
+        cmdLine.add("--worker");
+        var process = new ProcessBuilder()
+                .command(cmdLine)
+                .inheritIO().redirectOutput(PIPE)
+                .start()
+                .getInputStream().transferTo(System.out);
+
     }
 
     static void calculate() throws Exception {
@@ -113,7 +102,6 @@ static void calculate() throws Exception {
     }
 
     private static class ChunkProcessor implements Runnable {
-        private static final long NAMEBUF_SIZE = 2 * Long.BYTES;
         private static final int CACHELINE_SIZE = 64;
 
         private final long inputBase;
@@ -122,8 +110,6 @@ private static class ChunkProcessor implements Runnable {
         private final int myIndex;
 
         private StatsAccessor stats;
-        private long nameBufBase;
-        private long cursor;
 
         ChunkProcessor(long chunkStart, long chunkLimit, StationStats[][] results, int myIndex) {
             this.inputBase = chunkStart;
@@ -138,16 +124,12 @@ public void run() {
                 long totalAllocated = 0;
                 String threadName = Thread.currentThread().getName();
                 long statsByteSize = STATS_TABLE_SIZE * StatsAccessor.SIZEOF;
-                var diagnosticString = String.format("Thread %s needs %,d bytes, managed to allocate before OOM: ",
-                        threadName, statsByteSize + NAMEBUF_SIZE);
+                var diagnosticString = String.format("Thread %s needs %,d bytes", threadName, statsByteSize);
                 try {
                     stats = new StatsAccessor(confinedArena.allocate(statsByteSize, CACHELINE_SIZE));
-                    totalAllocated = statsByteSize;
-                    nameBufBase = confinedArena.allocate(NAMEBUF_SIZE).address();
                 }
                 catch (OutOfMemoryError e) {
                     System.err.print(diagnosticString);
-                    System.err.println(totalAllocated);
                     throw e;
                 }
                 processChunk();
@@ -155,227 +137,110 @@ public void run() {
             }
         }
 
-        private static final int MAX_TEMPERATURE_LEN = 5;
-        private static final int MAX_ROW_LEN = MAX_NAME_LEN + 1 + MAX_TEMPERATURE_LEN + 1;
-        private static final long DANGER_ZONE_LENGTH = ((MAX_ROW_LEN - 1) / 8 * 8 + 8);
-
         private void processChunk() {
+            final long inputSize = this.inputSize;
+            final long inputBase = this.inputBase;
+            long cursor = 0;
+            long lastNameWord;
             while (cursor < inputSize) {
-                boolean withinSafeZone;
-                long word1;
-                long word2;
-                long nameLen;
                 long nameStartAddress = inputBase + cursor;
-                if (cursor + DANGER_ZONE_LENGTH <= inputSize) {
-                    withinSafeZone = true;
-                    word1 = UNSAFE.getLong(nameStartAddress);
-                    word2 = UNSAFE.getLong(nameStartAddress + Long.BYTES);
-                    nameLen = nameLen(word1, word2, withinSafeZone);
-                    word1 = maskWord(word1, nameLen);
-                    word2 = maskWord(word2, nameLen - Long.BYTES);
+                long nameWord0 = UNSAFE.getLong(nameStartAddress);
+                long nameWord1 = 0;
+                long matchBits = semicolonMatchBits(nameWord0);
+                long hash;
+                int nameLen;
+                int temperature;
+                if (matchBits != 0) {
+                    nameLen = nameLen(matchBits);
+                    nameWord0 = maskWord(nameWord0, matchBits);
+                    cursor += nameLen;
+                    long tempWord = UNSAFE.getLong(inputBase + cursor);
+                    int dotPos = dotPos(tempWord);
+                    temperature = parseTemperature(tempWord, dotPos);
+                    cursor += (dotPos >> 3) + 3;
+                    hash = hash(nameWord0);
+                    if (stats.gotoName0(hash, nameWord0)) {
+                        stats.observe(temperature);
+                        continue;
+                    }
+                    lastNameWord = nameWord0;
                 }
-                else {
-                    withinSafeZone = false;
-                    UNSAFE.putLong(nameBufBase, 0);
-                    UNSAFE.putLong(nameBufBase + Long.BYTES, 0);
-                    UNSAFE.copyMemory(nameStartAddress, nameBufBase, Long.min(NAMEBUF_SIZE, inputSize - cursor));
-                    word1 = UNSAFE.getLong(nameBufBase);
-                    word2 = UNSAFE.getLong(nameBufBase + Long.BYTES);
-                    nameLen = nameLen(word1, word2, withinSafeZone);
+                else { // nameLen > 8
+                    hash = hash(nameWord0);
+                    nameWord1 = UNSAFE.getLong(nameStartAddress + Long.BYTES);
+                    matchBits = semicolonMatchBits(nameWord1);
+                    if (matchBits != 0) {
+                        nameLen = Long.BYTES + nameLen(matchBits);
+                        nameWord1 = maskWord(nameWord1, matchBits);
+                        cursor += nameLen;
+                        long tempWord = UNSAFE.getLong(inputBase + cursor);
+                        int dotPos = dotPos(tempWord);
+                        temperature = parseTemperature(tempWord, dotPos);
+                        cursor += (dotPos >> 3) + 3;
+                        if (stats.gotoName1(hash, nameWord0, nameWord1)) {
+                            stats.observe(temperature);
+                            continue;
+                        }
+                        lastNameWord = nameWord1;
+                    }
+                    else { // nameLen > 16
+                        nameLen = 2 * Long.BYTES;
+                        while (true) {
+                            lastNameWord = UNSAFE.getLong(nameStartAddress + nameLen);
+                            matchBits = semicolonMatchBits(lastNameWord);
+                            if (matchBits != 0) {
+                                nameLen += nameLen(matchBits);
+                                lastNameWord = maskWord(lastNameWord, matchBits);
+                                cursor += nameLen;
+                                long tempWord = UNSAFE.getLong(inputBase + cursor);
+                                int dotPos = dotPos(tempWord);
+                                temperature = parseTemperature(tempWord, dotPos);
+                                cursor += (dotPos >> 3) + 3;
+                                break;
+                            }
+                            nameLen += Long.BYTES;
+                        }
+                    }
                 }
-                long hash = hash(word1);
-                assert nameLen > 0 && nameLen <= 100 : nameLen;
-                long tempStartAddress = nameStartAddress + nameLen + 1;
-                int temperature = withinSafeZone
-                        ? parseTemperatureSwarAndAdvanceCursor(tempStartAddress)
-                        : parseTemperatureSimpleAndAdvanceCursor(tempStartAddress);
-                updateStats(hash, nameStartAddress, nameLen, word1, word2, temperature, withinSafeZone);
+                stats.gotoAndObserve(hash, nameStartAddress, nameLen, nameWord0, nameWord1, lastNameWord, temperature);
             }
         }
 
-        private void updateStats(
-                                 long hash, long nameStartAddress, long nameLen, long nameWord1, long nameWord2,
-                                 int temperature, boolean withinSafeZone) {
-            int tableIndex = (int) (hash & TABLE_INDEX_MASK);
-            while (true) {
-                stats.gotoIndex(tableIndex);
-                if (stats.hash() == hash && stats.nameLen() == nameLen && nameEquals(
-                        stats.nameAddress(), nameStartAddress, nameLen, nameWord1, nameWord2, withinSafeZone)) {
-                    stats.setSum(stats.sum() + temperature);
-                    stats.setCount(stats.count() + 1);
-                    stats.setMin((short) Integer.min(stats.min(), temperature));
-                    stats.setMax((short) Integer.max(stats.max(), temperature));
-                    return;
-                }
-                if (stats.nameLen() != 0) {
-                    tableIndex = (tableIndex + 1) & TABLE_INDEX_MASK;
-                    continue;
-                }
-                stats.setHash(hash);
-                stats.setNameLen((int) nameLen);
-                stats.setSum(temperature);
-                stats.setCount(1);
-                stats.setMin((short) temperature);
-                stats.setMax((short) temperature);
-                UNSAFE.copyMemory(nameStartAddress, stats.nameAddress(), nameLen);
-                return;
-            }
-        }
+        private static final long BROADCAST_SEMICOLON = 0x3B3B3B3B3B3B3B3BL;
+        private static final long BROADCAST_0x01 = 0x0101010101010101L;
+        private static final long BROADCAST_0x80 = 0x8080808080808080L;
 
-        // Credit: merykitty
-        private int parseTemperatureSwarAndAdvanceCursor(long tempStartAddress) {
-            long word = UNSAFE.getLong(tempStartAddress);
-            final long negated = ~word;
-            final int dotPos = Long.numberOfTrailingZeros(negated & 0x10101000);
-            cursor = (tempStartAddress + (dotPos / 8) + 3) - inputBase;
-            final long signed = (negated << 59) >> 63;
-            final long removeSignMask = ~(signed & 0xFF);
-            final long digits = ((word & removeSignMask) << (28 - dotPos)) & 0x0F000F0F00L;
-            final long absValue = ((digits * 0x640a0001) >>> 32) & 0x3FF;
-            return (int) ((absValue ^ signed) - signed);
+        private static long semicolonMatchBits(long word) {
+            long diff = word ^ BROADCAST_SEMICOLON;
+            return (diff - BROADCAST_0x01) & (~diff & BROADCAST_0x80);
         }
 
-        private int parseTemperatureSimpleAndAdvanceCursor(long tempStartAddress) {
-            final byte minus = (byte) '-';
-            final byte zero = (byte) '0';
-            final byte dot = (byte) '.';
-
-            byte ch = UNSAFE.getByte(tempStartAddress);
-            long address = tempStartAddress;
-            int temperature;
-            int sign;
-            if (ch == minus) {
-                sign = -1;
-                address++;
-                ch = UNSAFE.getByte(address);
-            }
-            else {
-                sign = 1;
-            }
-            temperature = ch - zero;
-            address++;
-            ch = UNSAFE.getByte(address);
-            if (ch == dot) {
-                address++;
-                ch = UNSAFE.getByte(address);
-            }
-            else {
-                temperature = 10 * temperature + (ch - zero);
-                address += 2;
-                ch = UNSAFE.getByte(address);
-            }
-            temperature = 10 * temperature + (ch - zero);
-            // address - inputBase is the length of the temperature field.
-            // A newline character follows the temperature, and so we advance
-            // the cursor past the newline to the start of the next line.
-            cursor = (address + 2) - inputBase;
-            return sign * temperature;
-        }
-
-        private static long hash(long word1) {
-            long seed = 0x51_7c_c1_b7_27_22_0a_95L;
-            int rotDist = 17;
-
-            long hash = word1;
-            hash *= seed;
-            hash = Long.rotateLeft(hash, rotDist);
-            // hash ^= word2;
-            // hash *= seed;
-            // hash = Long.rotateLeft(hash, rotDist);
-            return hash;
-        }
-
-        private static boolean nameEquals(long statsAddr, long inputAddr, long len, long inputWord1, long inputWord2,
-                                          boolean withinSafeZone) {
-            boolean mismatch1 = maskWord(inputWord1, len) != UNSAFE.getLong(statsAddr);
-            boolean mismatch2 = maskWord(inputWord2, len - Long.BYTES) != UNSAFE.getLong(statsAddr + Long.BYTES);
-            if (len <= 2 * Long.BYTES) {
-                return !(mismatch1 | mismatch2);
-            }
-            if (withinSafeZone) {
-                int i = 2 * Long.BYTES;
-                for (; i <= len - Long.BYTES; i += Long.BYTES) {
-                    if (UNSAFE.getLong(inputAddr + i) != UNSAFE.getLong(statsAddr + i)) {
-                        return false;
-                    }
-                }
-                return maskWord(UNSAFE.getLong(inputAddr + i), len - i) == UNSAFE.getLong(statsAddr + i);
-            }
-            else {
-                for (int i = 2 * Long.BYTES; i < len; i++) {
-                    if (UNSAFE.getByte(inputAddr + i) != UNSAFE.getByte(statsAddr + i)) {
-                        return false;
-                    }
-                }
-            }
-            return true;
+        // credit: artsiomkorzun
+        private static long maskWord(long word, long matchBits) {
+            long mask = matchBits ^ (matchBits - 1);
+            return word & mask;
         }
 
-        private static long maskWord(long word, long len) {
-            long halfShiftDistance = Long.max(0, Long.BYTES - len) << 2;
-            long mask = (~0L >>> halfShiftDistance) >>> halfShiftDistance; // avoid Java trap of shiftDist % 64
-            return word & mask;
+        // credit: merykitty
+        private static int dotPos(long word) {
+            return Long.numberOfTrailingZeros(~word & 0x10101000);
         }
 
-        private static final long BROADCAST_0x01 = broadcastByte(0x01);
-        private static final long BROADCAST_0x80 = broadcastByte(0x80);
-
-        // Adapted from https://jameshfisher.com/2017/01/24/bitwise-check-for-zero-byte/
-        // and https://github.com/ashvardanian/StringZilla/blob/14e7a78edcc16b031c06b375aac1f66d8f19d45a/stringzilla/stringzilla.h#L139-L169
-        long nameLen(long word1, long word2, boolean withinSafeZone) {
-            {
-                long matchBits1 = matchBits(word1);
-                long matchBits2 = matchBits(word2);
-                if ((matchBits1 | matchBits2) != 0) {
-                    int trailing1 = Long.numberOfTrailingZeros(matchBits1);
-                    int match1IsNonZero = trailing1 & 63;
-                    match1IsNonZero |= match1IsNonZero >>> 3;
-                    match1IsNonZero |= match1IsNonZero >>> 1;
-                    match1IsNonZero |= match1IsNonZero >>> 1;
-                    // Now match1IsNonZero is 1 if it's non-zero, else 0. Use it to
-                    // raise the lowest bit in trailing2 if trailing1 is nonzero. This forces
-                    // trailing2 to be zero if trailing1 is non-zero.
-                    int trailing2 = Long.numberOfTrailingZeros(matchBits2 | match1IsNonZero) & 63;
-                    // trailing1 | trailing2 works like trailing1 + trailing2 because if trailing2 is non-zero,
-                    // then trailing1 is 64, and since trailing2 is < 64, there's no bit overlap.
-                    return (trailing1 | trailing2) >> 3;
-                }
-            }
-            long nameStartAddress = inputBase + cursor;
-            long address = nameStartAddress + 2 * Long.BYTES;
-            long limit = inputBase + inputSize;
-            if (withinSafeZone) {
-                for (; address < limit; address += Long.BYTES) {
-                    var block = maskWord(UNSAFE.getLong(address), limit - address);
-                    long matchBits = matchBits(block);
-                    if (matchBits != 0) {
-                        return address + (Long.numberOfTrailingZeros(matchBits) >> 3) - nameStartAddress;
-                    }
-                }
-                throw new RuntimeException("Semicolon not found");
-            }
-            return addrOfSemicolonSafe(address, limit) - nameStartAddress;
+        // credit: merykitty
+        private static int parseTemperature(long word, int dotPos) {
+            final long signed = (~word << 59) >> 63;
+            final long removeSignMask = ~(signed & 0xFF);
+            final long digits = ((word & removeSignMask) << (28 - dotPos)) & 0x0F000F0F00L;
+            final long absValue = ((digits * 0x640a0001) >>> 32) & 0x3FF;
+            return (int) ((absValue ^ signed) - signed);
         }
 
-        private static long addrOfSemicolonSafe(long address, long limit) {
-            for (; address < limit - Long.BYTES + 1; address += Long.BYTES) {
-                var block = UNSAFE.getLong(address);
-                long matchBits = matchBits(block);
-                if (matchBits != 0) {
-                    return address + (Long.numberOfTrailingZeros(matchBits) >> 3);
-                }
-            }
-            for (; address < limit; address++) {
-                if (UNSAFE.getByte(address) == SEMICOLON) {
-                    return address;
-                }
-            }
-            throw new RuntimeException("Semicolon not found");
+        private static int nameLen(long separator) {
+            return (Long.numberOfTrailingZeros(separator) >>> 3) + 1;
         }
 
-        private static long matchBits(long word) {
-            long diff = word ^ BROADCAST_SEMICOLON;
-            return (diff - BROADCAST_0x01) & ~diff & BROADCAST_0x80;
+        private static long hash(long word) {
+            return Long.rotateLeft(word * 0x51_7c_c1_b7_27_22_0a_95L, 17);
         }
 
         // Copies the results from native memory to Java heap and puts them into the results array.
@@ -403,22 +268,6 @@ private void exportResults() {
             Arrays.sort(exported);
             results[myIndex] = exported;
         }
-
-        private final ByteBuffer buf = ByteBuffer.allocate(8).order(ByteOrder.nativeOrder());
-
-        private String longToString(long word) {
-            buf.clear();
-            buf.putLong(word);
-            return new String(buf.array(), StandardCharsets.UTF_8); // + "|" + Arrays.toString(buf.array());
-        }
-    }
-
-    private static long broadcastByte(int b) {
-        long nnnnnnnn = b;
-        nnnnnnnn |= nnnnnnnn << 8;
-        nnnnnnnn |= nnnnnnnn << 16;
-        nnnnnnnn |= nnnnnnnn << 32;
-        return nnnnnnnn;
     }
 
     static class StatsAccessor {
@@ -446,6 +295,16 @@ void gotoIndex(int index) {
             slotBase = address + index * SIZEOF;
         }
 
+        private boolean gotoName0(long hash, long nameWord0) {
+            gotoIndex((int) (hash & TABLE_INDEX_MASK));
+            return hash() == hash && nameWord0() == nameWord0;
+        }
+
+        private boolean gotoName1(long hash, long nameWord0, long nameWord1) {
+            gotoIndex((int) (hash & TABLE_INDEX_MASK));
+            return hash() == hash && nameWord0() == nameWord0 && nameWord1() == nameWord1;
+        }
+
         long hash() {
             return UNSAFE.getLong(slotBase + HASH_OFFSET);
         }
@@ -474,9 +333,17 @@ long nameAddress() {
             return slotBase + NAME_OFFSET;
         }
 
+        long nameWord0() {
+            return UNSAFE.getLong(nameAddress());
+        }
+
+        long nameWord1() {
+            return UNSAFE.getLong(nameAddress() + Long.BYTES);
+        }
+
         String exportNameString() {
-            final var bytes = new byte[nameLen()];
-            UNSAFE.copyMemory(null, nameAddress(), bytes, ARRAY_BASE_OFFSET, nameLen());
+            final var bytes = new byte[nameLen() - 1];
+            UNSAFE.copyMemory(null, nameAddress(), bytes, ARRAY_BASE_OFFSET, bytes.length);
             return new String(bytes, StandardCharsets.UTF_8);
         }
 
@@ -503,6 +370,59 @@ void setMin(short min) {
         void setMax(short max) {
             UNSAFE.putShort(slotBase + MAX_OFFSET, max);
         }
+
+        void gotoAndObserve(
+                            long hash, long nameStartAddress, int nameLen, long nameWord0, long nameWord1, long lastNameWord,
+                            int temperature) {
+            int tableIndex = (int) (hash & TABLE_INDEX_MASK);
+            while (true) {
+                gotoIndex(tableIndex);
+                if (hash() == hash && nameLen() == nameLen && nameEquals(
+                        nameAddress(), nameStartAddress, nameLen, nameWord0, nameWord1, lastNameWord)) {
+                    observe(temperature);
+                    break;
+                }
+                if (nameLen() != 0) {
+                    tableIndex = (tableIndex + 1) & TABLE_INDEX_MASK;
+                    continue;
+                }
+                initialize(hash, nameLen, nameStartAddress, temperature);
+                break;
+            }
+        }
+
+        void initialize(long hash, long nameLen, long nameStartAddress, int temperature) {
+            setHash(hash);
+            setNameLen((int) nameLen);
+            setSum(temperature);
+            setCount(1);
+            setMin((short) temperature);
+            setMax((short) temperature);
+            UNSAFE.copyMemory(nameStartAddress, nameAddress(), nameLen);
+        }
+
+        void observe(int temperature) {
+            setSum(sum() + temperature);
+            setCount(count() + 1);
+            setMin((short) Integer.min(min(), temperature));
+            setMax((short) Integer.max(max(), temperature));
+        }
+
+        private static boolean nameEquals(
+                                          long statsAddr, long inputAddr, long len, long inputWord1, long inputWord2, long lastInputWord) {
+            boolean mismatch1 = inputWord1 != UNSAFE.getLong(statsAddr);
+            boolean mismatch2 = inputWord2 != UNSAFE.getLong(statsAddr + Long.BYTES);
+            if (len <= 2 * Long.BYTES) {
+                return !(mismatch1 | mismatch2);
+            }
+            int i = 2 * Long.BYTES;
+            for (; i <= len - Long.BYTES; i += Long.BYTES) {
+                if (UNSAFE.getLong(inputAddr + i) != UNSAFE.getLong(statsAddr + i)) {
+                    return false;
+                }
+            }
+            return i == len || lastInputWord == UNSAFE.getLong(statsAddr + i);
+        }
     }
 
     private static void mergeSortAndPrint(StationStats[][] results) {
@@ -556,4 +476,34 @@ else if (min.equals(curr)) {
         }
         System.out.println('}');
     }
+
+    static class StationStats implements Comparable<StationStats> {
+        String name;
+        long sum;
+        int count;
+        int min;
+        int max;
+
+        @Override
+        public String toString() {
+            return String.format("%s=%.1f/%.1f/%.1f", name, min / 10.0, Math.round((double) sum / count) / 10.0, max / 10.0);
+        }
+
+        @Override
+        public boolean equals(Object that) {
+            return that.getClass() == StationStats.class && ((StationStats) that).name.equals(this.name);
+        }
+
+        @Override
+        public int compareTo(StationStats that) {
+            return name.compareTo(that.name);
+        }
+    }
+
+    private static String longToString(long word) {
+        final ByteBuffer buf = ByteBuffer.allocate(8).order(ByteOrder.nativeOrder());
+        buf.clear();
+        buf.putLong(word);
+        return new String(buf.array(), StandardCharsets.UTF_8);
+    }
 }

From a82cf2ceb7f3832a953f1a68162b03128765612b Mon Sep 17 00:00:00 2001
From: Yavuz Tas <12643010+yavuztas@users.noreply.github.com>
Date: Mon, 29 Jan 2024 21:02:20 +0100
Subject: [PATCH 194/268] improve speed, thanks to the following improvements:
 (#550)

* improve speed, thanks to the following improvements:
- loop unrolling and eliminating extra calculations
- eliminating instance level variable access
- quicker equals check, checking long by long chunks instead of bytes
- update GraalVM version to the latest

* faster equals check

* fix equals bug in 10K, more optimizations on equals and calculate hash parts

* New solution optimized for Linux/AMD hardware

* Optimize solution, try to fix 10K bug on native

* Optimize solution, move records to a local field

* test timing

* revert back accidentally pushed code

---------

Co-authored-by: Yavuz Tas <yavuz.tas@ing.com>
---
 calculate_average_yavuztas.sh                 |  10 +-
 prepare_yavuztas.sh                           |   7 +-
 .../onebrc/CalculateAverage_yavuztas.java     | 507 +++++++++++-------
 3 files changed, 333 insertions(+), 191 deletions(-)

diff --git a/calculate_average_yavuztas.sh b/calculate_average_yavuztas.sh
index bfa7b1090..bbcd403e0 100755
--- a/calculate_average_yavuztas.sh
+++ b/calculate_average_yavuztas.sh
@@ -15,5 +15,11 @@
 #  limitations under the License.
 #
 
-JAVA_OPTS="-Xms128m -Xmx128m -XX:MaxGCPauseMillis=1 -XX:-AlwaysPreTouch -XX:+UseSerialGC --enable-preview"
-java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_yavuztas
+if [ -f target/CalculateAverage_yavuztas_image ]; then
+    echo "Picking up existing native image 'target/CalculateAverage_yavuztas_image', delete the file to select JVM mode." 1>&2
+    target/CalculateAverage_yavuztas_image
+else
+    JAVA_OPTS="-XX:MaxGCPauseMillis=1 -XX:-AlwaysPreTouch -XX:+UseSerialGC -XX:+TieredCompilation --enable-preview"
+    echo "Choosing to run the app in JVM mode as no native image was found, use prepare_yavuztas.sh to generate." 1>&2
+    java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_yavuztas
+fi
diff --git a/prepare_yavuztas.sh b/prepare_yavuztas.sh
index f83a3ff69..f9871afd7 100755
--- a/prepare_yavuztas.sh
+++ b/prepare_yavuztas.sh
@@ -16,4 +16,9 @@
 #
 
 source "$HOME/.sdkman/bin/sdkman-init.sh"
-sdk use java 21.0.1-graal 1>&2
+sdk use java 21.0.2-graal 1>&2
+
+if [ ! -f target/CalculateAverage_yavuztas_image ]; then
+    NATIVE_IMAGE_OPTS="--initialize-at-build-time=dev.morling.onebrc.CalculateAverage_yavuztas --gc=epsilon -O3 -march=native -R:MaxHeapSize=128m -H:-GenLoopSafepoints --enable-preview"
+    native-image $NATIVE_IMAGE_OPTS -cp target/average-1.0.0-SNAPSHOT.jar -o target/CalculateAverage_yavuztas_image dev.morling.onebrc.CalculateAverage_yavuztas
+fi
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_yavuztas.java b/src/main/java/dev/morling/onebrc/CalculateAverage_yavuztas.java
index e33fe7e92..0e589a4b7 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_yavuztas.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_yavuztas.java
@@ -17,15 +17,16 @@
 
 import sun.misc.Unsafe;
 
-import java.io.IOException;
 import java.lang.foreign.Arena;
 import java.lang.reflect.Field;
-import java.nio.ByteBuffer;
 import java.nio.channels.FileChannel;
 import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.nio.file.StandardOpenOption;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
 import java.util.TreeMap;
 import java.util.function.Consumer;
 
@@ -35,8 +36,9 @@ public class CalculateAverage_yavuztas {
 
     private static final Unsafe UNSAFE = unsafe();
 
-    // Tried all there: MappedByteBuffer, MemorySegment and Unsafe
-    // Accessing the memory using Unsafe is still the fastest in my experience
+    // I compared all three: MappedByteBuffer, MemorySegment and Unsafe.
+    // Accessing the memory using Unsafe is still the fastest in my experience.
+    // However, I would never use it in production: a single programming error will crash your app.
     private static Unsafe unsafe() {
         try {
             final Field f = Unsafe.class.getDeclaredField("theUnsafe");
@@ -48,296 +50,419 @@ private static Unsafe unsafe() {
         }
     }
 
-    // Only one object, both for measurements and keys, less object creation in hotpots is always faster
-    static class Record {
-
-        // keep memory starting address for each segment
-        // since we use Unsafe, this is enough to align and fetch the data
-        long segment;
-        int start;
-        int length;
-        int hash;
+    /**
+     * Extract bytes from a long
+     */
+    private static long partial(long word, int length) {
+        final long mask = (~0L) << (length << 3);
+        return word & (~mask);
+    }
 
-        private int min = 1000; // calculations over int is faster than double, we convert to double in the end only once
-        private int max = -1000;
+    // Only one object, both for measurements and keys; less object creation in hot spots is always faster
+    private static final class Record {
+
+        private final long start; // memory address of the underlying data
+        private final int length;
+        private final long word1;
+        private final long word2;
+        private final long wordLast;
+        private final int hash;
+        private Record next; // linked list to resolve hash collisions
+
+        private int min; // calculations over int is faster than double, we convert to double in the end only once
+        private int max;
         private long sum;
-        private long count;
+        private int count;
 
-        public Record(long segment, int start, int length, int hash) {
-            this.segment = segment;
+        public Record(long start, int length, long word1, long word2, long wordLast, int hash, int temp) {
             this.start = start;
             this.length = length;
+            this.word1 = word1;
+            this.word2 = word2;
+            this.wordLast = wordLast;
             this.hash = hash;
+            this.min = temp;
+            this.max = temp;
+            this.sum = temp;
+            this.count = 1;
         }
 
         @Override
         public boolean equals(Object o) {
             final Record record = (Record) o;
-            return equals(record.segment, record.start, record.length, record.hash);
+            return equals(record.start, record.word1, record.word2, record.wordLast, record.length);
         }
 
-        /**
-         * Stateless equals, no Record object needed
-         */
-        public boolean equals(long segment, int start, int length, int hash) {
-            if (this.length != length || this.hash != hash)
-                return false;
+        private static boolean notEquals(long address1, long address2, int step) {
+            return UNSAFE.getLong(address1 + step) != UNSAFE.getLong(address2 + step);
+        }
 
-            int i = 0; // bytes mismatch check
-            while (i < this.length
-                    && UNSAFE.getByte(this.segment + this.start + i) == UNSAFE.getByte(segment + start + i)) {
-                i++;
+        private static boolean equalsComparingLongs(long start1, long start2, int length) {
+            // first shortcuts
+            if (length < 24)
+                return true;
+            if (length < 32)
+                return !notEquals(start1, start2, 16);
+
+            int step = 24; // starting from 3rd long
+            length -= step;
+            while (length >= 8) { // scan longs
+                if (notEquals(start1, start2, step)) {
+                    return false;
+                }
+                length -= 8;
+                step += 8; // 8 bytes
             }
-            return i == this.length;
+            return true;
         }
 
-        @Override
-        public int hashCode() {
-            return this.hash;
+        private boolean equals(long start, long word1, long word2, long last, int length) {
+            if (this.word1 != word1)
+                return false;
+            if (this.word2 != word2)
+                return false;
+
+            // equals check is done by comparing longs instead of byte by byte check, this is faster
+            return equalsComparingLongs(this.start, start, length) && this.wordLast == last;
         }
 
         @Override
         public String toString() {
             final byte[] bytes = new byte[this.length];
-            int i = 0;
-            while (i < this.length) {
-                bytes[i] = UNSAFE.getByte(this.segment + this.start + i++);
-            }
-
+            UNSAFE.copyMemory(null, this.start, bytes, Unsafe.ARRAY_BYTE_BASE_OFFSET, this.length);
             return new String(bytes, StandardCharsets.UTF_8);
         }
 
-        public Record collect(int temp) {
-            this.min = Math.min(this.min, temp);
-            this.max = Math.max(this.max, temp);
+        private void collect(int temp) {
+            if (temp < this.min)
+                this.min = temp;
+            if (temp > this.max)
+                this.max = temp;
             this.sum += temp;
             this.count++;
-            return this;
         }
 
-        public void merge(Record other) {
-            this.min = Math.min(this.min, other.min);
-            this.max = Math.max(this.max, other.max);
-            this.sum += other.sum;
-            this.count += other.count;
+        private void merge(Record that) {
+            if (that.min < this.min)
+                this.min = that.min;
+            if (that.max > this.max)
+                this.max = that.max;
+            this.sum += that.sum;
+            this.count += that.count;
         }
 
-        public String measurements() {
+        private String measurements() {
             // here is only executed once for each unique key, so StringBuilder creation doesn't harm
             final StringBuilder sb = new StringBuilder(14);
-            sb.append(this.min / 10.0);
-            sb.append("/");
-            sb.append(round((this.sum / 10.0) / this.count));
-            sb.append("/");
-            sb.append(this.max / 10.0);
+            sb.append(round(this.min)).append("/");
+            sb.append(round(1.0 * this.sum / this.count)).append("/");
+            sb.append(round(this.max));
             return sb.toString();
         }
     }
 
     // Inspired by @spullara - customized hashmap on purpose
-    // The main difference is we hold only one array instead of two
-    static class RecordMap {
+    // The main difference is we hold only one array instead of two, fewer objects is faster
+    private static final class RecordMap {
 
-        static final int SIZE = 1 << 15; // 32k - bigger bucket size less collisions
-        static final int BITMASK = SIZE - 1;
-        Record[] keys = new Record[SIZE];
+        // A bigger bucket array means fewer collisions, but you have to find a sweet spot, otherwise it becomes slower.
+        // Also works well enough for 10K stations
+        private static final int SIZE = 1 << 14; // 16kb - enough for 10K
+        private static final int BITMASK = SIZE - 1;
+        private final Record[] keys = new Record[SIZE];
 
-        static int hashBucket(int hash) {
+        // int collision;
+
+        private boolean hasNoRecord(int index) {
+            return this.keys[index] == null;
+        }
+
+        private Record getRecord(int index) {
+            return this.keys[index];
+        }
+
+        private static int hashBucket(int hash) {
             hash = hash ^ (hash >>> 16); // naive bit spreading but surprisingly decreases collision :)
             return hash & BITMASK; // fast modulo, to find bucket
         }
 
-        void putAndCollect(long segment, int start, int length, int hash, int temp) {
-            int bucket = hashBucket(hash);
-            Record existing = this.keys[bucket];
-            if (existing == null) {
-                this.keys[bucket] = new Record(segment, start, length, hash)
-                        .collect(temp);
+        private void putAndCollect(int hash, int temp, long start, int length, long word1, long word2, long wordLast) {
+            final int bucket = hashBucket(hash);
+            if (hasNoRecord(bucket)) {
+                this.keys[bucket] = new Record(start, length, word1, word2, wordLast, hash, temp);
                 return;
             }
 
-            if (!existing.equals(segment, start, length, hash)) {
-                // collision, linear probing to find a slot
-                while ((existing = this.keys[++bucket & BITMASK]) != null && !existing.equals(segment, start, length, hash)) {
-                    // can be stuck here if all the buckets are full :(
-                    // However, since the data set is max 10K (unique) this shouldn't happen
-                    // So, I'm happily leave here branchless :)
-                }
-                if (existing == null) {
-                    this.keys[bucket & BITMASK] = new Record(segment, start, length, hash)
-                            .collect(temp);
-                    return;
-                }
+            Record existing = getRecord(bucket);
+            if (existing.equals(start, word1, word2, wordLast, length)) {
                 existing.collect(temp);
+                return;
             }
-            else {
-                existing.collect(temp);
+
+            // collision++;
+            // find possible slot by scanning the slot linked list
+            while (existing.next != null) {
+                if (existing.next.equals(start, word1, word2, wordLast, length)) {
+                    existing.next.collect(temp);
+                    return;
+                }
+                existing = existing.next; // go on to next
+                // collision++;
             }
+            existing.next = new Record(start, length, word1, word2, wordLast, hash, temp);
         }
 
-        void putOrMerge(Record key) {
-            int bucket = hashBucket(key.hash);
-            Record existing = this.keys[bucket];
-            if (existing == null) {
+        private void putOrMerge(Record key) {
+            final int bucket = hashBucket(key.hash);
+            if (hasNoRecord(bucket)) {
+                key.next = null;
                 this.keys[bucket] = key;
                 return;
             }
 
-            if (!existing.equals(key)) {
-                // collision, linear probing to find a slot
-                while ((existing = this.keys[++bucket & BITMASK]) != null && !existing.equals(key)) {
-                    // can be stuck here if all the buckets are full :(
-                    // However, since the data set is max 10K (unique keys) this shouldn't happen
-                    // So, I'm happily leave here branchless :)
-                }
-                if (existing == null) {
-                    this.keys[bucket & BITMASK] = key;
-                    return;
-                }
+            Record existing = getRecord(bucket);
+            if (existing.equals(key)) {
                 existing.merge(key);
+                return;
             }
-            else {
-                existing.merge(key);
+
+            // collision++;
+            // find possible slot by scanning the slot linked list
+            while (existing.next != null) {
+                if (existing.next.equals(key)) {
+                    existing.next.merge(key);
+                    return;
+                }
+                existing = existing.next; // go on to next
+                // collision++;
             }
+            key.next = null;
+            existing.next = key;
         }
 
-        void forEach(Consumer<Record> consumer) {
+        private void forEach(Consumer<Record> consumer) {
             int pos = 0;
             Record key;
-            while (pos < this.keys.length) {
+            while (pos < SIZE) {
                 if ((key = this.keys[pos++]) == null) {
                     continue;
                 }
+                Record next = key.next;
                 consumer.accept(key);
+                while (next != null) { // also traverse the records in the collision list
+                    final Record tmp = next.next;
+                    consumer.accept(next);
+                    next = tmp;
+                }
             }
         }
 
-        void merge(RecordMap other) {
+        private void merge(RecordMap other) {
             other.forEach(this::putOrMerge);
         }
 
     }
 
     // One actor for one thread, no synchronization
-    static class RegionActor {
-
-        final FileChannel channel;
-        final long startPos;
-        final int size;
-        final RecordMap map = new RecordMap();
-        long segmentAddress;
-        int position;
-        Thread runner; // each actor has its own thread
-
-        public RegionActor(FileChannel channel, long startPos, int size) {
-            this.channel = channel;
+    private static final class RegionActor extends Thread {
+
+        private final long startPos; // start of region memory address
+        private final int size;
+
+        private final RecordMap map = new RecordMap();
+
+        public RegionActor(long startPos, int size) {
             this.startPos = startPos;
             this.size = size;
         }
 
-        void accumulate() {
-            this.runner = new Thread(() -> {
-                try {
-                    // get the segment memory address, this is the only thing we need for Unsafe
-                    this.segmentAddress = this.channel.map(FileChannel.MapMode.READ_ONLY, this.startPos, this.size, Arena.global()).address();
-                }
-                catch (IOException e) {
-                    // no-op - skip intentionally, no handling for the purpose of this challenge
-                }
-
-                int start;
-                int keyHash;
-                int length;
-                while (this.position < this.size) {
-                    byte b;
-                    start = this.position; // save line start position
-                    keyHash = UNSAFE.getByte(this.segmentAddress + this.position++); // first byte is guaranteed not to be ';'
-                    length = 1; // min key length
-                    while ((b = UNSAFE.getByte(this.segmentAddress + this.position++)) != ';') { // read until semicolon
-                        keyHash = calculateHash(keyHash, b); // calculate key hash ahead, eleminates one more loop later
-                        length++;
-                    }
+        private static long getWord(long address) {
+            return UNSAFE.getLong(address);
+        }
 
-                    final int temp = readTemperature();
-                    this.map.putAndCollect(this.segmentAddress, start, length, keyHash, temp);
+        // hasvalue & haszero
+        // adapted from https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
+        private static long hasSemicolon(long word) {
+            // semicolon pattern
+            final long hasVal = word ^ 0x3B3B3B3B3B3B3B3BL; // hasvalue
+            return ((hasVal - 0x0101010101010101L) & ~hasVal & 0x8080808080808080L); // haszero
+        }
 
-                    this.position++; // skip linebreak
-                }
-            });
-            this.runner.start();
+        private static int semicolonPos(long hasVal) {
+            return Long.numberOfTrailingZeros(hasVal) >>> 3;
         }
 
-        static int calculateHash(int hash, int b) {
-            return 31 * hash + b;
+        private static int decimalPos(long numberWord) {
+            return Long.numberOfTrailingZeros(~numberWord & 0x10101000);
         }
 
-        // 1. Inspired by @yemreinci - Reading temparature value without Double.parse
-        // 2. Inspired by @obourgain - Fetching first 4 bytes ahead, then masking
-        int readTemperature() {
-            int temp = 0;
-            // read 4 bytes ahead
-            final int first4 = UNSAFE.getInt(this.segmentAddress + this.position);
-            this.position += 3;
+        private static final int MAX_INNER_LOOP_SIZE = 11;
 
-            final byte b1 = (byte) first4; // first byte
-            final byte b2 = (byte) ((first4 >> 8) & 0xFF); // second byte
-            final byte b3 = (byte) ((first4 >> 16) & 0xFF); // third byte
-            if (b1 == '-') {
-                if (b3 == '.') {
-                    temp -= 10 * (b2 - '0') + (byte) ((first4 >> 24) & 0xFF) - '0'; // fourth byte
-                    this.position++;
-                }
-                else {
-                    this.position++; // skip dot
-                    temp -= 100 * (b2 - '0') + 10 * (b3 - '0') + UNSAFE.getByte(this.segmentAddress + this.position++) - '0'; // fifth byte
-                }
-            }
-            else {
-                if (b2 == '.') {
-                    temp = 10 * (b1 - '0') + b3 - '0';
+        @Override
+        public void run() {
+            long pointer = this.startPos;
+            final long size = pointer + this.size;
+            while (pointer < size) { // line start
+                long hash = 0; // reset hash
+                long s; // semicolon check word
+                final int pos; // semicolon position
+                long word1 = getWord(pointer);
+                if ((s = hasSemicolon(word1)) != 0) {
+                    pos = semicolonPos(s);
+                    // read temperature
+                    final long numberWord = getWord(pointer + pos + 1);
+                    final int decimalPos = decimalPos(numberWord);
+                    final int temp = convertIntoNumber(decimalPos, numberWord);
+
+                    word1 = partial(word1, pos); // last word
+                    this.map.putAndCollect(completeHash(hash, word1), temp, pointer, pos, word1, 0, 0);
+
+                    pointer += pos + (decimalPos >>> 3) + 4;
                 }
                 else {
-                    temp = 100 * (b1 - '0') + 10 * (b2 - '0') + (byte) ((first4 >> 24) & 0xFF) - '0'; // fourth byte
-                    this.position++;
+                    long word2 = getWord(pointer + 8);
+                    if ((s = hasSemicolon(word2)) != 0) {
+                        pos = semicolonPos(s);
+                        // read temperature
+                        final int length = pos + 8;
+                        final long numberWord = getWord(pointer + length + 1);
+                        final int decimalPos = decimalPos(numberWord);
+                        final int temp = convertIntoNumber(decimalPos, numberWord);
+
+                        word2 = partial(word2, pos); // last word
+                        this.map.putAndCollect(completeHash(hash, word1, word2), temp, pointer, length, word1, word2, 0);
+
+                        pointer += length + (decimalPos >>> 3) + 4; // seek to the line end
+                    }
+                    else {
+                        long word = 0;
+                        int length = 16;
+                        hash = appendHash(hash, word1, word2);
+                        // Let the compiler know the loop size ahead
+                        // Then it's automatically unrolled
+                        // Max key length is 13 longs, 2 we've read before, 11 left
+                        for (int i = 0; i < MAX_INNER_LOOP_SIZE; i++) {
+                            if ((s = hasSemicolon((word = getWord(pointer + length)))) != 0) {
+                                break;
+                            }
+                            hash = appendHash(hash, word);
+                            length += 8;
+                        }
+
+                        pos = semicolonPos(s);
+                        length += pos;
+                        // read temperature
+                        final long numberWord = getWord(pointer + length + 1);
+                        final int decimalPos = decimalPos(numberWord);
+                        final int temp = convertIntoNumber(decimalPos, numberWord);
+
+                        word = partial(word, pos); // last word
+                        this.map.putAndCollect(completeHash(hash, word), temp, pointer, length, word1, word2, word);
+
+                        pointer += length + (decimalPos >>> 3) + 4; // seek to the line end
+                    }
                 }
             }
+        }
+
+        // Hashes are calculated by a Mersenne Prime (1 << 7) -1
+        // This is faster than multiplication in some machines
+        private static long appendHash(long hash, long word) {
+            return (hash << 7) - hash + word;
+        }
+
+        private static long appendHash(long hash, long word1, long word2) {
+            hash = (hash << 7) - hash + word1;
+            return (hash << 7) - hash + word2;
+        }
+
+        private static int completeHash(long hash, long partial) {
+            hash = (hash << 7) - hash + partial;
+            return (int) (hash ^ (hash >>> 25));
+        }
+
+        private static int completeHash(long hash, long word1, long word2) {
+            hash = (hash << 7) - hash + word1;
+            hash = (hash << 7) - hash + word2;
+            return (int) hash ^ (int) (hash >>> 25);
+        }
 
-            return temp;
+        // Credits to @merrykitty. Magical solution to parse temperature values branchless!
+        // Taken as is, without modification; comments belong to @merrykitty
+        private static int convertIntoNumber(int decimalSepPos, long numberWord) {
+            final int shift = 28 - decimalSepPos;
+            // signed is -1 if negative, 0 otherwise
+            final long signed = (~numberWord << 59) >> 63;
+            final long designMask = ~(signed & 0xFF);
+            // Align the number to a specific position and transform the ascii code
+            // to actual digit value in each byte
+            final long digits = ((numberWord & designMask) << shift) & 0x0F000F0F00L;
+            // Now digits is in the form 0xUU00TTHH00 (UU: units digit, TT: tens digit, HH: hundreds digit)
+            // 0xUU00TTHH00 * (100 * 0x1000000 + 10 * 0x10000 + 1) =
+            // 0x000000UU00TTHH00 +
+            // 0x00UU00TTHH000000 * 10 +
+            // 0xUU00TTHH00000000 * 100
+            // Now TT * 100 has 2 trailing zeroes and HH * 100 + TT * 10 + UU < 0x400
+            // This results in our value lies in the bit 32 to 41 of this product
+            // That was close :)
+            final long absValue = ((digits * 0x640a0001) >>> 32) & 0x3FF;
+            final long value = (absValue ^ signed) - signed;
+            return (int) value;
         }
 
         /**
          * blocks until the map is fully collected
          */
-        RecordMap get() throws InterruptedException {
-            this.runner.join();
+        private RecordMap get() throws InterruptedException {
+            join();
             return this.map;
         }
     }
 
     private static double round(double value) {
-        return Math.round(value * 10.0) / 10.0;
+        return Math.round(value) / 10.0;
     }
 
     /**
      * Scans the given buffer to the left
      */
-    private static long findClosestLineEnd(long start, int size, FileChannel channel) throws IOException {
-        final long position = start + size;
-        final long left = Math.max(position - 101, 0);
-        final ByteBuffer buffer = ByteBuffer.allocate(101); // enough size to find at least one '\n'
-        if (channel.read(buffer.clear(), left) != -1) {
-            int bufferPos = buffer.position() - 1;
-            while (buffer.get(bufferPos) != '\n') {
-                bufferPos--;
-                size--;
-            }
+    private static long findClosestLineEnd(long start, int size) {
+        long position = start + size;
+        while (UNSAFE.getByte(--position) != '\n') {
+            // read until a linebreak
+            size--;
         }
         return size;
     }
 
-    public static void main(String[] args) throws IOException, InterruptedException {
+    private static boolean isWorkerProcess(String[] args) {
+        return Arrays.asList(args).contains("--worker");
+    }
+
+    private static void runAsWorker() throws Exception {
+        final ProcessHandle.Info info = ProcessHandle.current().info();
+        final List<String> commands = new ArrayList<>();
+        info.command().ifPresent(commands::add);
+        info.arguments().ifPresent(args -> commands.addAll(Arrays.asList(args)));
+        commands.add("--worker");
+
+        new ProcessBuilder()
+                .command(commands)
+                .start()
+                .getInputStream()
+                .transferTo(System.out);
+    }
+
+    public static void main(String[] args) throws Exception {
 
-        var concurrency = Runtime.getRuntime().availableProcessors();
+        // Based on @thomaswue's idea, to cut unmapping delay.
+        // Strangely, unmapping delay doesn't occur on macOS/M1 however in Linux/AMD it's substantial - ~200ms
+        if (!isWorkerProcess(args)) {
+            runAsWorker();
+            return;
+        }
+
+        var concurrency = 2 * Runtime.getRuntime().availableProcessors();
         final long fileSize = Files.size(FILE);
         long regionSize = fileSize / concurrency;
 
@@ -353,30 +478,36 @@ public static void main(String[] args) throws IOException, InterruptedException
 
         long startPos = 0;
         final FileChannel channel = (FileChannel) Files.newByteChannel(FILE, StandardOpenOption.READ);
+        // get the memory address, this is the only thing we need for Unsafe
+        final long memoryAddress = channel.map(FileChannel.MapMode.READ_ONLY, startPos, fileSize, Arena.global()).address();
+
         final RegionActor[] actors = new RegionActor[concurrency];
         for (int i = 0; i < concurrency; i++) {
             // calculate boundaries
             long maxSize = (startPos + regionSize > fileSize) ? fileSize - startPos : regionSize;
             // shift position to back until we find a linebreak
-            maxSize = findClosestLineEnd(startPos, (int) maxSize, channel);
+            maxSize = findClosestLineEnd(memoryAddress + startPos, (int) maxSize);
 
-            final RegionActor region = (actors[i] = new RegionActor(channel, startPos, (int) maxSize));
-            region.accumulate();
+            final RegionActor region = (actors[i] = new RegionActor(memoryAddress + startPos, (int) maxSize));
+            region.start(); // start processing
 
             startPos += maxSize;
         }
 
-        final RecordMap output = new RecordMap(); // output to merge all regions
+        final RecordMap output = new RecordMap(); // output to merge all records
         for (RegionActor actor : actors) {
             final RecordMap partial = actor.get(); // blocks until get the result
             output.merge(partial);
+            // System.out.println("collisions: " + partial.collision);
         }
 
         // sort and print the result
         final TreeMap<String, String> sorted = new TreeMap<>();
-        output.forEach(key -> sorted.put(key.toString(), key.measurements()));
+        output.forEach(key -> {
+            sorted.put(key.toString(), key.measurements());
+        });
         System.out.println(sorted);
-
+        System.out.close(); // closing the stream will trigger the main process to pick up the output early
     }
 
 }

From 1281e77be4baf9f49f068098bc6fce4071e40b91 Mon Sep 17 00:00:00 2001
From: giovannicuccu <giovanni.cuccu@gmail.com>
Date: Mon, 29 Jan 2024 21:12:05 +0100
Subject: [PATCH 195/268] new version by Giovanni Cuccu  (#640)

* Solution without unsafe

* Solution without unsafe

* Solution without unsafe, remove the usage of bytebuffer, passes the create_measurements3 test

* bug fix for 10k test, update also the CreateMeasurements3.java to use '\n' as newline instead of the os value (if it runs on windows it uses crlf and "breaks" the file format )

* new version that should perform way better than the previous one

* removed prepare script for giovannicuccu

* removed some comments

---------

Co-authored-by: Giovanni Cuccu <gcuccu@imolainformatica.it>
---
 calculate_average_giovannicuccu.sh            |   0
 prepare_giovannicuccu.sh                      |  20 ---
 .../CalculateAverage_giovannicuccu.java       | 115 ++++++++----------
 3 files changed, 53 insertions(+), 82 deletions(-)
 mode change 100755 => 100644 calculate_average_giovannicuccu.sh
 delete mode 100755 prepare_giovannicuccu.sh

diff --git a/calculate_average_giovannicuccu.sh b/calculate_average_giovannicuccu.sh
old mode 100755
new mode 100644
diff --git a/prepare_giovannicuccu.sh b/prepare_giovannicuccu.sh
deleted file mode 100755
index 4cda7b411..000000000
--- a/prepare_giovannicuccu.sh
+++ /dev/null
@@ -1,20 +0,0 @@
-#!/bin/bash
-#
-#  Copyright 2023 The original authors
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-#
-
-# Uncomment below to use sdk
-# source "$HOME/.sdkman/bin/sdkman-init.sh"
-# sdk use java 21.0.1-graal 1>&2
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_giovannicuccu.java b/src/main/java/dev/morling/onebrc/CalculateAverage_giovannicuccu.java
index 7123c2c14..cd9591f1a 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_giovannicuccu.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_giovannicuccu.java
@@ -185,15 +185,16 @@ private static class MeasurementListVectorized {
 
         private final MemorySegment dataSegment = MemorySegment.ofArray(keyData);
 
-        public void addWithByteVector(ByteVector chunk1, int len, int hash, int value, MemorySegment memorySegment, long offset) {
+        private final byte[] lineData = new byte[SIZE];
+
+        private final MemorySegment lineSegment = MemorySegment.ofArray(lineData);
+
+        public void add(int len, int hash, int value, MemorySegment memorySegment, long offset) {
+            MemorySegment.copy(memorySegment, offset, lineSegment, 0, len);
             int index = hash & (SIZE - 1);
-            int i = 0;
             while (measurements[index] != null) {
-                if (measurements[index].getLen() == len && measurements[index].getHash() == hash) {
-                    var nodeKey = ByteVector.fromArray(BYTE_SPECIES, keyData, index * KEY_SIZE);
-                    long eqMask = chunk1.compare(VectorOperators.EQ, nodeKey).toLong();
-                    long validMask = -1L >>> (64 - len);
-                    if ((eqMask & validMask) == validMask) {
+                if (measurements[index].getHash() == hash && measurements[index].getLen() == len) {
+                    if (Arrays.equals(keyData, index * KEY_SIZE, index * KEY_SIZE + len, lineData, 0, len)) {
                         measurements[index].add(value);
                         return;
                     }
@@ -204,15 +205,14 @@ public void addWithByteVector(ByteVector chunk1, int len, int hash, int value, M
             measurements[index] = new MeasurementAggregatorVectorized(keyData, index * KEY_SIZE, len, hash, value);
         }
 
-        public void add(int len, int hash, int value, MemorySegment memorySegment, long offset) {
+        public void addWithByteVector(ByteVector chunk1, int len, int hash, int value, MemorySegment memorySegment, long offset) {
             int index = hash & (SIZE - 1);
             while (measurements[index] != null) {
                 if (measurements[index].getLen() == len && measurements[index].getHash() == hash) {
-                    int i = 0;
-                    while (i < len && keyData[index * KEY_SIZE + i] == memorySegment.get(ValueLayout.JAVA_BYTE, offset + i)) {
-                        i++;
-                    }
-                    if (i == len) {
+                    var nodeKey = ByteVector.fromArray(BYTE_SPECIES, keyData, index * KEY_SIZE);
+                    long eqMask = chunk1.compare(VectorOperators.EQ, nodeKey).toLong();
+                    long validMask = -1L >>> (64 - len);
+                    if ((eqMask & validMask) == validMask) {
                         measurements[index].add(value);
                         return;
                     }
@@ -248,8 +248,6 @@ private static class MMapReaderMemorySegment {
         private final Path path;
         private final List<PartitionBoundary> boundaries;
         private final boolean serial;
-        private static final byte SEPARATOR = ';';
-        ByteVector separators = ByteVector.broadcast(BYTE_SPECIES, SEPARATOR);
         private static final ValueLayout.OfLong JAVA_LONG_LT = ValueLayout.JAVA_LONG_UNALIGNED.withOrder(ByteOrder.LITTLE_ENDIAN);
 
         public MMapReaderMemorySegment(Path path, PartitionCalculator partitionCalculator, boolean serial) {
@@ -311,6 +309,12 @@ private void merge(MeasurementListVectorized result, MeasurementListVectorized p
             }
         }
 
+        private final long ALL_ONE = -1L;
+        private static final long DELIMITER_MASK = 0x3B3B3B3B3B3B3B3BL;
+
+        private static final byte SEPARATOR = ';';
+        private final static ByteVector SEPARATORS = ByteVector.broadcast(BYTE_SPECIES, SEPARATOR);
+
         private MeasurementListVectorized computeListForPartition(FileChannel fileChannel, PartitionBoundary boundary) {
             try (var arena = Arena.ofConfined()) {
                 var memorySegment = fileChannel.map(FileChannel.MapMode.READ_ONLY, boundary.start(), boundary.end() - boundary.start(), arena);
@@ -318,58 +322,50 @@ private MeasurementListVectorized computeListForPartition(FileChannel fileChanne
                 long size = memorySegment.byteSize();
                 long offset = 0;
                 long safe = size - KEY_SIZE;
-                // ByteBuffer byteBuffer = memorySegment.asByteBuffer();
-                // byteBuffer.order(ByteOrder.LITTLE_ENDIAN);
-                ByteVector chunk1 = ByteVector.zero(BYTE_SPECIES);
-                ByteVector chunk2 = ByteVector.zero(BYTE_SPECIES);
                 while (offset < safe) {
                     int len = 0;
-                    chunk1 = ByteVector.fromMemorySegment(BYTE_SPECIES, memorySegment, offset, NATIVE_ORDER);
-                    int equals = chunk1.compare(VectorOperators.EQ, separators).firstTrue();
-                    len += equals;
-                    if (equals == BYTE_SPECIES_LANES) {
-                        while (memorySegment.get(ValueLayout.JAVA_BYTE, offset + len) != ';') {
-                            len++;
+                    var line = ByteVector.fromMemorySegment(BYTE_SPECIES, memorySegment, offset, NATIVE_ORDER);
+                    len = line.compare(VectorOperators.EQ, SEPARATORS).firstTrue();
+                    if (len == BYTE_SPECIES_LANES) {
+                        int position1 = -1;
+                        int incr = BYTE_SPECIES_LANES;
+                        while (position1 == -1) {
+                            long readBuffer = memorySegment.get(JAVA_LONG_LT, offset + incr);
+                            long comparisonResult1 = (readBuffer ^ DELIMITER_MASK);
+                            long highBitMask1 = (comparisonResult1 - 0x0101010101010101L) & (~comparisonResult1 & 0x8080808080808080L);
+
+                            boolean noContent1 = highBitMask1 == 0;
+                            position1 = noContent1 ? -1 : Long.numberOfTrailingZeros(highBitMask1) >> 3;
+                            len += noContent1 ? 8 : position1;
+                            incr += 8;
                         }
-                    }
+                        int hash = hash(memorySegment, offset, len);
+                        long prevOffset = offset;
+                        offset += len + 1;
 
-                    int hash = hash(memorySegment, offset, len);
-                    long prevOffset = offset;
-                    offset += len + 1;
-
-                    long numberWord = memorySegment.get(JAVA_LONG_LT, offset);
-                    int decimalSepPos = Long.numberOfTrailingZeros(~numberWord & 0x10101000);
-                    int value = convertIntoNumber(decimalSepPos, numberWord);
-                    offset += (decimalSepPos >>> 3) + 3;
-                    // System.out.println("Value=" + value);
-                    if (len < BYTE_SPECIES_LANES) {
-                        list.addWithByteVector(chunk1, len, hash, value, memorySegment, prevOffset);
+                        long numberWord = memorySegment.get(JAVA_LONG_LT, offset);
+                        int decimalSepPos = Long.numberOfTrailingZeros(~numberWord & 0x10101000);
+                        int value = convertIntoNumber(decimalSepPos, numberWord);
+                        offset += (decimalSepPos >>> 3) + 3;
+                        list.add(len, hash, value, memorySegment, prevOffset);
                     }
                     else {
-                        list.add(len, hash, value, memorySegment, prevOffset);
+                        int hash = hash(memorySegment, offset, len);
+                        long prevOffset = offset;
+                        offset += len + 1;
+
+                        long numberWord = memorySegment.get(JAVA_LONG_LT, offset);
+                        int decimalSepPos = Long.numberOfTrailingZeros(~numberWord & 0x10101000);
+                        int value = convertIntoNumber(decimalSepPos, numberWord);
+                        offset += (decimalSepPos >>> 3) + 3;
+                        list.addWithByteVector(line, len, hash, value, memorySegment, prevOffset);
                     }
                 }
 
                 while (offset < size) {
                     int len = 0;
-                    int equals = BYTE_SPECIES_LANES;
-                    if (offset + BYTE_SPECIES_LANES < size) {
-                        chunk1 = ByteVector.fromMemorySegment(BYTE_SPECIES, memorySegment, offset, NATIVE_ORDER);
-                        equals = chunk1.compare(VectorOperators.EQ, separators).firstTrue();
-                        len += equals;
-                        if (equals == BYTE_SPECIES_LANES) {
-                            while (memorySegment.get(ValueLayout.JAVA_BYTE, offset + len) != ';') {
-                                len++;
-                            }
-                        }
-                    }
-                    else {
-                        byte[] bytes = new byte[BYTE_SPECIES_LANES];
-                        MemorySegment.copy(memorySegment, offset + len, MemorySegment.ofArray(bytes), 0, (size - offset - len));
-                        // byteBuffer.get(offset + len, bytes, 0, (int) (size - offset - len));
-                        chunk1 = ByteVector.fromArray(BYTE_SPECIES, bytes, 0);
-                        equals = chunk1.compare(VectorOperators.EQ, separators).firstTrue();
-                        len += equals;
+                    while (memorySegment.get(ValueLayout.JAVA_BYTE, offset + len) != ';') {
+                        len++;
                     }
                     int hash = hash(memorySegment, offset, len);
                     long prevOffset = offset;
@@ -402,12 +398,7 @@ private MeasurementListVectorized computeListForPartition(FileChannel fileChanne
                         }
                         offset = currentPosition + 2;
                     }
-                    if (len < BYTE_SPECIES_LANES) {
-                        list.addWithByteVector(chunk1, len, hash, value, memorySegment, prevOffset);
-                    }
-                    else {
-                        list.add(len, hash, value, memorySegment, prevOffset);
-                    }
+                    list.add(len, hash, value, memorySegment, prevOffset);
                 }
                 return list;
             }

From 31a6740ef1a376ff086a337060aa5ed0468a7b26 Mon Sep 17 00:00:00 2001
From: Panagiotis Drakatos <PanagiotisDrakatos@users.noreply.github.com>
Date: Mon, 29 Jan 2024 22:16:40 +0200
Subject: [PATCH 196/268] New Fresh Solution to Optimize Execution time (#641)

* CalculateAverage_pdrakatos

* Rename to be valid with rules

* CalculateAverage_pdrakatos

* Rename to be valid with rules

* Changes on scripts execution

* Fixing bugs causing scripts not to be executed

* Changes on prepare to make it compatible

* Fixing passing all tests

* Increase direct memory allocation buffer

* Fixing memory problem that causes heap space exception

* Fresh solution to optimize performance of the execution
---
 .../CalculateAverage_PanagiotisDrakatos.java  | 85 ++++++++++++-------
 1 file changed, 56 insertions(+), 29 deletions(-)

diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_PanagiotisDrakatos.java b/src/main/java/dev/morling/onebrc/CalculateAverage_PanagiotisDrakatos.java
index ecf2b700d..9ab7a2264 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_PanagiotisDrakatos.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_PanagiotisDrakatos.java
@@ -15,8 +15,10 @@
  */
 package dev.morling.onebrc;
 
+import java.io.File;
 import java.io.FileInputStream;
 import java.io.IOException;
+import java.io.RandomAccessFile;
 import java.nio.ByteBuffer;
 import java.nio.MappedByteBuffer;
 import java.nio.channels.FileChannel;
@@ -26,18 +28,27 @@
 import java.util.stream.StreamSupport;
 
 public class CalculateAverage_PanagiotisDrakatos {
+
     private static final String FILE = "./measurements.txt";
+    private static final long SEGMENT_SIZE = 4 * 1024 * 1024;
+    private static final long COMMA_PATTERN = 0x3B3B3B3B3B3B3B3BL;
+    private static final long DOT_BITS = 0x10101000;
+    private static final long MAGIC_MULTIPLIER = (100 * 0x1000000 + 10 * 0x10000 + 1);
+
     private static TreeMap<String, MeasurementObject> sortedCities;
 
     public static void main(String[] args) throws IOException {
         SeekableByteRead(FILE);
         System.out.println(sortedCities);
+        boolean DEBUG = true;
     }
 
     private static void SeekableByteRead(String path) throws IOException {
-        FileInputStream fileInputStream = new FileInputStream(FILE);
+        FileInputStream fileInputStream = new FileInputStream(new File(FILE));
         FileChannel fileChannel = fileInputStream.getChannel();
-        Optional<Map<String, MeasurementObject>> optimistic = SplitSeekableByteChannel(fileChannel)
+        Optional<Map<String, MeasurementObject>> optimistic = getFileSegments(new File(FILE), fileChannel)
+                .stream()
+                .map(CalculateAverage_PanagiotisDrakatos::SplitSeekableByteChannel)
                 .parallel()
                 .map(CalculateAverage_PanagiotisDrakatos::MappingByteBufferToData)
                 .reduce(CalculateAverage_PanagiotisDrakatos::combineMaps);
@@ -46,37 +57,53 @@ private static void SeekableByteRead(String path) throws IOException {
 
     }
 
-    private static Stream<ByteBuffer> SplitSeekableByteChannel(FileChannel channel) throws IOException {
-        return StreamSupport.stream(Spliterators.spliteratorUnknownSize(new Iterator<ByteBuffer>() {
-            private static final long MAP_SIZE = 1024 * 1024 * 10L;
-
-            private long position = 0;
-            private long length = channel.size();
+    record FileSegment(long start, long end, FileChannel fileChannel) {
+    }
 
-            @Override
-            public boolean hasNext() {
-                while (position < length) {
-                    return true;
-                }
-                return false;
+    private static List<FileSegment> getFileSegments(final File file, final FileChannel fileChannel) throws IOException {
+        final int numberOfSegments = Runtime.getRuntime().availableProcessors();
+        final long fileSize = file.length();
+        final long segmentSize = fileSize / numberOfSegments;
+        final List<FileSegment> segments = new ArrayList<>();
+        if (segmentSize < 1000) {
+            segments.add(new FileSegment(0, fileSize, fileChannel));
+            return segments;
+        }
+        try (RandomAccessFile randomAccessFile = new RandomAccessFile(file, "r")) {
+            long segStart = 0;
+            long segEnd = segmentSize;
+            while (segStart < fileSize) {
+                segEnd = findSegment(randomAccessFile, segEnd, fileSize);
+                segments.add(new FileSegment(segStart, segEnd, fileChannel));
+                segStart = segEnd; // Just re-use the end and go from there.
+                segEnd = Math.min(fileSize, segEnd + segmentSize);
             }
+        }
+        return segments;
+    }
 
-            @Override
-            public ByteBuffer next() {
-                try {
-                    MappedByteBuffer buffer = channel.map(FileChannel.MapMode.READ_ONLY, position, Math.min(MAP_SIZE, length - position));
-                    int end = buffer.limit() - 1;
-                    while (buffer.get(end) != '\n') {
-                        end--;
-                    }
-                    position += end + 1;
-                    return buffer.slice(0, end);
-                }
-                catch (IOException e) {
-                    throw new RuntimeException(e);
-                }
+    private static long findSegment(RandomAccessFile raf, long location, final long fileSize) throws IOException {
+        raf.seek(location);
+        while (location < fileSize) {
+            location++;
+            if (raf.read() == '\n')
+                return location;
+        }
+        return location;
+    }
+
+    private static ByteBuffer SplitSeekableByteChannel(FileSegment segment) {
+        try {
+            MappedByteBuffer buffer = segment.fileChannel.map(FileChannel.MapMode.READ_ONLY, segment.start(), segment.end - segment.start());
+            int end = buffer.limit() - 1;
+            while (buffer.get(end) != '\n') {
+                end--;
             }
-        }, Spliterator.IMMUTABLE), false);
+            return buffer.slice(0, end);
+        }
+        catch (Exception ex) {
+            throw new RuntimeException(ex);
+        }
     }
 
     public static ByteBuffer concat(ByteBuffer[] buffers) {

From 8e407ca79dc9c2f51b096f95687306103258bf75 Mon Sep 17 00:00:00 2001
From: Van Phu DO <abeobk@gmail.com>
Date: Tue, 30 Jan 2024 05:21:04 +0900
Subject: [PATCH 197/268] apply loop unroll trick (#643)

* apply loop unroll trick

* less assign op, a bit faster
---
 prepare_abeobk.sh                             |   4 +-
 .../onebrc/CalculateAverage_abeobk.java       | 308 ++++++++++--------
 2 files changed, 181 insertions(+), 131 deletions(-)

diff --git a/prepare_abeobk.sh b/prepare_abeobk.sh
index 1b7374383..08a8afdcb 100755
--- a/prepare_abeobk.sh
+++ b/prepare_abeobk.sh
@@ -20,8 +20,6 @@ sdk use java 21.0.2-graal 1>&2
 
 # ./mvnw clean verify removes target/ and will re-trigger native image creation.
 if [ ! -f target/CalculateAverage_abeobk_image ]; then
-    NATIVE_IMAGE_OPTS="--gc=epsilon -O3 -dsa -march=native -R:MaxHeapSize=128m -H:-GenLoopSafepoints -H:-ParseRuntimeOptions --enable-preview --initialize-at-build-time=dev.morling.onebrc.CalculateAverage_abeobk"
+    NATIVE_IMAGE_OPTS="--gc=epsilon -O3 -dsa -march=native -H:InlineAllBonus=10 -H:-GenLoopSafepoints -H:-ParseRuntimeOptions --enable-preview --initialize-at-build-time=dev.morling.onebrc.CalculateAverage_abeobk"
     native-image $NATIVE_IMAGE_OPTS -cp target/average-1.0.0-SNAPSHOT.jar -o target/CalculateAverage_abeobk_image dev.morling.onebrc.CalculateAverage_abeobk
 fi
-
-
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java b/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java
index c08a9d86c..2340bca79 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java
@@ -98,21 +98,21 @@ final String key() {
             return new String(sbuf, 0, (int) keylen, StandardCharsets.UTF_8);
         }
 
-        Node(long a, long t, int kl, long h, long val) {
+        Node(long a, long t, int kl, long h) {
             addr = a;
             tail = t;
-            sum = min = max = val;
-            count = 1;
+            min = 999;
+            max = -999;
             keylen = kl;
             hash = h;
         }
 
-        Node(long a, long w0, long t, int kl, long h, long val) {
+        Node(long a, long w0, long t, int kl, long h) {
             addr = a;
             word0 = w0;
+            min = 999;
+            max = -999;
             tail = t;
-            sum = min = max = val;
-            count = 1;
             keylen = kl;
             hash = h;
         }
@@ -120,9 +120,8 @@ final String key() {
         final void add(long val) {
             sum += val;
             count++;
-            if (val >= max) {
+            if (val > max) {
                 max = val;
-                return;
             }
             if (val < min) {
                 min = val;
@@ -170,25 +169,141 @@ static final long getSemiPosCode(final long word) {
         return (xor_semi - 0x0101010101010101L) & (~xor_semi & 0x8080808080808080L);
     }
 
+    static final long getLFCode(final long word) {
+        long xor_semi = word ^ 0x0A0A0A0A0A0A0A0AL; // xor with \n\n\n\n\n\n\n\n
+        return (xor_semi - 0x0101010101010101L) & (~xor_semi & 0x8080808080808080L);
+    }
+
+    static final long nextLine(long addr) {
+        long word = UNSAFE.getLong(addr);
+        long lfpos_code = getLFCode(word);
+        while (lfpos_code == 0) {
+            addr += 8;
+            word = UNSAFE.getLong(addr);
+            lfpos_code = getLFCode(word);
+        }
+        return addr + (Long.numberOfTrailingZeros(lfpos_code) >>> 3) + 1;
+    }
+
     // speed/collision balance
     static final long xxh32(long hash) {
         long h = hash * 37;
         return (h ^ (h >>> 29));
     }
 
-    // great idea from merykitty (Quan Anh Mai)
-    static final long parseNum(long num_word, int dot_pos) {
-        int shift = 28 - dot_pos;
-        long signed = (~num_word << 59) >> 63;
-        long dsmask = ~(signed & 0xFF);
-        long digits = ((num_word & dsmask) << shift) & 0x0F000F0F00L;
-        long abs_val = ((digits * 0x640a0001) >>> 32) & 0x3FF;
-        return ((abs_val ^ signed) - signed);
+    static final class ChunkParser {
+        long addr;
+        long end;
+        Node[] map;
+
+        ChunkParser(Node[] m, long a, long e) {
+            map = m;
+            addr = a;
+            end = e;
+        }
+
+        final boolean ok() {
+            return addr < end;
+        }
+
+        final long word() {
+            return UNSAFE.getLong(addr);
+        }
+
+        final long val() {
+            long num_word = UNSAFE.getLong(addr);
+            int dot_pos = Long.numberOfTrailingZeros(~num_word & 0x10101000);
+            addr += (dot_pos >>> 3) + 3;
+            // great idea from merykitty (Quan Anh Mai)
+            int shift = 28 - dot_pos;
+            long signed = (~num_word << 59) >> 63;
+            long dsmask = ~(signed & 0xFF);
+            long digits = ((num_word & dsmask) << shift) & 0x0F000F0F00L;
+            long abs_val = ((digits * 0x640a0001) >>> 32) & 0x3FF;
+            return ((abs_val ^ signed) - signed);
+        }
+
+        // optimize for contest
+        // save as much slow memory access as possible
+        // about 50% key < 8chars, 25% key bettween 8-10 chars
+        // keylength histogram (%) = [0, 0, 0, 0, 4, 10, 21, 15, 13, 11, 6, 6, 4, 2...
+        final Node key(long word0, long semipos_code) {
+            long row_addr = addr;
+            // about 50% chance key < 8 chars
+            if (semipos_code != 0) {
+                int semi_pos = Long.numberOfTrailingZeros(semipos_code) >>> 3;
+                addr += semi_pos + 1;
+                long tail = word0 & HASH_MASKS[semi_pos];
+                long hash = xxh32(tail);
+                int bucket = (int) (hash & BUCKET_MASK);
+                while (true) {
+                    Node node = map[bucket];
+                    if (node == null) {
+                        return (map[bucket] = new Node(row_addr, tail, semi_pos, hash));
+                    }
+                    if (node.tail == tail) {
+                        return node;
+                    }
+                    bucket++;
+                }
+            }
+
+            addr += 8;
+            long word = UNSAFE.getLong(addr);
+            semipos_code = getSemiPosCode(word);
+            // 43% chance
+            if (semipos_code != 0) {
+                int semi_pos = Long.numberOfTrailingZeros(semipos_code) >>> 3;
+                addr += semi_pos + 1;
+                long tail = (word & HASH_MASKS[semi_pos]);
+                long hash = xxh32(word0 ^ tail);
+                int bucket = (int) (hash & BUCKET_MASK);
+                while (true) {
+                    Node node = map[bucket];
+                    if (node == null) {
+                        return (map[bucket] = new Node(row_addr, word0, tail, semi_pos + 8, hash));
+                    }
+                    if (node.word0 == word0 && node.tail == tail) {
+                        return node;
+                    }
+                    bucket++;
+                }
+            }
+
+            // why not going for more? tested, slower
+            long hash = word0;
+            while (semipos_code == 0) {
+                hash ^= word;
+                addr += 8;
+                word = UNSAFE.getLong(addr);
+                semipos_code = getSemiPosCode(word);
+            }
+
+            int semi_pos = Long.numberOfTrailingZeros(semipos_code) >>> 3;
+            addr += semi_pos;
+            long keylen = addr - row_addr;
+            addr++;
+            long tail = (word & HASH_MASKS[semi_pos]);
+            hash = xxh32(hash ^ tail);
+            int bucket = (int) (hash & BUCKET_MASK);
+
+            while (true) {
+                Node node = map[bucket];
+                if (node == null) {
+                    return (map[bucket] = new Node(row_addr, word0, tail, (int) keylen, hash));
+                }
+                if (node.contentEquals(row_addr, word0, tail, keylen)) {
+                    return node;
+                }
+                bucket++;
+            }
+        }
     }
 
     // Thread pool worker
     static final class Worker extends Thread {
         final int thread_id; // for debug use only
+        int cls = 0;
 
         Worker(int i) {
             thread_id = i;
@@ -198,9 +313,8 @@ static final class Worker extends Thread {
         @Override
         public void run() {
             var map = new Node[BUCKET_SIZE + MAX_STATIONS]; // extra space for collisions
-            int id;
-            int cls = 0;
 
+            int id;
             // process in small chunk to maintain disk locality (artsiomkorzun trick)
             while ((id = chunk_id.getAndIncrement()) < chunk_cnt) {
                 long addr = start_addr + id * CHUNK_SZ;
@@ -208,119 +322,57 @@ public void run() {
 
                 // find start of line
                 if (id > 0) {
-                    while (UNSAFE.getByte(addr++) != '\n')
-                        ;
+                    addr = nextLine(addr);
                 }
 
-                // parse loop
-                // optimize for contest
-                // save as much slow memory access as possible
-                // about 50% key < 8chars, 25% key bettween 8-10 chars
-                // keylength histogram (%) = [0, 0, 0, 0, 4, 10, 21, 15, 13, 11, 6, 6, 4, 2...
-                while (addr < end) {
-                    long row_addr = addr;
-
-                    long word0 = UNSAFE.getLong(addr);
-                    long semipos_code = getSemiPosCode(word0);
-
-                    // about 50% chance key < 8 chars
-                    if (semipos_code != 0) {
-                        int semi_pos = Long.numberOfTrailingZeros(semipos_code) >>> 3;
-                        addr += semi_pos + 1;
-                        long num_word = UNSAFE.getLong(addr);
-                        int dot_pos = Long.numberOfTrailingZeros(~num_word & 0x10101000);
-                        addr += (dot_pos >>> 3) + 3;
-
-                        long tail = word0 & HASH_MASKS[semi_pos];
-                        long hash = xxh32(tail);
-                        int bucket = (int) (hash & BUCKET_MASK);
-                        long val = parseNum(num_word, dot_pos);
-
-                        while (true) {
-                            var node = map[bucket];
-                            if (node == null) {
-                                map[bucket] = new Node(row_addr, tail, semi_pos, hash, val);
-                                break;
-                            }
-                            if (node.tail == tail) {
-                                node.add(val);
-                                break;
-                            }
-                            bucket++;
-                            if (SHOW_ANALYSIS)
-                                cls++;
-                        }
-                        continue;
-                    }
-
-                    addr += 8;
-                    long word = UNSAFE.getLong(addr);
-                    semipos_code = getSemiPosCode(word);
-                    // 43% chance
-                    if (semipos_code != 0) {
-                        int semi_pos = Long.numberOfTrailingZeros(semipos_code) >>> 3;
-                        addr += semi_pos + 1;
-                        long num_word = UNSAFE.getLong(addr);
-                        int dot_pos = Long.numberOfTrailingZeros(~num_word & 0x10101000);
-                        addr += (dot_pos >>> 3) + 3;
-
-                        long tail = (word & HASH_MASKS[semi_pos]);
-                        long hash = xxh32(word0 ^ tail);
-                        int bucket = (int) (hash & BUCKET_MASK);
-                        long val = parseNum(num_word, dot_pos);
-
-                        while (true) {
-                            var node = map[bucket];
-                            if (node == null) {
-                                map[bucket] = new Node(row_addr, word0, tail, semi_pos + 8, hash, val);
-                                break;
-                            }
-                            if (node.word0 == word0 && node.tail == tail) {
-                                node.add(val);
-                                break;
-                            }
-                            bucket++;
-                            if (SHOW_ANALYSIS)
-                                cls++;
-                        }
-                        continue;
-                    }
-
-                    // why not going for more? tested, slower
-                    long hash = word0;
-                    while (semipos_code == 0) {
-                        hash ^= word;
-                        addr += 8;
-                        word = UNSAFE.getLong(addr);
-                        semipos_code = getSemiPosCode(word);
-                    }
+                final int num_segs = 3;
+                long seglen = (end - addr) / num_segs;
+
+                long a0 = addr;
+                long a1 = nextLine(addr + 1 * seglen);
+                long a2 = nextLine(addr + 2 * seglen);
+                ChunkParser p0 = new ChunkParser(map, a0, a1);
+                ChunkParser p1 = new ChunkParser(map, a1, a2);
+                ChunkParser p2 = new ChunkParser(map, a2, end);
+
+                while (p0.ok() && p1.ok() && p2.ok()) {
+                    long w0 = p0.word();
+                    long w1 = p1.word();
+                    long w2 = p2.word();
+                    long sc0 = getSemiPosCode(w0);
+                    long sc1 = getSemiPosCode(w1);
+                    long sc2 = getSemiPosCode(w2);
+                    Node n0 = p0.key(w0, sc0);
+                    Node n1 = p1.key(w1, sc1);
+                    Node n2 = p2.key(w2, sc2);
+                    long v0 = p0.val();
+                    long v1 = p1.val();
+                    long v2 = p2.val();
+                    n0.add(v0);
+                    n1.add(v1);
+                    n2.add(v2);
+                }
 
-                    int semi_pos = Long.numberOfTrailingZeros(semipos_code) >>> 3;
-                    addr += semi_pos;
-                    long keylen = addr - row_addr;
-                    long num_word = UNSAFE.getLong(addr + 1);
-                    int dot_pos = Long.numberOfTrailingZeros(~num_word & 0x10101000);
-                    addr += (dot_pos >>> 3) + 4;
-
-                    long tail = (word & HASH_MASKS[semi_pos]);
-                    hash = xxh32(hash ^ tail);
-                    int bucket = (int) (hash & BUCKET_MASK);
-                    long val = parseNum(num_word, dot_pos);
-
-                    while (true) {
-                        var node = map[bucket];
-                        if (node == null) {
-                            map[bucket] = new Node(row_addr, word0, tail, (int) keylen, hash, val);
-                            break;
-                        }
-                        if (node.contentEquals(row_addr, word0, tail, keylen)) {
-                            node.add(val);
-                            break;
-                        }
-                        bucket++;
-                        if (SHOW_ANALYSIS)
-                            cls++;
-                    }
+                while (p0.ok()) {
+                    long w = p0.word();
+                    long sc = getSemiPosCode(w);
+                    Node n = p0.key(w, sc);
+                    long v = p0.val();
+                    n.add(v);
+                }
+                while (p1.ok()) {
+                    long w = p1.word();
+                    long sc = getSemiPosCode(w);
+                    Node n = p1.key(w, sc);
+                    long v = p1.val();
+                    n.add(v);
+                }
+                while (p2.ok()) {
+                    long w = p2.word();
+                    long sc = getSemiPosCode(w);
+                    Node n = p2.key(w, sc);
+                    long v = p2.val();
+                    n.add(v);
                 }
             }
 

From f4a0039a591fc7c02306af5fc7a8fbca8a292668 Mon Sep 17 00:00:00 2001
From: tivrfoa <lescoutinhovr@gmail.com>
Date: Mon, 29 Jan 2024 17:24:04 -0300
Subject: [PATCH 198/268] Try more chunks than threads, and of different sizes
 (#644)

/**
 * Solution based on thomaswue solution, commit:
 * commit d0a28599c293d3afe3291fc3cf169a7b25ae9ae6
 * Author: Thomas Wuerthinger
 * Date:   Sun Jan 21 20:13:48 2024 +0100
 *
 * The goal here was to try to improve the runtime of his 10k
 * solution of: 00:04.516
 *
 * With Thomas's latest changes, his time is probably much better
 * already, and maybe even 1st place for the 10k too.
 * See: https://github.com/gunnarmorling/1brc/pull/606
 *
 * But as I was already coding something, I'll submit just to
 * see if it will be faster than his *previous* 10k time of
 * 00:04.516
 *
 * Changes:
 *   It's a similar idea to my previous solution: if you split
 * the chunks evenly, some threads might finish much faster and
 * stay idle, so:
 *   1) Create more chunks than threads, so the ones that finish first
 * can do something;
 *   2) Decrease chunk sizes as we get closer to the end of the file.
 */
---
 prepare_tivrfoa.sh                            |   2 +-
 .../onebrc/CalculateAverage_tivrfoa.java      | 364 ++++++++++--------
 2 files changed, 197 insertions(+), 169 deletions(-)

diff --git a/prepare_tivrfoa.sh b/prepare_tivrfoa.sh
index 7cbf309e5..024d6f984 100755
--- a/prepare_tivrfoa.sh
+++ b/prepare_tivrfoa.sh
@@ -20,7 +20,7 @@ sdk use java 21.0.2-graal 1>&2
 
 # ./mvnw clean verify removes target/ and will re-trigger native image creation.
 if [ ! -f target/CalculateAverage_tivrfoa_image ]; then
-    NATIVE_IMAGE_OPTS="--gc=epsilon -O3 -march=native --enable-preview -H:InlineAllBonus=10 -H:-ParseRuntimeOptions --initialize-at-build-time=dev.morling.onebrc.CalculateAverage_tivrfoa\$Scanner"
+    NATIVE_IMAGE_OPTS="--gc=epsilon -O3 -H:-GenLoopSafepoints -march=native --enable-preview -H:InlineAllBonus=10 -H:-ParseRuntimeOptions --initialize-at-build-time=dev.morling.onebrc.CalculateAverage_tivrfoa\$Scanner"
     # Use -H:MethodFilter=CalculateAverage_tivrfoa.* -H:Dump=:2 -H:PrintGraph=Network for IdealGraphVisualizer graph dumping.
     native-image $NATIVE_IMAGE_OPTS -cp target/average-1.0.0-SNAPSHOT.jar -o target/CalculateAverage_tivrfoa_image dev.morling.onebrc.CalculateAverage_tivrfoa
 fi
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_tivrfoa.java b/src/main/java/dev/morling/onebrc/CalculateAverage_tivrfoa.java
index a1b48441f..54f13cbea 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_tivrfoa.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_tivrfoa.java
@@ -23,31 +23,35 @@
 import java.nio.file.Path;
 import java.nio.file.StandardOpenOption;
 import java.util.*;
-import java.util.concurrent.LinkedBlockingQueue;
+import java.util.concurrent.atomic.AtomicInteger;
 
 /**
  * Solution based on thomaswue solution, commit:
  * commit d0a28599c293d3afe3291fc3cf169a7b25ae9ae6
- * Author: Thomas Wuerthinger <thomas.wuerthinger@oracle.com>
+ * Author: Thomas Wuerthinger
  * Date:   Sun Jan 21 20:13:48 2024 +0100
  *
+ * The goal here was to try to improve the runtime of his 10k
+ * solution of: 00:04.516
+ * 
+ * With Thomas latest changes, his time is probably much better
+ * already, and maybe even 1st place for the 10k too.
+ * See: https://github.com/gunnarmorling/1brc/pull/606
+ * 
+ * But as I was already coding something, I'll submit just to
+ * see if it will be faster than his *previous* 10k time of
+ * 00:04.516
+ * 
  * Changes:
- *   1) Use LinkedBlockingQueue to store partial results, that
- *   will then be merged into the final map later.
- *   As different chunks finish at different times, this allows
- *   to process them as they finish, instead of joining the
- *   threads sequentially.
- *     This change seems more useful for the 10k dataset, as the
- *   runtime difference of each chunk is greater.
- *   2) Use only 4 threads if the file is >= 14GB.
- *   This showed much better results on my local test, but I only
- *   run with 200 million rows (because of limited RAM), and I have
- *   no idea how it will perform on the 1brc HW.
+ *   It's a similar idea of my previous solution, that if you split
+ * the chunks evenly, some threads might finish much faster and
+ * stay idle, so:
+ *   1) Create more chunks than threads, so the ones that finish first
+ * can do something;
+ *   2) Decrease chunk sizes as we get closer to the end of the file.
  */
 public class CalculateAverage_tivrfoa {
     private static final String FILE = "./measurements.txt";
-    private static LinkedBlockingQueue<List<Result>> partialResultQueue;
-    private static int C = 10_000;
     private static final int MIN_TEMP = -999;
     private static final int MAX_TEMP = 999;
 
@@ -95,8 +99,16 @@ public String calcName() {
         }
     }
 
+    private static final int NUM_CPUS = Runtime.getRuntime().availableProcessors();
+    private static final AtomicInteger chunkIdx = new AtomicInteger();
+    private static long[] chunks;
+    private static int numChunks;
+
     private static final class SolveChunk extends Thread {
         private long chunkStart, chunkEnd;
+        private Result[] results = new Result[10_000];
+        private Result[] buckets = new Result[1 << 17];
+        private int resIdx = 0;
 
         public SolveChunk(long chunkStart, long chunkEnd) {
             this.chunkStart = chunkStart;
@@ -105,12 +117,132 @@ public SolveChunk(long chunkStart, long chunkEnd) {
 
         @Override
         public void run() {
-            try {
-                partialResultQueue.put(parseLoop(chunkStart, chunkEnd));
+            parseLoop();
+            int chunk = chunkIdx.getAndIncrement();
+            if (chunk < numChunks) {
+                chunkStart = chunks[chunk];
+                chunkEnd = chunks[chunk + 1];
+                run();
             }
-            catch (Exception e) {
-                e.printStackTrace();
-                System.exit(1);
+        }
+
+        private void parseLoop() {
+            Scanner scanner = new Scanner(chunkStart, chunkEnd);
+            long word = scanner.getLong();
+            long pos = findDelimiter(word);
+            while (scanner.hasNext()) {
+                long nameAddress = scanner.pos();
+                long hash = 0;
+
+                // Search for ';', one long at a time.
+                if (pos != 0) {
+                    pos = Long.numberOfTrailingZeros(pos) >>> 3;
+                    scanner.add(pos);
+                    word = mask(word, pos);
+                    hash = word;
+
+                    int number = scanNumber(scanner);
+                    long nextWord = scanner.getLong();
+                    long nextPos = findDelimiter(nextWord);
+
+                    Result existingResult = buckets[hashToIndex(hash, buckets)];
+                    if (existingResult != null && existingResult.lastNameLong == word) {
+                        word = nextWord;
+                        pos = nextPos;
+                        record(existingResult, number);
+                        continue;
+                    }
+
+                    scanner.setPos(nameAddress + pos);
+                }
+                else {
+                    scanner.add(8);
+                    hash = word;
+                    long prevWord = word;
+                    word = scanner.getLong();
+                    pos = findDelimiter(word);
+                    if (pos != 0) {
+                        pos = Long.numberOfTrailingZeros(pos) >>> 3;
+                        scanner.add(pos);
+                        word = mask(word, pos);
+                        hash ^= word;
+
+                        Result existingResult = buckets[hashToIndex(hash, buckets)];
+                        if (existingResult != null && existingResult.lastNameLong == word && existingResult.secondLastNameLong == prevWord) {
+                            int number = scanNumber(scanner);
+                            word = scanner.getLong();
+                            pos = findDelimiter(word);
+                            record(existingResult, number);
+                            continue;
+                        }
+                    }
+                    else {
+                        scanner.add(8);
+                        hash ^= word;
+                        while (true) {
+                            word = scanner.getLong();
+                            pos = findDelimiter(word);
+                            if (pos != 0) {
+                                pos = Long.numberOfTrailingZeros(pos) >>> 3;
+                                scanner.add(pos);
+                                word = mask(word, pos);
+                                hash ^= word;
+                                break;
+                            }
+                            else {
+                                scanner.add(8);
+                                hash ^= word;
+                            }
+                        }
+                    }
+                }
+
+                // Save length of name for later.
+                int nameLength = (int) (scanner.pos() - nameAddress);
+                int number = scanNumber(scanner);
+
+                // Final calculation for index into hash table.
+                int tableIndex = hashToIndex(hash, buckets);
+                outer: while (true) {
+                    Result existingResult = buckets[tableIndex];
+                    if (existingResult == null) {
+                        existingResult = newEntry(buckets, nameAddress, tableIndex, nameLength, scanner);
+                        results[resIdx++] = existingResult;
+                    }
+                    // Check for collision.
+                    int i = 0;
+                    int namePos = 0;
+                    for (; i < nameLength + 1 - 8; i += 8) {
+                        if (namePos >= existingResult.name.length || existingResult.name[namePos++] != scanner.getLongAt(nameAddress + i)) {
+                            tableIndex = (tableIndex + 31) & (buckets.length - 1);
+                            continue outer;
+                        }
+                    }
+
+                    int remainingShift = (64 - (nameLength + 1 - i) << 3);
+                    if (((existingResult.lastNameLong ^ (scanner.getLongAt(nameAddress + i) << remainingShift)) == 0)) {
+                        record(existingResult, number);
+                        break;
+                    }
+                    else {
+                        // Collision error, try next.
+                        tableIndex = (tableIndex + 31) & (buckets.length - 1);
+                    }
+                }
+
+                word = scanner.getLong();
+                pos = findDelimiter(word);
+            }
+        }
+    }
+
+    private static void mergeIntoFinalMap(TreeMap<String, Result> map, Result[] newResults) {
+        for (var r : newResults) {
+            if (r == null)
+                return;
+            Result current = map.putIfAbsent(r.calcName(), r);
+            if (current != null) {
+                current.add(r);
             }
         }
     }
@@ -127,20 +259,23 @@ public static void main(String[] args) throws Exception {
             spawnWorker();
             return;
         }
-        final int cpus = Runtime.getRuntime().availableProcessors();
-        final long[] chunks = getSegments(cpus);
-        final int workers = chunks.length - 1;
-        partialResultQueue = new LinkedBlockingQueue<>(workers);
-        final SolveChunk[] threads = new SolveChunk[workers];
-        for (int i = 0; i < workers; i++) {
+
+        chunks = getSegments(NUM_CPUS);
+        numChunks = chunks.length - 1;
+        final SolveChunk[] threads = new SolveChunk[NUM_CPUS];
+        chunkIdx.set(NUM_CPUS);
+        for (int i = 0; i < NUM_CPUS; i++) {
             threads[i] = new SolveChunk(chunks[i], chunks[i + 1]);
             threads[i].start();
         }
-        final TreeMap<String, Result> ret = new TreeMap<>();
-        for (int i = 0; i < workers; ++i) {
-            accumulateResults(ret, partialResultQueue.take());
+
+        TreeMap<String, Result> map = new TreeMap<>();
+        for (int i = 0; i < NUM_CPUS; ++i) {
+            threads[i].join();
+            mergeIntoFinalMap(map, threads[i].results);
         }
-        System.out.println(ret);
+
+        System.out.println(map);
         System.out.close();
     }
 
@@ -159,129 +294,6 @@ private static void spawnWorker() throws IOException {
                 .transferTo(System.out);
     }
 
-    private static void accumulateResults(TreeMap<String, Result> result, List<Result> newResult) {
-        for (Result r : newResult) {
-            String name = r.calcName();
-            Result current = result.putIfAbsent(name, r);
-            if (current != null) {
-                current.add(r);
-            }
-        }
-    }
-
-    // Main parse loop.
-    private static ArrayList<Result> parseLoop(long chunkStart, long chunkEnd) {
-        ArrayList<Result> ret = new ArrayList<>(C);
-        Result[] results = new Result[1 << 17];
-        Scanner scanner = new Scanner(chunkStart, chunkEnd);
-        long word = scanner.getLong();
-        long pos = findDelimiter(word);
-        while (scanner.hasNext()) {
-            long nameAddress = scanner.pos();
-            long hash = 0;
-
-            // Search for ';', one long at a time.
-            if (pos != 0) {
-                pos = Long.numberOfTrailingZeros(pos) >>> 3;
-                scanner.add(pos);
-                word = mask(word, pos);
-                hash = word;
-
-                int number = scanNumber(scanner);
-                long nextWord = scanner.getLong();
-                long nextPos = findDelimiter(nextWord);
-
-                Result existingResult = results[hashToIndex(hash, results)];
-                if (existingResult != null && existingResult.lastNameLong == word) {
-                    word = nextWord;
-                    pos = nextPos;
-                    record(existingResult, number);
-                    continue;
-                }
-
-                scanner.setPos(nameAddress + pos);
-            }
-            else {
-                scanner.add(8);
-                hash = word;
-                long prevWord = word;
-                word = scanner.getLong();
-                pos = findDelimiter(word);
-                if (pos != 0) {
-                    pos = Long.numberOfTrailingZeros(pos) >>> 3;
-                    scanner.add(pos);
-                    word = mask(word, pos);
-                    hash ^= word;
-
-                    Result existingResult = results[hashToIndex(hash, results)];
-                    if (existingResult != null && existingResult.lastNameLong == word && existingResult.secondLastNameLong == prevWord) {
-                        int number = scanNumber(scanner);
-                        word = scanner.getLong();
-                        pos = findDelimiter(word);
-                        record(existingResult, number);
-                        continue;
-                    }
-                }
-                else {
-                    scanner.add(8);
-                    hash ^= word;
-                    while (true) {
-                        word = scanner.getLong();
-                        pos = findDelimiter(word);
-                        if (pos != 0) {
-                            pos = Long.numberOfTrailingZeros(pos) >>> 3;
-                            scanner.add(pos);
-                            word = mask(word, pos);
-                            hash ^= word;
-                            break;
-                        }
-                        else {
-                            scanner.add(8);
-                            hash ^= word;
-                        }
-                    }
-                }
-            }
-
-            // Save length of name for later.
-            int nameLength = (int) (scanner.pos() - nameAddress);
-            int number = scanNumber(scanner);
-
-            // Final calculation for index into hash table.
-            int tableIndex = hashToIndex(hash, results);
-            outer: while (true) {
-                Result existingResult = results[tableIndex];
-                if (existingResult == null) {
-                    existingResult = newEntry(results, nameAddress, tableIndex, nameLength, scanner);
-                    ret.add(existingResult);
-                }
-                // Check for collision.
-                int i = 0;
-                int namePos = 0;
-                for (; i < nameLength + 1 - 8; i += 8) {
-                    if (namePos >= existingResult.name.length || existingResult.name[namePos++] != scanner.getLongAt(nameAddress + i)) {
-                        tableIndex = (tableIndex + 31) & (results.length - 1);
-                        continue outer;
-                    }
-                }
-
-                int remainingShift = (64 - (nameLength + 1 - i) << 3);
-                if (((existingResult.lastNameLong ^ (scanner.getLongAt(nameAddress + i) << remainingShift)) == 0)) {
-                    record(existingResult, number);
-                    break;
-                }
-                else {
-                    // Collision error, try next.
-                    tableIndex = (tableIndex + 31) & (results.length - 1);
-                }
-            }
-
-            word = scanner.getLong();
-            pos = findDelimiter(word);
-        }
-        return ret;
-    }
-
     private static int scanNumber(Scanner scanPtr) {
         scanPtr.add(1);
         long numberWord = scanPtr.getLong();
@@ -356,28 +368,44 @@ private static Result newEntry(Result[] results, long nameAddress, int hash, int
         return r;
     }
 
+    /**
+     *  - Split 70% of the file in even chunks for all cpus;
+     *  - Create smaller chunks for the remainder of the file.  
+     */
     private static long[] getSegments(int cpus) throws IOException {
         try (var fileChannel = FileChannel.open(Path.of(FILE), StandardOpenOption.READ)) {
-            long fileSize = fileChannel.size();
-            int numberOfChunks = cpus / 2;
-            if (fileSize < (int) 14e9) {
-                C = 500;
-                numberOfChunks = cpus;
-            }
-            long segmentSize = (fileSize + numberOfChunks - 1) / numberOfChunks;
-            long[] chunks = new long[numberOfChunks + 1];
-            long mappedAddress = fileChannel.map(FileChannel.MapMode.READ_ONLY, 0, fileSize, java.lang.foreign.Arena.global()).address();
+            final long fileSize = fileChannel.size();
+            final long part1 = (long) (fileSize * 0.7);
+            final long part2 = (long) (fileSize * 0.2);
+            final long part3 = fileSize - part1 - part2;
+            final long bigChunkSize = (part1 - 1) / cpus;
+            final long smallChunkSize1 = (part2 - 1) / (cpus * 3);
+            final long smallChunkSize2 = (part3 - 1) / (cpus * 3);
+            final int numChunks = cpus + cpus * 3 + cpus * 3;
+            final long[] sizes = new long[numChunks];
+            int l = 0, r = cpus;
+            Arrays.fill(sizes, l, r, bigChunkSize);
+            l = r;
+            r = l + cpus * 3;
+            Arrays.fill(sizes, l, r, smallChunkSize1);
+            l = r;
+            r = l + cpus * 3;
+            Arrays.fill(sizes, l, r, smallChunkSize2);
+            final long[] chunks = new long[sizes.length + 1];
+            final long mappedAddress = fileChannel.map(FileChannel.MapMode.READ_ONLY, 0, fileSize, java.lang.foreign.Arena.global()).address();
             chunks[0] = mappedAddress;
-            long endAddress = mappedAddress + fileSize;
-            Scanner s = new Scanner(mappedAddress, mappedAddress + fileSize);
-            for (int i = 1; i < numberOfChunks; ++i) {
-                long chunkAddress = mappedAddress + i * segmentSize;
+            final long endAddress = mappedAddress + fileSize;
+            final Scanner s = new Scanner(mappedAddress, mappedAddress + fileSize);
+            for (int i = 1, sizeIdx = 0; i < chunks.length - 1; ++i, sizeIdx = (sizeIdx + 1) % sizes.length) {
+                long chunkAddress = chunks[i - 1] + sizes[sizeIdx];
                 // Align to first row start.
                 while (chunkAddress < endAddress && (s.getLongAt(chunkAddress++) & 0xFF) != '\n')
                     ;
                 chunks[i] = Math.min(chunkAddress, endAddress);
+                // System.err.printf("Chunk size %d\n", chunks[i] - chunks[i - 1]);
             }
-            chunks[numberOfChunks] = endAddress;
+            chunks[chunks.length - 1] = endAddress;
+            // System.err.printf("Chunk size %d\n", chunks[chunks.length - 1] - chunks[chunks.length - 2]);
             return chunks;
         }
     }
@@ -428,4 +456,4 @@ void setPos(long l) {
             this.pos = l;
         }
     }
-}
\ No newline at end of file
+}

From 7d52a37600359de931395e218c909eaaf901f690 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Serkan=20=C3=96ZAL?= <sozal@catchpoint.com>
Date: Mon, 29 Jan 2024 23:27:06 +0300
Subject: [PATCH 199/268] serkan-ozal's 4th submission: (#645)

- split big regions into smaller shared tasks, so that workers which complete their own tasks can pick up the remaining ones instead of leaving their cores idle
- reduce number of executed instructions in the hot path
---
 calculate_average_serkan-ozal.sh              |   2 +-
 .../onebrc/CalculateAverage_serkan_ozal.java  | 108 +++++++++++-------
 2 files changed, 66 insertions(+), 44 deletions(-)

diff --git a/calculate_average_serkan-ozal.sh b/calculate_average_serkan-ozal.sh
index c075fc20b..cce366fca 100755
--- a/calculate_average_serkan-ozal.sh
+++ b/calculate_average_serkan-ozal.sh
@@ -26,7 +26,7 @@ if [[ ! "$(uname -s)" = "Darwin" ]]; then
   JAVA_OPTS="$JAVA_OPTS -XX:+UseTransparentHugePages"
 fi
 
-CONFIGS="USE_SHARED_ARENA=true USE_SHARED_REGION=true CLOSE_STDOUT_ON_RESULT=true"
+CONFIGS="USE_SHARED_ARENA=true USE_SHARED_REGION=true CLOSE_STDOUT_ON_RESULT=true REGION_COUNT=128"
 
 #echo "Process started at $(date +%s%N | cut -b1-13)"
 eval "exec 3< <({ $CONFIGS java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_serkan_ozal; })"
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_serkan_ozal.java b/src/main/java/dev/morling/onebrc/CalculateAverage_serkan_ozal.java
index 0ca1fe7ee..576dd0817 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_serkan_ozal.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_serkan_ozal.java
@@ -33,8 +33,10 @@
 import java.util.Arrays;
 import java.util.List;
 import java.util.Map;
+import java.util.Queue;
 import java.util.TreeMap;
 import java.util.concurrent.Callable;
+import java.util.concurrent.ConcurrentLinkedQueue;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Executors;
 import java.util.concurrent.Future;
@@ -125,7 +127,9 @@ public static void main(String[] args) throws Exception {
                 arena = Arena.ofShared();
                 region = fc.map(FileChannel.MapMode.READ_ONLY, 0, fileSize, arena);
             }
-            // Split whole file into regions and start region processors to handle those regions
+
+            List<Task> tasks = new ArrayList<>(regionCount);
+            // Split whole file into regions and create tasks for each region
             List<Future<Response>> futures = new ArrayList<>(regionCount);
             for (int i = 0; i < regionCount; i++) {
                 long endPos = Math.min(fileSize, startPos + regionSize);
@@ -134,11 +138,19 @@ public static void main(String[] args) throws Exception {
                 long closestLineEndPos = (i < regionCount - 1)
                         ? findClosestLineEnd(fc, endPos, lineBuffer)
                         : fileSize;
-                Request request = new Request(fc, arena, region, startPos, closestLineEndPos, result);
+                Task task = new Task(fc, region, startPos, closestLineEndPos);
+                tasks.add(task);
+                startPos = closestLineEndPos;
+            }
+
+            Queue<Task> sharedTasks = new ConcurrentLinkedQueue<>(tasks);
+
+            // Start region processors to process tasks for each region
+            for (int i = 0; i < concurrency; i++) {
+                Request request = new Request(arena, sharedTasks, result);
                 RegionProcessor regionProcessor = createRegionProcessor(request);
                 Future<Response> future = executor.submit(regionProcessor);
                 futures.add(future);
-                startPos = closestLineEndPos;
             }
 
             // Wait processors to complete
@@ -234,22 +246,14 @@ public Thread newThread(Runnable r) {
      */
     private static class RegionProcessor implements Callable<Response> {
 
-        private final FileChannel fc;
         private final Arena arena;
-        private final MemorySegment region;
-        private final long start;
-        private final long end;
-        private final long size;
+        private final Queue<Task> sharedTasks;
         private final Result result;
         private OpenMap map;
 
         private RegionProcessor(Request request) {
-            this.fc = request.fileChannel;
             this.arena = request.arena;
-            this.region = request.region;
-            this.start = request.start;
-            this.end = request.end;
-            this.size = end - start;
+            this.sharedTasks = request.sharedTasks;
             this.result = request.result;
         }
 
@@ -277,14 +281,17 @@ private void processRegion() throws Exception {
             // If no shared global memory arena is used, create and use its own local memory arena
             Arena a = arenaGiven ? arena : Arena.ofConfined();
             try {
-                boolean regionGiven = region != null;
-                MemorySegment r = regionGiven
-                        ? region
-                        : fc.map(FileChannel.MapMode.READ_ONLY, start, size, a);
-                long regionStart = regionGiven ? (r.address() + start) : r.address();
-                long regionEnd = regionStart + size;
-
-                doProcessRegion(r, r.address(), regionStart, regionEnd);
+                for (Task task = sharedTasks.poll(); task != null; task = sharedTasks.poll()) {
+                    boolean regionGiven = task.region != null;
+                    MemorySegment r = regionGiven
+                            ? task.region
+                            : task.fileChannel.map(FileChannel.MapMode.READ_ONLY, task.start, task.size, a);
+                    long regionStart = regionGiven ? (r.address() + task.start) : r.address();
+                    long regionEnd = regionStart + task.size;
+
+                    doProcessRegion(r, r.address(), regionStart, regionEnd);
+                }
+
                 if (VERBOSE) {
                     System.out.println("[Processor-" + Thread.currentThread().getName() + "] Region processed at " + System.currentTimeMillis());
                 }
@@ -358,21 +365,22 @@ private long doProcessLine(MemorySegment region, long regionAddress, long region
 
             // Vectorized search for key/value separator
             ByteVector keyVector = ByteVector.fromMemorySegment(BYTE_SPECIES, region, regionPtr - regionAddress, NATIVE_BYTE_ORDER);
-            int keyValueSepOffset = keyVector.compare(VectorOperators.EQ, KEY_VALUE_SEPARATOR).firstTrue();
+            int keyLength = keyVector.compare(VectorOperators.EQ, KEY_VALUE_SEPARATOR).firstTrue();
             // Check whether key/value separator is found in the first vector (city name is <= vector size)
-            if (keyValueSepOffset == vectorSize) {
+            if (keyLength != vectorSize) {
+                regionPtr += (keyLength + 1);
+            }
+            else {
                 regionPtr += vectorSize;
-                keyValueSepOffset = 0;
                 for (; U.getByte(regionPtr) != KEY_VALUE_SEPARATOR; regionPtr++)
                     ;
+                keyLength = (int) (regionPtr - keyStartPtr);
+                regionPtr++;
                 // I have tried vectorized search for key/value separator in the remaining part,
                 // but since majority (99%) of the city names <= 16 bytes
                 // and other a few longer city names (have length < 16 and <= 32) not close to 32 bytes,
                 // byte by byte search is better in terms of performance (according to my experiments) and simplicity.
             }
-            regionPtr += keyValueSepOffset;
-            int keyLength = (int) (regionPtr - keyStartPtr);
-            regionPtr++;
             ////////////////////////////////////////////////////////////////////////////////////////////////////////
 
             // Put key and get map offset to put value
@@ -411,21 +419,32 @@ private static long extractValue(long regionPtr, OpenMap map, long entryOffset)
      */
     private static final class Request {
 
-        private final FileChannel fileChannel;
         private final Arena arena;
+        private final Queue<Task> sharedTasks;
+        private final Result result;
+
+        private Request(Arena arena, Queue<Task> sharedTasks, Result result) {
+            this.arena = arena;
+            this.sharedTasks = sharedTasks;
+            this.result = result;
+        }
+
+    }
+
+    private static final class Task {
+
+        private final FileChannel fileChannel;
         private final MemorySegment region;
         private final long start;
         private final long end;
-        private final Result result;
+        private final long size;
 
-        private Request(FileChannel fileChannel, Arena arena, MemorySegment region,
-                        long start, long end, Result result) {
+        private Task(FileChannel fileChannel, MemorySegment region, long start, long end) {
             this.fileChannel = fileChannel;
-            this.arena = arena;
             this.region = region;
             this.start = start;
             this.end = end;
-            this.result = result;
+            this.size = end - start;
         }
 
     }
@@ -550,6 +569,8 @@ private static final class OpenMap {
         // 128 bytes - total
 
         private static final int ENTRY_SIZE = 128;
+        private static final int ENTRY_SIZE_SHIFT = 7;
+
         private static final int COUNT_OFFSET = 0;
         private static final int MIN_VALUE_OFFSET = 4;
         private static final int MAX_VALUE_OFFSET = 6;
@@ -563,12 +584,14 @@ private static final class OpenMap {
         private static final int KEY_ARRAY_OFFSET = KEY_OFFSET - Unsafe.ARRAY_BYTE_BASE_OFFSET;
 
         private final byte[] data;
-        // Max number of unique keys are 10K, so 1 << 14 (16384) is long enough to hold offsets for all of them
-        private final long[] entryOffsets = new long[1 << 14];
-        private int entryOffsetIdx = 0;
+        private final long[] entryOffsets;
+        private int entryOffsetIdx;
 
         private OpenMap() {
             this.data = new byte[MAP_SIZE];
+            // Max number of unique keys are 10K, so 1 << 14 (16384) is long enough to hold offsets for all of them
+            this.entryOffsets = new long[1 << 14];
+            this.entryOffsetIdx = 0;
         }
 
         // Credits: merykitty
@@ -591,12 +614,12 @@ private long putKey(ByteVector keyVector, long keyStartAddress, int keyLength) {
             // Calculate hash of key
             int keyHash = calculateKeyHash(keyStartAddress, keyLength);
             // and get the position of the entry in the linear map based on calculated hash
-            int idx = keyHash & ENTRY_HASH_MASK;
+            int idx = (keyHash & ENTRY_HASH_MASK) << ENTRY_SIZE_SHIFT;
 
             // Start searching from the calculated position
             // and continue until find an available slot in case of hash collision
             // TODO Prevent infinite loop if all the slots are in use for other keys
-            for (long entryOffset = Unsafe.ARRAY_BYTE_BASE_OFFSET + (idx * ENTRY_SIZE);; entryOffset = (entryOffset + ENTRY_SIZE) & ENTRY_MASK) {
+            for (long entryOffset = Unsafe.ARRAY_BYTE_BASE_OFFSET + idx;; entryOffset = (entryOffset + ENTRY_SIZE) & ENTRY_MASK) {
                 int keySize = U.getInt(data, entryOffset + KEY_SIZE_OFFSET);
                 // Check whether current index is empty (no another key is inserted yet)
                 if (keySize == 0) {
@@ -624,16 +647,15 @@ private boolean keysEqual(ByteVector keyVector, long keyStartAddress, int keyLen
                 // Use vectorized search for the comparison of keys.
                 // Since majority of the city names >= 8 bytes and <= 16 bytes,
                 // this way is more efficient (according to my experiments) than any other comparisons (byte by byte or 2 longs).
-                int keyCheckLength = Math.min(BYTE_SPECIES_SIZE, keyLength);
                 ByteVector entryKeyVector = ByteVector.fromArray(BYTE_SPECIES, data, keyStartArrayOffset);
                 long eqMask = keyVector.compare(VectorOperators.EQ, entryKeyVector).toLong();
                 int eqCount = Long.numberOfTrailingZeros(~eqMask);
-                if (eqCount < keyCheckLength) {
-                    return false;
-                }
-                if (keyCheckLength == keyLength) {
+                if (eqCount >= keyLength) {
                     return true;
                 }
+                else if (keyLength <= BYTE_SPECIES_SIZE) {
+                    return false;
+                }
                 keyCheckIdx = BYTE_SPECIES_SIZE;
             }
 

From d9d2f3f97f4229ce327cd46ad129ffe10e3b9ba6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pawe=C5=82=20Adamski?= <pawel.poczta@gmail.com>
Date: Mon, 29 Jan 2024 21:35:51 +0100
Subject: [PATCH 200/268] =?UTF-8?q?Pawe=C5=82=20Adamski=20=20-=201brc=20su?=
 =?UTF-8?q?bmission=20=20(#629)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Paweł Adamski - 1BRC challenge

* Paweł Adamski - 1BRC challenge

* Make files executable
---
 calculate_average_PawelAdamski.sh             |  19 ++
 prepare_PawelAdamski.sh                       |  20 ++
 .../onebrc/CalculateAverage_PawelAdamski.java | 209 ++++++++++++++++++
 3 files changed, 248 insertions(+)
 create mode 100755 calculate_average_PawelAdamski.sh
 create mode 100755 prepare_PawelAdamski.sh
 create mode 100644 src/main/java/dev/morling/onebrc/CalculateAverage_PawelAdamski.java

diff --git a/calculate_average_PawelAdamski.sh b/calculate_average_PawelAdamski.sh
new file mode 100755
index 000000000..e8d4bd4ce
--- /dev/null
+++ b/calculate_average_PawelAdamski.sh
@@ -0,0 +1,19 @@
+#!/bin/sh
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+JAVA_OPTS="-Xnoclassgc" # -Xnoclassgc: JVM option that disables garbage collection of classes
+java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_PawelAdamski
diff --git a/prepare_PawelAdamski.sh b/prepare_PawelAdamski.sh
new file mode 100755
index 000000000..4cda7b411
--- /dev/null
+++ b/prepare_PawelAdamski.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+# Uncomment below to use sdk
+# source "$HOME/.sdkman/bin/sdkman-init.sh"
+# sdk use java 21.0.1-graal 1>&2
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_PawelAdamski.java b/src/main/java/dev/morling/onebrc/CalculateAverage_PawelAdamski.java
new file mode 100644
index 000000000..45470558f
--- /dev/null
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_PawelAdamski.java
@@ -0,0 +1,209 @@
+/*
+ *  Copyright 2023 The original authors
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package dev.morling.onebrc;
+
+import java.io.IOException;
+import java.io.RandomAccessFile;
+import java.nio.channels.FileChannel;
+import java.util.*;
+import java.util.stream.Collectors;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static java.util.stream.Collectors.groupingByConcurrent;
+
+public class CalculateAverage_PawelAdamski {
+
+    private static final long READ_SIZE = 100_000_000;
+    private static final String FILE = "./measurements.txt";
+
+    /** Printable min/mean/max of one station; the aggregator it is built from holds values scaled by ten. */
+    private static record ResultRow(double min, double mean, double max) {
+        public ResultRow(MeasurementAggregator ma) {
+            this(ma.min / 10.0, ((Math.round(ma.sum * 100.0) / 100.0) / (double) ma.count) / 10.0, ma.max / 10.0);
+        }
+
+        @Override
+        public String toString() {
+            return round(min) + "/" + round(mean) + "/" + round(max);
+        }
+
+        private double round(double value) {
+            return Math.round(value * 10.0) / 10.0;
+        }
+    }
+
+    /** Hash map key wrapping the raw bytes of a station name; the hash is computed once in the constructor. */
+    private static class Station {
+        final byte[] bytes;
+        final int hash;
+
+        public Station(byte[] station) {
+            this.bytes = station;
+            this.hash = Arrays.hashCode(bytes);
+        }
+
+        @Override
+        public int hashCode() {
+            return hash;
+        }
+
+        @Override
+        public boolean equals(Object o) {
+            // Check the type first: a blind cast throws for null or foreign objects instead of answering false.
+            return o instanceof Station other && Arrays.equals(bytes, other.bytes);
+        }
+    }
+
+    /** Minimum, maximum, sum and count of a set of readings, all kept as values scaled by ten. */
+    private static class MeasurementAggregator {
+        private long min = Long.MAX_VALUE;
+        private long max = Long.MIN_VALUE;
+        private long sum;
+        private long count;
+
+        public MeasurementAggregator(long temp) {
+            min = temp;
+            max = temp;
+            sum = temp;
+            count = 1;
+        }
+
+        /** Creates the empty aggregate, which is the neutral element of {@link #merge}. */
+        public MeasurementAggregator() {
+        }
+
+        /** Returns a new aggregate covering the readings of both operands; neither operand is modified. */
+        public MeasurementAggregator merge(MeasurementAggregator measurement) {
+            MeasurementAggregator ma = new MeasurementAggregator();
+            ma.min = Math.min(min, measurement.min);
+            ma.max = Math.max(max, measurement.max);
+            ma.sum = sum + measurement.sum;
+            ma.count = count + measurement.count;
+            return ma;
+        }
+    }
+
+    public static void main(String[] args) throws IOException {
+        try (RandomAccessFile raf = new RandomAccessFile(FILE, "r")) {
+            List<FilePart> parts = splitFileIntoParts(raf);
+            Map<Station, MeasurementAggregator> rr = calculateTemperatureStats(parts, raf);
+            System.out.println(prepareResults(rr));
+        }
+    }
+
+    /** Turns the per-station aggregates into a map of printable rows sorted by station name. */
+    private static Map<String, ResultRow> prepareResults(Map<Station, MeasurementAggregator> rr) {
+        Map<String, ResultRow> measurements = new TreeMap<>();
+        rr.forEach((k, v) -> measurements.put(new String(k.bytes, UTF_8), new ResultRow(v)));
+        return measurements;
+    }
+
+    /** Parses all parts with a parallel stream and merges the per-part maps into one aggregate per station. */
+    private static Map<Station, MeasurementAggregator> calculateTemperatureStats(List<FilePart> parts, RandomAccessFile raf) {
+        return parts.parallelStream()
+                .map(filePart -> parse(filePart, raf))
+                .flatMap(m -> m.entrySet().stream())
+                .collect(groupingByConcurrent(
+                        Map.Entry::getKey,
+                        Collectors.reducing(
+                                new MeasurementAggregator(),
+                                Map.Entry::getValue,
+                                MeasurementAggregator::merge)));
+    }
+
+    /** Cuts the file into parts of about READ_SIZE bytes, each ending right after a line break or at the file end. */
+    private static ArrayList<FilePart> splitFileIntoParts(RandomAccessFile raf) throws IOException {
+        long fileLength = raf.length();
+        ArrayList<FilePart> parts = new ArrayList<>((int) (fileLength / READ_SIZE) + 1);
+        long pointer = 0;
+        while (pointer < fileLength) {
+            // ">=" (not ">"): a part that would end exactly at the file end needs no search for a line break.
+            long nextPointer = pointer + READ_SIZE >= fileLength ? fileLength : findNextLine(raf, pointer + READ_SIZE);
+            parts.add(new FilePart(pointer, nextPointer - pointer));
+            pointer = nextPointer;
+        }
+        return parts;
+    }
+
+    private static Map<Station, MeasurementAggregator> parse(FilePart filePart, RandomAccessFile raf) {
+        try {
+            return parseBytesIntoStationsMap(readBytesFromFile(filePart, raf));
+        }
+        catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    /** Aggregates every "name;value" line of the buffer per station, including a last line that lacks its '\n'. */
+    private static HashMap<Station, MeasurementAggregator> parseBytesIntoStationsMap(byte[] bytes) {
+        HashMap<Station, MeasurementAggregator> measurementAggregator = new HashMap<>(500);
+        int semicolonIndex = -1;
+        int newLineIndex = -1;
+        for (int i = 0; i <= bytes.length; i++) {
+            // Index bytes.length is a virtual line break, taken only if an unterminated "name;value" line precedes it.
+            boolean atEnd = i == bytes.length;
+            if (!atEnd && bytes[i] == ';') {
+                semicolonIndex = i;
+            }
+            else if (atEnd ? semicolonIndex > newLineIndex : bytes[i] == '\n') {
+                byte[] station = Arrays.copyOfRange(bytes, newLineIndex + 1, semicolonIndex);
+                MeasurementAggregator measurement = new MeasurementAggregator(parseDouble(bytes, semicolonIndex + 1, i));
+                measurementAggregator.compute(new Station(station), (k, prevV) -> prevV == null ? measurement : prevV.merge(measurement));
+                newLineIndex = i;
+            }
+        }
+        return measurementAggregator;
+    }
+
+    /** Maps the given part of the file through the file channel and copies its content into a new byte array. */
+    private static byte[] readBytesFromFile(FilePart filePart, RandomAccessFile raf) throws IOException {
+        var bb = raf.getChannel().map(FileChannel.MapMode.READ_ONLY, filePart.start(), filePart.len());
+        byte[] bytes = new byte[bb.remaining()];
+        bb.get(bytes);
+        return bytes;
+    }
+
+    /** Parses a decimal such as "-12.3" found at text[start, end) into the scaled integer -123 by skipping the '.'. */
+    private static long parseDouble(byte[] text, int start, int end) {
+        boolean negative = false;
+        int result = 0;
+        for (int i = start; i < end; i++) {
+            byte c = text[i];
+            if (c == '-') {
+                negative = true;
+            }
+            else if (c != '.') {
+                result = result * 10 + (c - '0');
+            }
+        }
+        return negative ? -result : result;
+    }
+
+    /** Returns the offset just past the first '\n' at or after currentPosition, or the file length if none follows. */
+    private static long findNextLine(RandomAccessFile raf, long currentPosition) throws IOException {
+        raf.seek(currentPosition);
+        // read() signals the end of the file with -1, whereas readByte() would throw an EOFException there.
+        int b = raf.read();
+        while (b != -1 && b != '\n') {
+            b = raf.read();
+        }
+        return raf.getFilePointer();
+    }
+
+    /** Byte range of the input file, given by its start offset and its length. */
+    record FilePart(long start, long len) {
+    }
+}

From 8fbd6325808f0f08f0caca625ae85fe5ca2b28a9 Mon Sep 17 00:00:00 2001
From: Gunnar Morling <gunnar.morling@googlemail.com>
Date: Mon, 29 Jan 2024 21:36:45 +0100
Subject: [PATCH 201/268] Leaderboard update

---
 README.md                          | 19 ++++++++++---------
 calculate_average_giovannicuccu.sh |  0
 2 files changed, 10 insertions(+), 9 deletions(-)
 mode change 100644 => 100755 calculate_average_giovannicuccu.sh

diff --git a/README.md b/README.md
index be2ce00a9..49e3ea072 100644
--- a/README.md
+++ b/README.md
@@ -42,20 +42,21 @@ These are the results from running all entries into the challenge on eight cores
 | # | Result (m:s.ms) | Implementation     | JDK | Submitter     | Notes     |
 |---|-----------------|--------------------|-----|---------------|-----------|
 | 1 | 00:01.893 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_thomaswue.java)| 21.0.2-graal | [Thomas Wuerthinger](https://github.com/thomaswue), [Quan Anh Mai](https://github.com/merykitty), [Alfonso² Peterssen](https://github.com/mukel) | GraalVM native binary, uses Unsafe |
-| 2 | 00:01.990 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java)| 21.0.2-graal | [Artsiom Korzun](https://github.com/artsiomkorzun) | GraalVM native binary, uses Unsafe |
-| 3* | 00:02.081 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jerrinot.java)| 21.0.2-graal | [Jaromir Hamala](https://github.com/jerrinot) | GraalVM native binary, uses Unsafe |
-| 3* | 00:02.091 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java)| 21.0.2-graal | [Van Phu DO](https://github.com/abeobk) | GraalVM native binary, uses Unsafe |
+| 2 | 00:01.926 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java)| 21.0.2-graal | [Artsiom Korzun](https://github.com/artsiomkorzun) | GraalVM native binary, uses Unsafe |
+| 3 | 00:01.970 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java)| 21.0.2-graal | [Van Phu DO](https://github.com/abeobk) | GraalVM native binary, uses Unsafe |
+|   | 00:02.081 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jerrinot.java)| 21.0.2-graal | [Jaromir Hamala](https://github.com/jerrinot) | GraalVM native binary, uses Unsafe |
 |   | 00:02.157 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_royvanrijn.java)| 21.0.2-graal | [Roy van Rijn](https://github.com/royvanrijn) | GraalVM native binary, uses Unsafe |
-|   | 00:02.440 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_serkan_ozal.java)| 21.0.1-open | [Serkan ÖZAL](https://github.com/serkan-ozal) | uses Unsafe |
+|   | 00:02.205 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_tivrfoa.java)| 21.0.2-graal | [tivrfoa](https://github.com/tivrfoa) | GraalVM native binary, uses Unsafe |
+|   | 00:02.319 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_yavuztas.java)| 21.0.2-graal | [Yavuz Tas](https://github.com/yavuztas) | GraalVM native binary, uses Unsafe |
+|   | 00:02.332 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_mtopolnik.java)| 21.0.2-graal | [Marko Topolnik](https://github.com/mtopolnik) | GraalVM native binary, uses Unsafe |
+|   | 00:02.374 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_serkan-ozal.java)| 21.0.1-open | [Serkan ÖZAL](https://github.com/serkan-ozal) |  |
 |   | 00:02.575 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_merykittyunsafe.java)| 21.0.1-open | [Quan Anh Mai](https://github.com/merykitty) | uses Unsafe |
 |   | 00:02.984 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_yourwass.java)| 21.0.1-open | [yourwass](https://github.com/yourwass) | uses Unsafe |
 |   | 00:03.013 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_linl33.java)| 22.ea.31-open | [Li Lin](https://github.com/linl33) | uses Unsafe |
 |   | 00:03.258 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_merykitty.java)| 21.0.1-open | [Quan Anh Mai](https://github.com/merykitty) |  |
 |   | 00:03.298 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_vemanaNonIdiomatic.java)| 21.0.1-graal | [Subrahmanyam (non-idiomatic)](https://github.com/vemana) | uses Unsafe |
-|   | 00:03.376 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_mtopolnik.java)| 21.0.1-graal | [Marko Topolnik](https://github.com/mtopolnik) | uses Unsafe |
 |   | 00:03.431 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_roman_r_m.java)| 21.0.1-graal | [Roman Musin](https://github.com/roman-r-m) | GraalVM native binary, uses Unsafe |
 |   | 00:03.518 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_JamalMulla.java)| 21.0.1-graal | [Jamal Mulla](https://github.com/JamalMulla) | GraalVM native binary, uses Unsafe |
-|   | 00:03.594 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_yavuztas.java)| 21.0.2-graal | [Yavuz Tas](https://github.com/yavuztas) | GraalVM native binary, uses Unsafe |
 |   | 00:03.698 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_hundredwatt.java)| 21.0.1-graal | [Jason Nochlin](https://github.com/hundredwatt) |  |
 |   | 00:03.718 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_zerninv.java)| 21.0.1-graal | [zerninv](https://github.com/zerninv) | uses Unsafe |
 |   | 00:03.824 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_gonix.java)| 21.0.1-open | [gonix](https://github.com/gonix) |  |
@@ -63,10 +64,9 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:03.902 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jparera.java)| 21.0.1-open | [Juan Parera](https://github.com/jparera) |  |
 |   | 00:03.966 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jincongho.java)| 21.0.1-open | [Jin Cong Ho](https://github.com/jincongho) | uses Unsafe |
 |   | 00:04.066 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_JesseVanRooy.java)| 21.0.1-open | [JesseVanRooy](https://github.com/JesseVanRooy) | uses Unsafe |
+|   | 00:04.209 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_giovannicuccu.java)| 21.0.1-open | [Giovanni Cuccu](https://github.com/giovannicuccu) |  |
 |   | 00:04.230 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_iziamos.java)| 21.0.1-open | [John Ziamos](https://github.com/iziamos) | uses Unsafe |
-|   | 00:04.255 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_tivrfoa.java)| 21.0.2-graal | [tivrfoa](https://github.com/tivrfoa) | GraalVM native binary, uses Unsafe |
 |   | 00:04.684 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_gigiblender.java)| 21.0.1-open | [Florin Blanaru](https://github.com/gigiblender) | uses Unsafe |
-|   | 00:04.719 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_giovannicuccu.java)| java | [Giovanni Cuccu](https://github.com/giovannicuccu) |  |
 |   | 00:04.741 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_cliffclick.java)| 21.0.1-open | [Cliff Click](https://github.com/cliffclick) | uses Unsafe |
 |   | 00:04.800 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_parkertimmins.java)| 21.0.1-open | [Parker Timmins](https://github.com/parkertimmins) |  |
 |   | 00:04.884 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_shipilev.java)| 21.0.1-open | [Aleksey Shipilëv](https://github.com/shipilev) |  |
@@ -136,6 +136,7 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:14.502 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_eriklumme.java)| 21.0.1-graal | [eriklumme](https://github.com/eriklumme) |  |
 |   | 00:14.772 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_kevinmcmurtrie.java)| 21.0.1-open | [Kevin McMurtrie](https://github.com/kevinmcmurtrie) |  |
 |   | 00:14.867 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_berry120.java)| 21.0.1-open | [Michael Berry](https://github.com/berry120) |  |
+|   | 00:15.006 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_PawelAdamski.java)| java | [Paweł Adamski](https://github.com/PawelAdamski) |  |
 |   | 00:15.662 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_semotpan.java)| 21.0.1-open | [Serghei Motpan](https://github.com/semotpan) |  |
 |   | 00:16.063 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_makohn.java)| 21.0.1-open | [Marek Kohn](https://github.com/makohn) |  |
 |   | 00:16.457 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_bytesfellow.java)| 21.0.1-open | [Aleksei](https://github.com/bytesfellow) |  |
@@ -158,7 +159,7 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:22.334 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_albertoventurini.java)| 21.0.1-open | [Alberto Venturini](https://github.com/albertoventurini) |  |
 |   | 00:22.457 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_rby.java)| 21.0.1-open | [Ramzi Ben Yahya](https://github.com/rby) |  |
 |   | 00:22.471 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_0xshivamagarwal.java)| 21.0.1-open | [Shivam Agarwal](https://github.com/0xshivamagarwal) |  |
-|   | 00:24.550 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_PanagiotisDrakatos.java)| 21.0.1-graal | [PanosDR](https://github.com/PanagiotisDrakatos) | GraalVM native binary |
+|   | 00:22.687 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_PanagiotisDrakatos.java)| 21.0.1-graal | [Panagiotis Drakatos](https://github.com/PanagiotisDrakatos) | GraalVM native binary |
 |   | 00:24.986 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_kumarsaurav123.java)| 21.0.1-open | [kumarsaurav123](https://github.com/kumarsaurav123) |  |
 |   | 00:26.500 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_felix19350.java)| 21.0.1-open | [Bruno Félix](https://github.com/felix19350) |  |
 |   | 00:28.381 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_bjhara.java)| 21.0.1-open | [Hampus](https://github.com/bjhara) |  |
diff --git a/calculate_average_giovannicuccu.sh b/calculate_average_giovannicuccu.sh
old mode 100644
new mode 100755

From c3510565244ede76f0ed59bf2a17ccc9a9912f96 Mon Sep 17 00:00:00 2001
From: Dr Ian Preston <157221403+ianopolousfast@users.noreply.github.com>
Date: Mon, 29 Jan 2024 21:06:21 +0000
Subject: [PATCH 202/268] fix ordering in README.md (#649)

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 49e3ea072..918ed9d11 100644
--- a/README.md
+++ b/README.md
@@ -74,9 +74,9 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:05.069 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_JaimePolidura.java)| 21.0.2-graal | [Jaime Polidura](https://github.com/JaimePolidura) | GraalVM native binary, uses Unsafe |
 |   | 00:05.077 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jonathanaotearoa.java)| 21.0.2-graal | [Jonathan Wright](https://github.com/jonathan-aotearoa) | GraalVM native binary, uses Unsafe |
 |   | 00:05.142 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_arjenw.java)| 21.0.1-open | [Arjen Wisse](https://github.com/arjenw) |  |
+|   | 00:05.180 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ianopolousfast.java)| 21.0.1-open | [Dr Ian Preston](https://github.com/ianopolousfast) |  |
 |   | 00:05.235 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_unbounded.java)| 21.0.1-open | [unbounded](https://github.com/unbounded) |  |
 |   | 00:05.336 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_plevart.java)| 21.0.1-tem | [Peter Levart](https://github.com/plevart) |  |
-|   | 00:05.180 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ianopolousfast.java)| 21.0.1-open | [Dr Ian Preston](https://github.com/ianopolousfast) |  |
 |   | 00:05.478 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_obourgain.java)| 21.0.1-open | [Olivier Bourgain](https://github.com/obourgain) | uses Unsafe |
 |   | 00:05.705 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_gamlerhart.java)| 21.0.1-open | [Roman Stoffel](https://github.com/gamlerhart) |  |
 |   | 00:05.709 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_armandino.java)| 21.0.2-graal | [Arman Sharif](https://github.com/armandino) | GraalVM native binary, uses Unsafe |

From 5b9703283a31df9815b2f379fb483f14368ffbe5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Serkan=20=C3=96ZAL?= <sozal@catchpoint.com>
Date: Tue, 30 Jan 2024 00:11:27 +0300
Subject: [PATCH 203/268] serkan-ozal's 5th submission: (#648)

- use region address directly over null base memory address to get rid of extra offset calculation
---
 .../morling/onebrc/CalculateAverage_serkan_ozal.java   | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_serkan_ozal.java b/src/main/java/dev/morling/onebrc/CalculateAverage_serkan_ozal.java
index 576dd0817..0ec485619 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_serkan_ozal.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_serkan_ozal.java
@@ -59,8 +59,9 @@ public class CalculateAverage_serkan_ozal {
             ? ByteVector.SPECIES_128
             : ByteVector.SPECIES_64;
     private static final int BYTE_SPECIES_SIZE = BYTE_SPECIES.vectorByteSize();
-
+    private static final MemorySegment ALL = MemorySegment.NULL.reinterpret(Long.MAX_VALUE);
     private static final ByteOrder NATIVE_BYTE_ORDER = ByteOrder.nativeOrder();
+
     private static final char NEW_LINE_SEPARATOR = '\n';
     private static final char KEY_VALUE_SEPARATOR = ';';
     private static final int MAX_LINE_LENGTH = 128;
@@ -341,7 +342,7 @@ private void doProcessRegion(MemorySegment region, long regionAddress, long regi
 
             // Read and process region - main
             for (regionPtr = regionStart; regionPtr < regionMainLimit;) {
-                regionPtr = doProcessLine(region, regionAddress, regionPtr, vectorSize);
+                regionPtr = doProcessLine(regionPtr, vectorSize);
             }
 
             // Read and process region - tail
@@ -358,13 +359,14 @@ private void doProcessRegion(MemorySegment region, long regionAddress, long regi
             }
         }
 
-        private long doProcessLine(MemorySegment region, long regionAddress, long regionPtr, int vectorSize) {
+        private long doProcessLine(long regionPtr, int vectorSize) {
             // Find key/value separator
             ////////////////////////////////////////////////////////////////////////////////////////////////////////
             long keyStartPtr = regionPtr;
 
             // Vectorized search for key/value separator
-            ByteVector keyVector = ByteVector.fromMemorySegment(BYTE_SPECIES, region, regionPtr - regionAddress, NATIVE_BYTE_ORDER);
+            ByteVector keyVector = ByteVector.fromMemorySegment(BYTE_SPECIES, ALL, regionPtr, NATIVE_BYTE_ORDER);
+
             int keyLength = keyVector.compare(VectorOperators.EQ, KEY_VALUE_SEPARATOR).firstTrue();
             // Check whether key/value separator is found in the first vector (city name is <= vector size)
             if (keyLength != vectorSize) {

From 036f9a01b18ef2d35dac6bbceec46b1ccbfe4f2b Mon Sep 17 00:00:00 2001
From: Thomas Wuerthinger <thomas.wuerthinger@oracle.com>
Date: Mon, 29 Jan 2024 22:19:23 +0100
Subject: [PATCH 204/268] Clean up, fine tuning, credit section for thomaswue
 (#646)

* Some clean up, fine tuning, removing non-supported options, added credit
section and additional comments.

* Put license header year back to 2023 to pass checks.

* Remove static linking (as it requires some more setup on the target
machine).
---
 prepare_thomaswue.sh                          |   2 +-
 .../onebrc/CalculateAverage_thomaswue.java    | 269 ++++++++----------
 2 files changed, 127 insertions(+), 144 deletions(-)

diff --git a/prepare_thomaswue.sh b/prepare_thomaswue.sh
index 10dc73280..da0a5917e 100755
--- a/prepare_thomaswue.sh
+++ b/prepare_thomaswue.sh
@@ -20,7 +20,7 @@ sdk use java 21.0.2-graal 1>&2
 
 # ./mvnw clean verify removes target/ and will re-trigger native image creation.
 if [ ! -f target/CalculateAverage_thomaswue_image ]; then
-    NATIVE_IMAGE_OPTS="--gc=epsilon -O3 -H:-GenLoopSafepoints -march=native --enable-preview -H:InlineAllBonus=10 -H:-ParseRuntimeOptions --initialize-at-build-time=dev.morling.onebrc.CalculateAverage_thomaswue\$Scanner"
+    NATIVE_IMAGE_OPTS="--gc=epsilon -O3 -H:TuneInlinerExploration=1 -march=native --enable-preview --initialize-at-build-time=dev.morling.onebrc.CalculateAverage_thomaswue\$Scanner"
     # Use -H:MethodFilter=CalculateAverage_thomaswue.* -H:Dump=:2 -H:PrintGraph=Network for IdealGraphVisualizer graph dumping.
     native-image $NATIVE_IMAGE_OPTS -cp target/average-1.0.0-SNAPSHOT.jar -o target/CalculateAverage_thomaswue_image dev.morling.onebrc.CalculateAverage_thomaswue
 fi
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_thomaswue.java b/src/main/java/dev/morling/onebrc/CalculateAverage_thomaswue.java
index c02a8813a..9b21f91aa 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_thomaswue.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_thomaswue.java
@@ -16,122 +16,68 @@
 package dev.morling.onebrc;
 
 import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.nio.ByteOrder;
 import java.nio.channels.FileChannel;
-import java.nio.charset.StandardCharsets;
-import java.nio.file.Path;
-import java.nio.file.StandardOpenOption;
 import java.util.*;
 import java.util.concurrent.atomic.AtomicLong;
-import java.util.stream.IntStream;
 
 /**
- * Simple solution that memory maps the input file, then splits it into one segment per available core and uses
- * sun.misc.Unsafe to directly access the mapped memory. Uses a long at a time when checking for collision.
- * <p>
- * Runs in 0.41s on my Intel i9-13900K
- * Perf stats:
- *     25,286,227,376      cpu_core/cycles/
- *     26,833,723,225      cpu_atom/cycles/
+ * The solution starts a child worker process for the actual work such that clean up of the memory mapping can occur
+ * while the main process already returns with the result. The worker then memory maps the input file, creates a worker
+ * thread per available core, and then processes segments of size {@link #SEGMENT_SIZE} at a time. The segments are
+ * split into 3 parts and cursors for each of those parts are processing the segment simultaneously in the same thread.
+ * Results are accumulated into {@link Result} objects and a tree map is used to sequentially accumulate the results in
+ * the end.
+ *
+ * Runs in 0.40s on an Intel i9-13900K.
+ *
+ * Credit:
+ *  Quan Anh Mai for branchless number parsing code
+ *  Alfonso² Peterssen for suggesting memory mapping with unsafe and the subprocess idea
+ *  Artsiom Korzun for showing the benefits of work stealing at 2MB segments instead of equal split between workers
  */
 public class CalculateAverage_thomaswue {
     private static final String FILE = "./measurements.txt";
     private static final int MIN_TEMP = -999;
     private static final int MAX_TEMP = 999;
-
-    // Holding the current result for a single city.
-    private static class Result {
-        long lastNameLong, secondLastNameLong;
-        long min, max;
-        long sum;
-        int count;
-        long[] name;
-        String nameAsString;
-
-        private Result() {
-            this.min = MAX_TEMP;
-            this.max = MIN_TEMP;
-        }
-
-        public String toString() {
-            return round(((double) min) / 10.0) + "/" + round((((double) sum) / 10.0) / count) + "/" + round(((double) max) / 10.0);
-        }
-
-        private static double round(double value) {
-            return Math.round(value * 10.0) / 10.0;
-        }
-
-        // Accumulate another result into this one.
-        private void add(Result other) {
-            if (other.min < min) {
-                min = other.min;
-            }
-            if (other.max > max) {
-                max = other.max;
-            }
-            sum += other.sum;
-            count += other.count;
-        }
-
-        public String calcName() {
-            if (nameAsString == null) {
-                ByteBuffer bb = ByteBuffer.allocate(name.length * Long.BYTES).order(ByteOrder.nativeOrder());
-                bb.asLongBuffer().put(name);
-                byte[] array = bb.array();
-                int i = 0;
-                while (array[i++] != ';')
-                    ;
-                nameAsString = new String(array, 0, i - 1, StandardCharsets.UTF_8);
-            }
-            return nameAsString;
-        }
-    }
+    private static final int MAX_NAME_LENGTH = 100;
+    private static final int MAX_CITIES = 10000;
+    private static final int SEGMENT_SIZE = 1 << 21;
+    private static final int HASH_TABLE_SIZE = 1 << 17;
 
     public static void main(String[] args) throws IOException, InterruptedException {
+        // Start worker subprocess if this process is not the worker.
         if (args.length == 0 || !("--worker".equals(args[0]))) {
             spawnWorker();
             return;
         }
-        // Calculate input segments.
-        int numberOfWorkers = Runtime.getRuntime().availableProcessors();
-        final AtomicLong cursor = new AtomicLong();
-        final long fileEnd;
-        final long fileStart;
 
-        try (var fileChannel = FileChannel.open(Path.of(FILE), StandardOpenOption.READ)) {
+        int numberOfWorkers = Runtime.getRuntime().availableProcessors();
+        try (var fileChannel = FileChannel.open(java.nio.file.Path.of(FILE), java.nio.file.StandardOpenOption.READ)) {
             long fileSize = fileChannel.size();
-            fileStart = fileChannel.map(FileChannel.MapMode.READ_ONLY, 0, fileSize, java.lang.foreign.Arena.global()).address();
-            cursor.set(fileStart);
-            fileEnd = fileStart + fileSize;
-        }
-
-        // Parallel processing of segments.
-        Thread[] threads = new Thread[numberOfWorkers];
-        List<Result>[] allResults = new List[numberOfWorkers];
-        for (int i = 0; i < threads.length; ++i) {
-            final int index = i;
-            threads[i] = new Thread(() -> {
-                Result[] resultArray = parseLoop(cursor, fileEnd, fileStart);
-                List<Result> results = new ArrayList<>(500);
-                for (Result r : resultArray) {
-                    if (r != null) {
-                        r.calcName();
-                        results.add(r);
-                    }
-                }
-                allResults[index] = results;
-            });
-            threads[i].start();
-        }
+            final long fileStart = fileChannel.map(FileChannel.MapMode.READ_ONLY, 0, fileSize, java.lang.foreign.Arena.global()).address();
+            final long fileEnd = fileStart + fileSize;
+            final AtomicLong cursor = new AtomicLong(fileStart);
+
+            // Parallel processing of segments.
+            Thread[] threads = new Thread[numberOfWorkers];
+            List<Result>[] allResults = new List[numberOfWorkers];
+            for (int i = 0; i < threads.length; ++i) {
+                final int index = i;
+                threads[i] = new Thread(() -> {
+                    List<Result> results = new ArrayList<>(MAX_CITIES);
+                    parseLoop(cursor, fileEnd, fileStart, results);
+                    allResults[index] = results;
+                });
+                threads[i].start();
+            }
+            for (Thread thread : threads) {
+                thread.join();
+            }
 
-        for (Thread thread : threads) {
-            thread.join();
+            // Final output.
+            System.out.println(accumulateResults(allResults));
+            System.out.close();
         }
-
-        // Final output.
-        System.out.println(accumulateResults(allResults));
-        System.out.close();
     }
 
     private static void spawnWorker() throws IOException {
@@ -144,31 +90,30 @@ private static void spawnWorker() throws IOException {
                 .start().getInputStream().transferTo(System.out);
     }
 
-    // Accumulate results sequentially for simplicity.
     private static TreeMap<String, Result> accumulateResults(List<Result>[] allResults) {
         TreeMap<String, Result> result = new TreeMap<>();
         for (List<Result> resultArr : allResults) {
             for (Result r : resultArr) {
-                String name = r.calcName();
-                Result current = result.putIfAbsent(name, r);
+                Result current = result.putIfAbsent(r.calcName(), r);
                 if (current != null) {
-                    current.add(r);
+                    current.accumulate(r);
                 }
             }
         }
         return result;
     }
 
-    private static Result findResult(long initialWord, long initialPos, Scanner scanner, Result[] results) {
-
+    private static Result findResult(long initialWord, long initialPos, Scanner scanner, Result[] results, List<Result> collectedResults) {
         Result existingResult;
         long word = initialWord;
         long pos = initialPos;
         long hash;
         long nameAddress = scanner.pos();
 
-        // Search for ';', one long at a time.
+        // Search for ';', one long at a time. There are two common cases that a specially treated:
+        // (b) the ';' is found in the first 16 bytes
         if (pos != 0) {
+            // Special case for when the ';' is found in the first 8 bytes.
             pos = Long.numberOfTrailingZeros(pos) >>> 3;
             scanner.add(pos);
             word = mask(word, pos);
@@ -180,11 +125,10 @@ private static Result findResult(long initialWord, long initialPos, Scanner scan
             if (existingResult != null && existingResult.lastNameLong == word) {
                 return existingResult;
             }
-            else {
-                scanner.setPos(nameAddress + pos);
-            }
+            scanner.setPos(nameAddress + pos);
         }
         else {
+            // Special case for when the ';' is found in bytes 9-16.
             scanner.add(8);
             hash = word;
             long prevWord = word;
@@ -201,11 +145,10 @@ private static Result findResult(long initialWord, long initialPos, Scanner scan
                 if (existingResult != null && existingResult.lastNameLong == word && existingResult.secondLastNameLong == prevWord) {
                     return existingResult;
                 }
-                else {
-                    scanner.setPos(nameAddress + pos + 8);
-                }
+                scanner.setPos(nameAddress + pos + 8);
             }
             else {
+                // Slow-path for when the ';' could not be found in the first 16 bytes.
                 scanner.add(8);
                 hash ^= word;
                 while (true) {
@@ -234,20 +177,20 @@ private static Result findResult(long initialWord, long initialPos, Scanner scan
         outer: while (true) {
             existingResult = results[tableIndex];
             if (existingResult == null) {
-                existingResult = newEntry(results, nameAddress, tableIndex, nameLength, scanner);
+                existingResult = newEntry(results, nameAddress, tableIndex, nameLength, scanner, collectedResults);
             }
             // Check for collision.
             int i = 0;
-            long[] name = existingResult.name;
             for (; i < nameLength + 1 - 8; i += 8) {
-                if (scanner.getLongAt(i, name) != scanner.getLongAt(nameAddress + i)) {
+                if (scanner.getLongAt(existingResult.nameAddress + i) != scanner.getLongAt(nameAddress + i)) {
+                    // Collision error, try next.
                     tableIndex = (tableIndex + 31) & (results.length - 1);
                     continue outer;
                 }
             }
 
             int remainingShift = (64 - (nameLength + 1 - i) << 3);
-            if (((existingResult.lastNameLong ^ (scanner.getLongAt(nameAddress + i) << remainingShift)) == 0)) {
+            if (existingResult.lastNameLong == (scanner.getLongAt(nameAddress + i) << remainingShift)) {
                 break;
             }
             else {
@@ -258,7 +201,7 @@ private static Result findResult(long initialWord, long initialPos, Scanner scan
         return existingResult;
     }
 
-    private static long nextNL(long prev) {
+    private static long nextNewLine(long prev) {
         while (true) {
             long currentWord = Scanner.UNSAFE.getLong(prev);
             long pos = findNewLine(currentWord);
@@ -273,11 +216,9 @@ private static long nextNL(long prev) {
         return prev;
     }
 
-    private static final int SEGMENT_SIZE = 1024 * 1024 * 2;
-
     // Main parse loop.
-    private static Result[] parseLoop(AtomicLong counter, long fileEnd, long fileStart) {
-        Result[] results = new Result[1 << 17];
+    private static Result[] parseLoop(AtomicLong counter, long fileEnd, long fileStart, List<Result> collectedResults) {
+        Result[] results = new Result[HASH_TABLE_SIZE];
 
         while (true) {
             long current = counter.addAndGet(SEGMENT_SIZE) - SEGMENT_SIZE;
@@ -286,18 +227,18 @@ private static Result[] parseLoop(AtomicLong counter, long fileEnd, long fileSta
                 return results;
             }
 
-            long segmentEnd = nextNL(Math.min(fileEnd - 1, current + SEGMENT_SIZE));
+            long segmentEnd = nextNewLine(Math.min(fileEnd - 1, current + SEGMENT_SIZE));
             long segmentStart;
             if (current == fileStart) {
                 segmentStart = current;
             }
             else {
-                segmentStart = nextNL(current) + 1;
+                segmentStart = nextNewLine(current) + 1;
             }
 
             long dist = (segmentEnd - segmentStart) / 3;
-            long midPoint1 = nextNL(segmentStart + dist);
-            long midPoint2 = nextNL(segmentStart + dist + dist);
+            long midPoint1 = nextNewLine(segmentStart + dist);
+            long midPoint2 = nextNewLine(segmentStart + dist + dist);
 
             Scanner scanner1 = new Scanner(segmentStart, midPoint1);
             Scanner scanner2 = new Scanner(midPoint1 + 1, midPoint2);
@@ -319,9 +260,9 @@ private static Result[] parseLoop(AtomicLong counter, long fileEnd, long fileSta
                 long pos1 = findDelimiter(word1);
                 long pos2 = findDelimiter(word2);
                 long pos3 = findDelimiter(word3);
-                Result existingResult1 = findResult(word1, pos1, scanner1, results);
-                Result existingResult2 = findResult(word2, pos2, scanner2, results);
-                Result existingResult3 = findResult(word3, pos3, scanner3, results);
+                Result existingResult1 = findResult(word1, pos1, scanner1, results, collectedResults);
+                Result existingResult2 = findResult(word2, pos2, scanner2, results, collectedResults);
+                Result existingResult3 = findResult(word3, pos3, scanner3, results, collectedResults);
                 long number1 = scanNumber(scanner1);
                 long number2 = scanNumber(scanner2);
                 long number3 = scanNumber(scanner3);
@@ -333,19 +274,19 @@ private static Result[] parseLoop(AtomicLong counter, long fileEnd, long fileSta
             while (scanner1.hasNext()) {
                 long word = scanner1.getLong();
                 long pos = findDelimiter(word);
-                record(findResult(word, pos, scanner1, results), scanNumber(scanner1));
+                record(findResult(word, pos, scanner1, results, collectedResults), scanNumber(scanner1));
             }
 
             while (scanner2.hasNext()) {
                 long word = scanner2.getLong();
                 long pos = findDelimiter(word);
-                record(findResult(word, pos, scanner2, results), scanNumber(scanner2));
+                record(findResult(word, pos, scanner2, results, collectedResults), scanNumber(scanner2));
             }
 
             while (scanner3.hasNext()) {
                 long word = scanner3.getLong();
                 long pos = findDelimiter(word);
-                record(findResult(word, pos, scanner3, results), scanNumber(scanner3));
+                record(findResult(word, pos, scanner3, results, collectedResults), scanNumber(scanner3));
             }
         }
     }
@@ -361,10 +302,10 @@ private static long scanNumber(Scanner scanPtr) {
 
     private static void record(Result existingResult, long number) {
         if (number < existingResult.min) {
-            existingResult.min = number;
+            existingResult.min = (short) number;
         }
         if (number > existingResult.max) {
-            existingResult.max = number;
+            existingResult.max = (short) number;
         }
         existingResult.sum += number;
         existingResult.count++;
@@ -406,31 +347,71 @@ private static long findNewLine(long word) {
         return tmp;
     }
 
-    private static Result newEntry(Result[] results, long nameAddress, int hash, int nameLength, Scanner scanner) {
+    private static Result newEntry(Result[] results, long nameAddress, int hash, int nameLength, Scanner scanner, List<Result> collectedResults) {
         Result r = new Result();
         results[hash] = r;
-        long[] name = new long[(nameLength / Long.BYTES) + 1];
-        int pos = 0;
         int i = 0;
         for (; i < nameLength + 1 - Long.BYTES; i += Long.BYTES) {
-            name[pos++] = scanner.getLongAt(nameAddress + i);
         }
-
-        if (pos > 0) {
-            r.secondLastNameLong = name[pos - 1];
+        if (nameLength + 1 > 8) {
+            r.secondLastNameLong = scanner.getLongAt(nameAddress + i - 8);
         }
-
         int remainingShift = (64 - (nameLength + 1 - i) << 3);
         long lastWord = (scanner.getLongAt(nameAddress + i) << remainingShift);
         r.lastNameLong = lastWord;
-        name[pos] = lastWord >> remainingShift;
-        r.name = name;
+        r.nameAddress = nameAddress;
+        collectedResults.add(r);
         return r;
     }
 
-    private static class Scanner {
+    private static class Result {
+        long lastNameLong, secondLastNameLong;
+        short min, max;
+        int count;
+        long sum;
+        long nameAddress;
 
+        private Result() {
+            this.min = MAX_TEMP;
+            this.max = MIN_TEMP;
+        }
+
+        public String toString() {
+            return round(((double) min) / 10.0) + "/" + round((((double) sum) / 10.0) / count) + "/" + round(((double) max) / 10.0);
+        }
+
+        private static double round(double value) {
+            return Math.round(value * 10.0) / 10.0;
+        }
+
+        private void accumulate(Result other) {
+            if (other.min < min) {
+                min = other.min;
+            }
+            if (other.max > max) {
+                max = other.max;
+            }
+            sum += other.sum;
+            count += other.count;
+        }
+
+        public String calcName() {
+            Scanner scanner = new Scanner(nameAddress, nameAddress + MAX_NAME_LENGTH + 1);
+            int nameLength = 0;
+            while (scanner.getByteAt(nameAddress + nameLength) != ';') {
+                nameLength++;
+            }
+            byte[] array = new byte[nameLength];
+            for (int i = 0; i < nameLength; ++i) {
+                array[i] = scanner.getByteAt(nameAddress + i);
+            }
+            return new String(array, java.nio.charset.StandardCharsets.UTF_8);
+        }
+    }
+
+    private static class Scanner {
         private static final sun.misc.Unsafe UNSAFE = initUnsafe();
+        private long pos, end;
 
         private static sun.misc.Unsafe initUnsafe() {
             try {
@@ -443,8 +424,6 @@ private static sun.misc.Unsafe initUnsafe() {
             }
         }
 
-        long pos, end;
-
         public Scanner(long start, long end) {
             this.pos = start;
             this.end = end;
@@ -470,6 +449,10 @@ long getLongAt(long pos) {
             return UNSAFE.getLong(pos);
         }
 
+        byte getByteAt(long pos) {
+            return UNSAFE.getByte(pos);
+        }
+
         long getLongAt(long pos, long[] array) {
             return UNSAFE.getLong(array, pos + sun.misc.Unsafe.ARRAY_LONG_BASE_OFFSET);
         }

From 2a44f8d390af7a4d4e848e0ae792b0e1e6ae7925 Mon Sep 17 00:00:00 2001
From: Jaime Polidura <73758994+JaimePolidura@users.noreply.github.com>
Date: Mon, 29 Jan 2024 22:22:22 +0100
Subject: [PATCH 205/268] Added improvements on string copying, string
 comparison & calculation of next index in case of collision in custom map
 (#650)

* added code

* Fixed pointers bugs

* removed my own benchmark

* added comment on how I handle hash collisions

* executed mvnw clean verify

* made scripts executable & fixed rounding issues

* Fixed way of dealing with hash collisions

* changed method name sameNameBytes to isSameNameBytes

* changes script from sh to bash

* fixed chunking bug

* Fixed bug in chunking when file size is too small

* added Runtime.getRuntime().availableProcessors

* added improvements on string copying, calculation of next index of Map in case of collision & improved string comparing
---
 .../CalculateAverage_JaimePolidura.java       | 25 +++++++++++--------
 1 file changed, 14 insertions(+), 11 deletions(-)

diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_JaimePolidura.java b/src/main/java/dev/morling/onebrc/CalculateAverage_JaimePolidura.java
index 3980a2c38..bc9070cf8 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_JaimePolidura.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_JaimePolidura.java
@@ -309,16 +309,14 @@ public SimpleMap(int size) {
         }
 
         public void put(long hashToPut, byte[] nameToPut, int nameLength, int valueToPut) {
-            int index = hashToIndex(hashToPut);
+            int index = toIndex(hashToPut);
 
             for (;;) {
                 Result actualEntry = entries[index];
 
                 if (actualEntry == null) {
                     byte[] nameToPutCopy = new byte[nameLength];
-                    for (int i = 0; i < nameLength; i++) {
-                        nameToPutCopy[i] = nameToPut[i];
-                    }
+                    UNSAFE.copyMemory(nameToPut, Unsafe.ARRAY_BYTE_BASE_OFFSET, nameToPutCopy, Unsafe.ARRAY_BYTE_BASE_OFFSET, nameLength);
 
                     entries[index] = new Result(hashToPut, nameToPutCopy, nameLength, valueToPut,
                             valueToPut, valueToPut, 1);
@@ -331,14 +329,12 @@ public void put(long hashToPut, byte[] nameToPut, int nameLength, int valueToPut
                     actualEntry.sum = actualEntry.sum + valueToPut;
                     return;
                 }
-                // If the name is not the same, we try to go to the next slot
-                if (++index >= this.size) {
-                    index = 0;
-                }
+
+                index = toIndex(index + 31);
             }
         }
 
-        private int hashToIndex(long hash) {
+        private int toIndex(long hash) {
             return (int) (((hash >> 32) ^ ((int) hash)) & (this.size - 1));
         }
     }
@@ -367,8 +363,15 @@ public boolean isSameName(byte[] otherNameBytes, int otherNameLength) {
         }
 
         private boolean isSameNameBytes(byte[] otherNameBytes) {
-            for (int i = 0; i < this.nameLength; i++) {
-                if (this.name[i] != otherNameBytes[i]) {
+            for (int i = 0; i < this.nameLength; i += 8) {
+                long thisNameBytesAsLong = UNSAFE.getLong(this.name, Unsafe.ARRAY_BYTE_BASE_OFFSET + i);
+                long otherNameBytesAsLong = UNSAFE.getLong(otherNameBytes, Unsafe.ARRAY_BYTE_BASE_OFFSET + i);
+
+                int isPositiveAsInt = (((8 - nameLength + i) >> 31) & 1) ^ 0x01;
+                int shift = ((8 - nameLength + i) * isPositiveAsInt) * 8;
+                otherNameBytesAsLong = (otherNameBytesAsLong << shift) >>> shift;
+
+                if (thisNameBytesAsLong != otherNameBytesAsLong) {
                     return false;
                 }
             }

From 7f0e51781190bcfcd4b8624352230a19737b01c8 Mon Sep 17 00:00:00 2001
From: Gunnar Morling <gunnar.morling@googlemail.com>
Date: Mon, 29 Jan 2024 22:28:03 +0100
Subject: [PATCH 206/268] Leaderboard update

---
 README.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 918ed9d11..699fd5683 100644
--- a/README.md
+++ b/README.md
@@ -41,15 +41,15 @@ These are the results from running all entries into the challenge on eight cores
 
 | # | Result (m:s.ms) | Implementation     | JDK | Submitter     | Notes     |
 |---|-----------------|--------------------|-----|---------------|-----------|
-| 1 | 00:01.893 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_thomaswue.java)| 21.0.2-graal | [Thomas Wuerthinger](https://github.com/thomaswue), [Quan Anh Mai](https://github.com/merykitty), [Alfonso² Peterssen](https://github.com/mukel) | GraalVM native binary, uses Unsafe |
+| 1 | 00:01.878 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_thomaswue.java)| 21.0.2-graal | [Thomas Wuerthinger](https://github.com/thomaswue), [Quan Anh Mai](https://github.com/merykitty), [Alfonso² Peterssen](https://github.com/mukel) | GraalVM native binary, uses Unsafe |
 | 2 | 00:01.926 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java)| 21.0.2-graal | [Artsiom Korzun](https://github.com/artsiomkorzun) | GraalVM native binary, uses Unsafe |
 | 3 | 00:01.970 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java)| 21.0.2-graal | [Van Phu DO](https://github.com/abeobk) | GraalVM native binary, uses Unsafe |
 |   | 00:02.081 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jerrinot.java)| 21.0.2-graal | [Jaromir Hamala](https://github.com/jerrinot) | GraalVM native binary, uses Unsafe |
 |   | 00:02.157 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_royvanrijn.java)| 21.0.2-graal | [Roy van Rijn](https://github.com/royvanrijn) | GraalVM native binary, uses Unsafe |
+|   | 00:02.188 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_serkan_ozal.java)| 21.0.1-open | [Serkan ÖZAL](https://github.com/serkan-ozal) | uses Unsafe |
 |   | 00:02.205 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_tivrfoa.java)| 21.0.2-graal | [tivrfoa](https://github.com/tivrfoa) | GraalVM native binary, uses Unsafe |
 |   | 00:02.319 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_yavuztas.java)| 21.0.2-graal | [Yavuz Tas](https://github.com/yavuztas) | GraalVM native binary, uses Unsafe |
 |   | 00:02.332 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_mtopolnik.java)| 21.0.2-graal | [Marko Topolnik](https://github.com/mtopolnik) | GraalVM native binary, uses Unsafe |
-|   | 00:02.374 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_serkan-ozal.java)| 21.0.1-open | [Serkan ÖZAL](https://github.com/serkan-ozal) |  |
 |   | 00:02.575 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_merykittyunsafe.java)| 21.0.1-open | [Quan Anh Mai](https://github.com/merykitty) | uses Unsafe |
 |   | 00:02.984 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_yourwass.java)| 21.0.1-open | [yourwass](https://github.com/yourwass) | uses Unsafe |
 |   | 00:03.013 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_linl33.java)| 22.ea.31-open | [Li Lin](https://github.com/linl33) | uses Unsafe |
@@ -64,6 +64,7 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:03.902 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jparera.java)| 21.0.1-open | [Juan Parera](https://github.com/jparera) |  |
 |   | 00:03.966 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jincongho.java)| 21.0.1-open | [Jin Cong Ho](https://github.com/jincongho) | uses Unsafe |
 |   | 00:04.066 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_JesseVanRooy.java)| 21.0.1-open | [JesseVanRooy](https://github.com/JesseVanRooy) | uses Unsafe |
+|   | 00:04.101 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_JaimePolidura.java)| 21.0.2-graal | [Jaime Polidura](https://github.com/JaimePolidura) | GraalVM native binary, uses Unsafe |
 |   | 00:04.209 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_giovannicuccu.java)| 21.0.1-open | [Giovanni Cuccu](https://github.com/giovannicuccu) |  |
 |   | 00:04.230 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_iziamos.java)| 21.0.1-open | [John Ziamos](https://github.com/iziamos) | uses Unsafe |
 |   | 00:04.684 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_gigiblender.java)| 21.0.1-open | [Florin Blanaru](https://github.com/gigiblender) | uses Unsafe |
@@ -71,7 +72,6 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:04.800 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_parkertimmins.java)| 21.0.1-open | [Parker Timmins](https://github.com/parkertimmins) |  |
 |   | 00:04.884 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_shipilev.java)| 21.0.1-open | [Aleksey Shipilëv](https://github.com/shipilev) |  |
 |   | 00:04.920 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_vemana.java)| 21.0.1-graal | [Subrahmanyam](https://github.com/vemana) |  |
-|   | 00:05.069 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_JaimePolidura.java)| 21.0.2-graal | [Jaime Polidura](https://github.com/JaimePolidura) | GraalVM native binary, uses Unsafe |
 |   | 00:05.077 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jonathanaotearoa.java)| 21.0.2-graal | [Jonathan Wright](https://github.com/jonathan-aotearoa) | GraalVM native binary, uses Unsafe |
 |   | 00:05.142 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_arjenw.java)| 21.0.1-open | [Arjen Wisse](https://github.com/arjenw) |  |
 |   | 00:05.180 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ianopolousfast.java)| 21.0.1-open | [Dr Ian Preston](https://github.com/ianopolousfast) |  |

From a5ce4ba77184d669e67d9633ee20c466ad167742 Mon Sep 17 00:00:00 2001
From: Thomas Wuerthinger <thomas.wuerthinger@oracle.com>
Date: Wed, 31 Jan 2024 09:34:15 +0100
Subject: [PATCH 207/268] Added comments to used flags, clean up code, final
 fine tuning. (#674)

---
 prepare_thomaswue.sh                          |  16 +-
 .../onebrc/CalculateAverage_thomaswue.java    | 234 ++++++++----------
 2 files changed, 114 insertions(+), 136 deletions(-)

diff --git a/prepare_thomaswue.sh b/prepare_thomaswue.sh
index da0a5917e..3e75233f9 100755
--- a/prepare_thomaswue.sh
+++ b/prepare_thomaswue.sh
@@ -20,7 +20,19 @@ sdk use java 21.0.2-graal 1>&2
 
 # ./mvnw clean verify removes target/ and will re-trigger native image creation.
 if [ ! -f target/CalculateAverage_thomaswue_image ]; then
-    NATIVE_IMAGE_OPTS="--gc=epsilon -O3 -H:TuneInlinerExploration=1 -march=native --enable-preview --initialize-at-build-time=dev.morling.onebrc.CalculateAverage_thomaswue\$Scanner"
-    # Use -H:MethodFilter=CalculateAverage_thomaswue.* -H:Dump=:2 -H:PrintGraph=Network for IdealGraphVisualizer graph dumping.
+
+    # Performance tuning flags, optimization level 3, maximum inlining exploration, and compile for the architecture where the native image is generated.
+    NATIVE_IMAGE_OPTS="-O3 -H:TuneInlinerExploration=1 -march=native"
+   
+    # Need to enable preview for accessing the raw address of the foreign memory access API.
+    # Initializing the Scanner to make sure the unsafe access object is known as a non-null compile time constant.
+    NATIVE_IMAGE_OPTS="$NATIVE_IMAGE_OPTS --enable-preview --initialize-at-build-time=dev.morling.onebrc.CalculateAverage_thomaswue\$Scanner"
+
+    # There is no need for garbage collection and therefore also no safepoints required.
+    NATIVE_IMAGE_OPTS="$NATIVE_IMAGE_OPTS --gc=epsilon -H:-GenLoopSafepoints"
+
+    # Uncomment the following line for outputting the compiler graph to the IdealGraphVisualizer
+    # NATIVE_IMAGE_OPTS="$NATIVE_IMAGE_OPTS -H:MethodFilter=CalculateAverage_thomaswue.* -H:Dump=:2 -H:PrintGraph=Network"
+    
     native-image $NATIVE_IMAGE_OPTS -cp target/average-1.0.0-SNAPSHOT.jar -o target/CalculateAverage_thomaswue_image dev.morling.onebrc.CalculateAverage_thomaswue
 fi
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_thomaswue.java b/src/main/java/dev/morling/onebrc/CalculateAverage_thomaswue.java
index 9b21f91aa..dc4df0cc9 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_thomaswue.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_thomaswue.java
@@ -27,9 +27,7 @@
  * split into 3 parts and cursors for each of those parts are processing the segment simultaneously in the same thread.
  * Results are accumulated into {@link Result} objects and a tree map is used to sequentially accumulate the results in
  * the end.
- *
- * Runs in 0.40s on an Intel i9-13900K.
- *
+ * Runs in 0.39s on an Intel i9-13900K.
  * Credit:
  *  Quan Anh Mai for branchless number parsing code
  *  Alfonso² Peterssen for suggesting memory mapping with unsafe and the subprocess idea
@@ -103,49 +101,111 @@ private static TreeMap<String, Result> accumulateResults(List<Result>[] allResul
         return result;
     }
 
-    private static Result findResult(long initialWord, long initialPos, Scanner scanner, Result[] results, List<Result> collectedResults) {
+    private static void parseLoop(AtomicLong counter, long fileEnd, long fileStart, List<Result> collectedResults) {
+        Result[] results = new Result[HASH_TABLE_SIZE];
+        while (true) {
+            long current = counter.addAndGet(SEGMENT_SIZE) - SEGMENT_SIZE;
+            if (current >= fileEnd) {
+                return;
+            }
+
+            long segmentEnd = nextNewLine(Math.min(fileEnd - 1, current + SEGMENT_SIZE));
+            long segmentStart;
+            if (current == fileStart) {
+                segmentStart = current;
+            }
+            else {
+                segmentStart = nextNewLine(current) + 1;
+            }
+
+            long dist = (segmentEnd - segmentStart) / 3;
+            long midPoint1 = nextNewLine(segmentStart + dist);
+            long midPoint2 = nextNewLine(segmentStart + dist + dist);
+
+            Scanner scanner1 = new Scanner(segmentStart, midPoint1);
+            Scanner scanner2 = new Scanner(midPoint1 + 1, midPoint2);
+            Scanner scanner3 = new Scanner(midPoint2 + 1, segmentEnd);
+            while (true) {
+                if (!scanner1.hasNext()) {
+                    break;
+                }
+                if (!scanner2.hasNext()) {
+                    break;
+                }
+                if (!scanner3.hasNext()) {
+                    break;
+                }
+                long word1 = scanner1.getLong();
+                long word2 = scanner2.getLong();
+                long word3 = scanner3.getLong();
+                long delimiterMask1 = findDelimiter(word1);
+                long delimiterMask2 = findDelimiter(word2);
+                long delimiterMask3 = findDelimiter(word3);
+                Result existingResult1 = findResult(word1, delimiterMask1, scanner1, results, collectedResults);
+                Result existingResult2 = findResult(word2, delimiterMask2, scanner2, results, collectedResults);
+                Result existingResult3 = findResult(word3, delimiterMask3, scanner3, results, collectedResults);
+                long number1 = scanNumber(scanner1);
+                long number2 = scanNumber(scanner2);
+                long number3 = scanNumber(scanner3);
+                record(existingResult1, number1);
+                record(existingResult2, number2);
+                record(existingResult3, number3);
+            }
+
+            while (scanner1.hasNext()) {
+                long word = scanner1.getLong();
+                long pos = findDelimiter(word);
+                record(findResult(word, pos, scanner1, results, collectedResults), scanNumber(scanner1));
+            }
+            while (scanner2.hasNext()) {
+                long word = scanner2.getLong();
+                long pos = findDelimiter(word);
+                record(findResult(word, pos, scanner2, results, collectedResults), scanNumber(scanner2));
+            }
+            while (scanner3.hasNext()) {
+                long word = scanner3.getLong();
+                long pos = findDelimiter(word);
+                record(findResult(word, pos, scanner3, results, collectedResults), scanNumber(scanner3));
+            }
+        }
+    }
+
+    private static Result findResult(long initialWord, long initialDelimiterMask, Scanner scanner, Result[] results, List<Result> collectedResults) {
         Result existingResult;
         long word = initialWord;
-        long pos = initialPos;
+        long delimiterMask = initialDelimiterMask;
         long hash;
         long nameAddress = scanner.pos();
 
         // Search for ';', one long at a time. There are two common cases that a specially treated:
         // (b) the ';' is found in the first 16 bytes
-        if (pos != 0) {
+        if (delimiterMask != 0) {
             // Special case for when the ';' is found in the first 8 bytes.
-            pos = Long.numberOfTrailingZeros(pos) >>> 3;
-            scanner.add(pos);
-            word = mask(word, pos);
+            int trailingZeros = Long.numberOfTrailingZeros(delimiterMask);
+            word = (word << (63 - trailingZeros));
+            scanner.add(trailingZeros >>> 3);
             hash = word;
-
-            int index = hashToIndex(hash, results);
-            existingResult = results[index];
-
+            existingResult = results[hashToIndex(hash, results)];
             if (existingResult != null && existingResult.lastNameLong == word) {
                 return existingResult;
             }
-            scanner.setPos(nameAddress + pos);
         }
         else {
             // Special case for when the ';' is found in bytes 9-16.
-            scanner.add(8);
             hash = word;
             long prevWord = word;
+            scanner.add(8);
             word = scanner.getLong();
-            pos = findDelimiter(word);
-            if (pos != 0) {
-                pos = Long.numberOfTrailingZeros(pos) >>> 3;
-                scanner.add(pos);
-                word = mask(word, pos);
+            delimiterMask = findDelimiter(word);
+            if (delimiterMask != 0) {
+                int trailingZeros = Long.numberOfTrailingZeros(delimiterMask);
+                word = (word << (63 - trailingZeros));
+                scanner.add(trailingZeros >>> 3);
                 hash ^= word;
-                int index = hashToIndex(hash, results);
-                existingResult = results[index];
-
+                existingResult = results[hashToIndex(hash, results)];
                 if (existingResult != null && existingResult.lastNameLong == word && existingResult.secondLastNameLong == prevWord) {
                     return existingResult;
                 }
-                scanner.setPos(nameAddress + pos + 8);
             }
             else {
                 // Slow-path for when the ';' could not be found in the first 16 bytes.
@@ -153,11 +213,11 @@ private static Result findResult(long initialWord, long initialPos, Scanner scan
                 hash ^= word;
                 while (true) {
                     word = scanner.getLong();
-                    pos = findDelimiter(word);
-                    if (pos != 0) {
-                        pos = Long.numberOfTrailingZeros(pos) >>> 3;
-                        scanner.add(pos);
-                        word = mask(word, pos);
+                    delimiterMask = findDelimiter(word);
+                    if (delimiterMask != 0) {
+                        int trailingZeros = Long.numberOfTrailingZeros(delimiterMask);
+                        word = (word << (63 - trailingZeros));
+                        scanner.add(trailingZeros >>> 3);
                         hash ^= word;
                         break;
                     }
@@ -204,7 +264,8 @@ private static Result findResult(long initialWord, long initialPos, Scanner scan
     private static long nextNewLine(long prev) {
         while (true) {
             long currentWord = Scanner.UNSAFE.getLong(prev);
-            long pos = findNewLine(currentWord);
+            long input = currentWord ^ 0x0A0A0A0A0A0A0A0AL;
+            long pos = (input - 0x0101010101010101L) & ~input & 0x8080808080808080L;
             if (pos != 0) {
                 prev += Long.numberOfTrailingZeros(pos) >>> 3;
                 break;
@@ -216,87 +277,11 @@ private static long nextNewLine(long prev) {
         return prev;
     }
 
-    // Main parse loop.
-    private static Result[] parseLoop(AtomicLong counter, long fileEnd, long fileStart, List<Result> collectedResults) {
-        Result[] results = new Result[HASH_TABLE_SIZE];
-
-        while (true) {
-            long current = counter.addAndGet(SEGMENT_SIZE) - SEGMENT_SIZE;
-
-            if (current >= fileEnd) {
-                return results;
-            }
-
-            long segmentEnd = nextNewLine(Math.min(fileEnd - 1, current + SEGMENT_SIZE));
-            long segmentStart;
-            if (current == fileStart) {
-                segmentStart = current;
-            }
-            else {
-                segmentStart = nextNewLine(current) + 1;
-            }
-
-            long dist = (segmentEnd - segmentStart) / 3;
-            long midPoint1 = nextNewLine(segmentStart + dist);
-            long midPoint2 = nextNewLine(segmentStart + dist + dist);
-
-            Scanner scanner1 = new Scanner(segmentStart, midPoint1);
-            Scanner scanner2 = new Scanner(midPoint1 + 1, midPoint2);
-            Scanner scanner3 = new Scanner(midPoint2 + 1, segmentEnd);
-            while (true) {
-                if (!scanner1.hasNext()) {
-                    break;
-                }
-                if (!scanner2.hasNext()) {
-                    break;
-                }
-                if (!scanner3.hasNext()) {
-                    break;
-                }
-
-                long word1 = scanner1.getLong();
-                long word2 = scanner2.getLong();
-                long word3 = scanner3.getLong();
-                long pos1 = findDelimiter(word1);
-                long pos2 = findDelimiter(word2);
-                long pos3 = findDelimiter(word3);
-                Result existingResult1 = findResult(word1, pos1, scanner1, results, collectedResults);
-                Result existingResult2 = findResult(word2, pos2, scanner2, results, collectedResults);
-                Result existingResult3 = findResult(word3, pos3, scanner3, results, collectedResults);
-                long number1 = scanNumber(scanner1);
-                long number2 = scanNumber(scanner2);
-                long number3 = scanNumber(scanner3);
-                record(existingResult1, number1);
-                record(existingResult2, number2);
-                record(existingResult3, number3);
-            }
-
-            while (scanner1.hasNext()) {
-                long word = scanner1.getLong();
-                long pos = findDelimiter(word);
-                record(findResult(word, pos, scanner1, results, collectedResults), scanNumber(scanner1));
-            }
-
-            while (scanner2.hasNext()) {
-                long word = scanner2.getLong();
-                long pos = findDelimiter(word);
-                record(findResult(word, pos, scanner2, results, collectedResults), scanNumber(scanner2));
-            }
-
-            while (scanner3.hasNext()) {
-                long word = scanner3.getLong();
-                long pos = findDelimiter(word);
-                record(findResult(word, pos, scanner3, results, collectedResults), scanNumber(scanner3));
-            }
-        }
-    }
-
     private static long scanNumber(Scanner scanPtr) {
-        scanPtr.add(1);
-        long numberWord = scanPtr.getLong();
-        int decimalSepPos = Long.numberOfTrailingZeros(~numberWord & 0x10101000);
+        long numberWord = scanPtr.getLongAt(scanPtr.pos() + 1);
+        int decimalSepPos = Long.numberOfTrailingZeros(~numberWord & 0x10101000L);
         long number = convertIntoNumber(decimalSepPos, numberWord);
-        scanPtr.add((decimalSepPos >>> 3) + 3);
+        scanPtr.add((decimalSepPos >>> 3) + 4);
         return number;
     }
 
@@ -316,10 +301,6 @@ private static int hashToIndex(long hash, Result[] results) {
         return (int) (hashAsInt & (results.length - 1));
     }
 
-    private static long mask(long word, long pos) {
-        return (word << ((7 - pos) << 3));
-    }
-
     // Special method to convert a number in the ascii number into an int without branches created by Quan Anh Mai.
     private static long convertIntoNumber(int decimalSepPos, long numberWord) {
         int shift = 28 - decimalSepPos;
@@ -337,14 +318,7 @@ private static long convertIntoNumber(int decimalSepPos, long numberWord) {
 
     private static long findDelimiter(long word) {
         long input = word ^ 0x3B3B3B3B3B3B3B3BL;
-        long tmp = (input - 0x0101010101010101L) & ~input & 0x8080808080808080L;
-        return tmp;
-    }
-
-    private static long findNewLine(long word) {
-        long input = word ^ 0x0A0A0A0A0A0A0A0AL;
-        long tmp = (input - 0x0101010101010101L) & ~input & 0x8080808080808080L;
-        return tmp;
+        return (input - 0x0101010101010101L) & ~input & 0x8080808080808080L;
     }
 
     private static Result newEntry(Result[] results, long nameAddress, int hash, int nameLength, Scanner scanner, List<Result> collectedResults) {
@@ -357,14 +331,13 @@ private static Result newEntry(Result[] results, long nameAddress, int hash, int
             r.secondLastNameLong = scanner.getLongAt(nameAddress + i - 8);
         }
         int remainingShift = (64 - (nameLength + 1 - i) << 3);
-        long lastWord = (scanner.getLongAt(nameAddress + i) << remainingShift);
-        r.lastNameLong = lastWord;
+        r.lastNameLong = (scanner.getLongAt(nameAddress + i) << remainingShift);
         r.nameAddress = nameAddress;
         collectedResults.add(r);
         return r;
     }
 
-    private static class Result {
+    private static final class Result {
         long lastNameLong, secondLastNameLong;
         short min, max;
         int count;
@@ -409,9 +382,10 @@ public String calcName() {
         }
     }
 
-    private static class Scanner {
+    private static final class Scanner {
         private static final sun.misc.Unsafe UNSAFE = initUnsafe();
-        private long pos, end;
+        private long pos;
+        private final long end;
 
         private static sun.misc.Unsafe initUnsafe() {
             try {
@@ -452,13 +426,5 @@ long getLongAt(long pos) {
         byte getByteAt(long pos) {
             return UNSAFE.getByte(pos);
         }
-
-        long getLongAt(long pos, long[] array) {
-            return UNSAFE.getLong(array, pos + sun.misc.Unsafe.ARRAY_LONG_BASE_OFFSET);
-        }
-
-        void setPos(long l) {
-            this.pos = l;
-        }
     }
 }
\ No newline at end of file

From 974ddbae606c412b4d35ada360879d96e93d00b1 Mon Sep 17 00:00:00 2001
From: Arman Sharif <armandino@gmail.com>
Date: Wed, 31 Jan 2024 00:39:08 -0800
Subject: [PATCH 208/268] armandino: misc improvements (#673)

---
 .../onebrc/CalculateAverage_armandino.java    | 329 ++++++++++--------
 1 file changed, 186 insertions(+), 143 deletions(-)

diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_armandino.java b/src/main/java/dev/morling/onebrc/CalculateAverage_armandino.java
index d825e77f9..0e9125337 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_armandino.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_armandino.java
@@ -24,15 +24,12 @@
 import java.nio.channels.FileChannel;
 import java.nio.file.Path;
 import java.nio.file.StandardOpenOption;
-import java.util.Arrays;
 import java.util.Collection;
-import java.util.Objects;
+import java.util.Map;
 import java.util.TreeMap;
-import java.util.stream.Stream;
 
 import static java.nio.channels.FileChannel.MapMode.READ_ONLY;
 import static java.nio.charset.StandardCharsets.UTF_8;
-import static java.util.stream.Collectors.toMap;
 
 public class CalculateAverage_armandino {
 
@@ -42,19 +39,59 @@ public class CalculateAverage_armandino {
     private static final int INITIAL_MAP_CAPACITY = 8192;
     private static final byte SEMICOLON = 59;
     private static final byte NL = 10;
-    private static final byte DOT = 46;
-    private static final byte MINUS = 45;
-    private static final byte ZERO_DIGIT = 48;
     private static final int PRIME = 1117;
+
+    private static final int KEY_OFFSET = 0, // key bytes, 100-byte slot
+            HASH_OFFSET = 100, // int
+            KEY_LENGTH_OFFSET = 104, // short
+            MIN_OFFSET = 106, // short
+            MAX_OFFSET = 108, // short
+            COUNT_OFFSET = 110, // int
+            SUM_OFFSET = 114; // long
+
+    private static final long ENTRY_SIZE = 100 // key: offset=0, ends at 100
+            + 4 // keyHash: offset=100, ends at 104
+            + 2 // keyLength: offset=104, ends at 106
+            + 2 // min: offset=106, ends at 108
+            + 2 // max: offset=108, ends at 110
+            + 4 // count: offset=110, ends at 114
+            + 8; // sum: offset=114, ends at 122 (the total entry size)
+
     private static final Unsafe UNSAFE = getUnsafe();
 
     public static void main(String[] args) throws Exception {
         var channel = FileChannel.open(FILE, StandardOpenOption.READ);
 
-        var results = Arrays.stream(split(channel)).parallel()
-                .map(chunk -> new ChunkProcessor().process(chunk.start, chunk.end))
-                .flatMap(SimpleMap::stream)
-                .collect(toMap(Stats::getKey, s -> s, CalculateAverage_armandino::mergeStats, TreeMap::new));
+        Chunk[] chunks = split(channel);
+        ChunkProcessor[] processors = new ChunkProcessor[chunks.length];
+
+        for (int i = 0; i < processors.length; i++) {
+            processors[i] = new ChunkProcessor(chunks[i].start, chunks[i].end);
+            processors[i].start();
+        }
+
+        Map<String, Stats> results = new TreeMap<>();
+
+        for (int i = 0; i < processors.length; i++) {
+            processors[i].join();
+            final long end = processors[i].map.mapEnd;
+
+            for (long addr = processors[i].map.mapStart; addr < end; addr += ENTRY_SIZE) {
+                final short keyLength = UNSAFE.getShort(addr + KEY_LENGTH_OFFSET);
+
+                if (keyLength == 0)
+                    continue;
+
+                final byte[] keyBytes = new byte[keyLength];
+                UNSAFE.copyMemory(null, addr, keyBytes, Unsafe.ARRAY_BYTE_BASE_OFFSET, keyLength);
+                final short min = UNSAFE.getShort(addr + MIN_OFFSET);
+                final short max = UNSAFE.getShort(addr + MAX_OFFSET);
+                final int count = UNSAFE.getInt(addr + COUNT_OFFSET);
+                final long sum = UNSAFE.getLong(addr + SUM_OFFSET);
+                final Stats s = new Stats(new String(keyBytes, 0, keyLength, UTF_8), min, max, count, sum);
+                results.merge(s.key, s, CalculateAverage_armandino::mergeStats);
+            }
+        }
 
         print(results.values());
     }
@@ -67,87 +104,69 @@ private static Stats mergeStats(final Stats x, final Stats y) {
         return x;
     }
 
-    private static class ChunkProcessor {
-        private final SimpleMap map = new SimpleMap(INITIAL_MAP_CAPACITY);
+    private static class ChunkProcessor extends Thread {
+        private final UnsafeMap map = new UnsafeMap(INITIAL_MAP_CAPACITY);
+
+        final long chunkStart;
+        final long chunkEnd;
 
-        private SimpleMap process(final long chunkStart, final long chunkEnd) {
+        private ChunkProcessor(long chunkStart, long chunkEnd) {
+            this.chunkStart = chunkStart;
+            this.chunkEnd = chunkEnd;
+        }
+
+        @Override
+        public void run() {
             long i = chunkStart;
             while (i < chunkEnd) {
                 final long keyAddress = i;
                 int keyHash = 0;
-                int measurement = 0;
                 byte b;
 
                 while ((b = UNSAFE.getByte(i++)) != SEMICOLON) {
                     keyHash = PRIME * keyHash + b;
                 }
 
-                final int keyLength = (int) (i - keyAddress - 1);
+                final short keyLength = (short) (i - keyAddress - 1);
+                final long numberWord = UNSAFE.getLong(i);
+                final int decimalSepPos = Long.numberOfTrailingZeros(~numberWord & 0x10101000);
+                final short measurement = parseNumber(decimalSepPos, numberWord);
+                final int addOffset = (decimalSepPos >>> 3) + 3;
+                i += addOffset;
 
-                if ((b = UNSAFE.getByte(i++)) == MINUS) {
-                    while ((b = UNSAFE.getByte(i++)) != DOT) {
-                        measurement = measurement * 10 + b - ZERO_DIGIT;
-                    }
-
-                    b = UNSAFE.getByte(i);
-                    measurement = measurement * 10 + b - ZERO_DIGIT;
-                    measurement = -measurement;
-                    i += 2;
-                }
-                else {
-                    measurement = b - ZERO_DIGIT; // D1
-                    b = UNSAFE.getByte(i); // dot or D2
-
-                    if (b == DOT) {
-                        measurement = measurement * 10 + UNSAFE.getByte(i + 1) - ZERO_DIGIT; // F
-                        i += 3;
-                    }
-                    else {
-                        measurement = measurement * 10 + b - ZERO_DIGIT; // D2
-                        measurement = measurement * 10 + UNSAFE.getByte(i + 2) - ZERO_DIGIT; // F
-                        i += 4; // skip NL
-                    }
-                }
-
-                final Stats stats = map.putStats(keyHash, keyAddress, keyLength);
-                stats.min = Math.min(stats.min, measurement);
-                stats.max = Math.max(stats.max, measurement);
-                stats.sum += measurement;
-                stats.count++;
+                map.addEntry(keyHash, keyAddress, keyLength, measurement);
             }
+        }
 
-            return map;
+        // credit: merykitty
+        private static short parseNumber(int decimalSepPos, long numberWord) {
+            int shift = 28 - decimalSepPos;
+            // signed is -1 if negative, 0 otherwise
+            long signed = (~numberWord << 59) >> 63;
+            long designMask = ~(signed & 0xFF);
+            // Align the number to a specific position and transform the ascii to digit value
+            long digits = ((numberWord & designMask) << shift) & 0x0F000F0F00L;
+            // Now digits is in the form 0xUU00TTHH00 (UU: units digit, TT: tens digit, HH: hundreds digit)
+            // 0xUU00TTHH00 * (100 * 0x1000000 + 10 * 0x10000 + 1) =
+            // 0x000000UU00TTHH00 + 0x00UU00TTHH000000 * 10 + 0xUU00TTHH00000000 * 100
+            long absValue = ((digits * 0x640a0001) >>> 32) & 0x3FF;
+            return (short) ((absValue ^ signed) - signed);
         }
     }
 
-    private static class Stats implements Comparable<Stats> {
-        private String key;
-        private final long keyAddress;
-        private final int keyLength;
-        private final int keyHash;
-        private int min = Integer.MAX_VALUE;
-        private int max = Integer.MIN_VALUE;
+    private static class Stats {
+        private final String key;
+        private int min;
+        private int max;
         private int count;
         private long sum;
 
-        private Stats(long keyAddress, int keyLength, int keyHash) {
-            this.keyAddress = keyAddress;
-            this.keyLength = keyLength;
-            this.keyHash = keyHash;
-        }
-
-        String getKey() {
-            if (key == null) {
-                var keyBytes = new byte[keyLength];
-                UNSAFE.copyMemory(null, keyAddress, keyBytes, Unsafe.ARRAY_BYTE_BASE_OFFSET, keyLength);
-                key = new String(keyBytes, 0, keyLength, UTF_8);
-            }
-            return key;
-        }
-
-        @Override
-        public int compareTo(final Stats o) {
-            return getKey().compareTo(o.getKey());
+        Stats(final String key, final int min, final int max, final int count, final long sum) {
+            this.min = min;
+            this.max = max;
+            this.count = count;
+            this.sum = sum;
+            this.key = key;
         }
 
         void print(final PrintStream out) {
@@ -219,90 +238,114 @@ private static Unsafe getUnsafe() {
         }
     }
 
-    private static class SimpleMap {
-        private Stats[] table;
+    private static class UnsafeMap {
 
-        SimpleMap(int initialCapacity) {
-            table = new Stats[initialCapacity];
-        }
+        long mapStart;
+        long mapEnd;
+        int capacity; // num entries
 
-        Stream<Stats> stream() {
-            return Arrays.stream(table).filter(Objects::nonNull);
+        UnsafeMap(int numEntries) {
+            capacity = numEntries;
+            final long size = ENTRY_SIZE * numEntries;
+            mapStart = UNSAFE.allocateMemory(size);
+            mapEnd = mapStart + size;
+            UNSAFE.setMemory(mapStart, size, (byte) 0);
         }
 
-        Stats putStats(final int keyHash, final long keyAddress, final int keyLength) {
-            final int pos = (table.length - 1) & keyHash;
-
-            Stats stats = table[pos];
-            if (stats == null)
-                return createAt(table, keyAddress, keyLength, keyHash, pos);
-            if (stats.keyHash == keyHash && keysEqual(stats, keyAddress, keyLength))
-                return stats;
-
-            int i = pos;
-            while (++i < table.length) {
-                stats = table[i];
-                if (stats == null)
-                    return createAt(table, keyAddress, keyLength, keyHash, i);
-                if (keyHash == stats.keyHash && keysEqual(stats, keyAddress, keyLength))
-                    return stats;
+        // Adds one measurement for the key stored at keyAddress: updates the matching entry or
+        // claims an empty slot for it, and grows the table through resize() when no slot is left.
+        void addEntry(final int keyHash, final long keyAddress, final short keyLength, final short measurement) {
+            final int pos = (capacity - 1) & keyHash;
+
+            long addr = mapStart + pos * ENTRY_SIZE;
+            int hash = UNSAFE.getInt(addr + HASH_OFFSET);
+
+            if (hash == 0) { // new entry
+                initEntry(addr, keyAddress, keyLength, measurement, keyHash);
+                return;
+            }
+            if (hash == keyHash && keysEqual(addr, keyAddress, keyLength)) {
+                updateEntry(addr, measurement);
+                return;
+            }
+
+            // this can be improved to avoid clustering at the start.
+            // should only affect the 10k test
+            // The cursor moves on only after a slot was examined: slot 0 takes part in the scan
+            // and nothing is read or written at mapEnd, which lies one entry past the allocation.
+            for (addr = mapStart; addr < mapEnd; addr += ENTRY_SIZE) {
+                hash = UNSAFE.getInt(addr + HASH_OFFSET);
+                if (hash == 0) {
+                    initEntry(addr, keyAddress, keyLength, measurement, keyHash);
+                    return;
+                }
+                if (hash == keyHash && keysEqual(addr, keyAddress, keyLength)) {
+                    updateEntry(addr, measurement);
+                    return;
+                }
             }
 
-            i = pos;
-            while (i-- > 0) {
-                stats = table[i];
-                if (stats == null)
-                    return createAt(table, keyAddress, keyLength, keyHash, i);
-                if (keyHash == stats.keyHash && keysEqual(stats, keyAddress, keyLength))
-                    return stats;
+            resize(keyHash, keyAddress, keyLength, measurement);
+        }
+
+        // Doubles the table: live entries are moved whole (key bytes included) into a new table,
+        // the pending measurement is added to it, and the old allocation is released.
+        private void resize(final int keyHash, final long keyAddress, final short keyLength, final short measurement) {
+            final UnsafeMap newMap = new UnsafeMap(capacity * 2);
+
+            for (long addr = mapStart; addr < mapEnd; addr += ENTRY_SIZE) {
+                final int oKeyHsh = UNSAFE.getInt(addr + HASH_OFFSET);
+                if (oKeyHsh == 0)
+                    continue; // empty slot: nothing to move, and it must not clobber slot 0
+
+                final int newPos = (newMap.capacity - 1) & oKeyHsh;
+                long newAddr = newMap.mapStart + newPos * ENTRY_SIZE;
+                if (UNSAFE.getInt(newAddr + HASH_OFFSET) != 0) {
+                    // home slot taken: fall back to the first free slot (the new table is at most half full)
+                    newAddr = newMap.mapStart;
+                    while (UNSAFE.getInt(newAddr + HASH_OFFSET) != 0)
+                        newAddr += ENTRY_SIZE;
+                }
+                UNSAFE.copyMemory(addr, newAddr, ENTRY_SIZE);
             }
-            resize();
-            return putStats(keyHash, keyAddress, keyLength);
+
+            newMap.addEntry(keyHash, keyAddress, keyLength, measurement);
+
+            UNSAFE.freeMemory(mapStart);
+            this.mapStart = newMap.mapStart;
+            this.mapEnd = newMap.mapEnd;
+            this.capacity = newMap.capacity;
         }
 
-        private static Stats createAt(Stats[] table, long keyAddress, int keyLength, int key, int i) {
-            Stats stats = new Stats(keyAddress, keyLength, key);
-            table[i] = stats;
-            return stats;
+        private static void initEntry(final long entry, final long keyAddress, final short keyLength, final short measurement, final int keyHash) {
+            UNSAFE.copyMemory(keyAddress, entry, keyLength);
+            UNSAFE.putInt(entry + HASH_OFFSET, keyHash);
+            UNSAFE.putShort(entry + KEY_LENGTH_OFFSET, keyLength);
+            UNSAFE.putShort(entry + MIN_OFFSET, Short.MAX_VALUE);
+            UNSAFE.putShort(entry + MAX_OFFSET, Short.MIN_VALUE);
+
+            updateEntry(entry, measurement);
         }
 
-        private static boolean keysEqual(Stats stats, long keyAddress, final int keyLength) {
-            // credit: abeobk
-            long xsum = 0;
-            int n = keyLength & 0xF8;
-            for (int i = 0; i < n; i += 8) {
-                xsum |= (UNSAFE.getLong(stats.keyAddress + i) ^ UNSAFE.getLong(keyAddress + i));
-            }
-            return xsum == 0;
+        private static void updateEntry(final long entry, final short measurement) {
+            UNSAFE.putShort(entry + MIN_OFFSET,
+                    (short) Math.min(UNSAFE.getShort(entry + MIN_OFFSET), measurement));
+            UNSAFE.putShort(entry + MAX_OFFSET,
+                    (short) Math.max(UNSAFE.getShort(entry + MAX_OFFSET), measurement));
+            UNSAFE.putInt(entry + COUNT_OFFSET,
+                    UNSAFE.getInt(entry + COUNT_OFFSET) + 1);
+            UNSAFE.putLong(entry + SUM_OFFSET,
+                    UNSAFE.getLong(entry + SUM_OFFSET) + measurement);
         }
+    }
 
-        private void resize() {
-            var copy = new SimpleMap(table.length * 2);
-            for (Stats s : table) {
-                if (s != null) {
-                    final int pos = (copy.table.length - 1) & s.keyHash;
-                    int i = pos;
-                    if (copy.table[i] == null) {
-                        copy.table[i] = s;
-                        continue;
-                    }
-                    while (i < copy.table.length && copy.table[i] != null) {
-                        i++;
-                    }
-                    if (i == copy.table.length) {
-                        i = pos;
-                        while (i >= 0 && copy.table[i] != null) {
-                            i--;
-                        }
-                    }
-                    if (i < 0) {
-                        // if we reach here it's a bug!
-                        throw new IllegalStateException("table is full");
-                    }
-                    copy.table[i] = s;
-                }
-            }
-            table = copy.table;
+    private static boolean keysEqual(long key1Address, long key2Address, final int keyLength) {
+        // credit: abeobk
+        long xsum = 0;
+        int n = keyLength & 0xF8;
+        for (int i = 0; i < n; i += 8) {
+            xsum |= (UNSAFE.getLong(key1Address + i) ^ UNSAFE.getLong(key2Address + i));
         }
+        return xsum == 0;
     }
 }

From af2b5517c894347d42e8382b4b7559bdd9a7d337 Mon Sep 17 00:00:00 2001
From: Anita SV <anitasvasu@gmail.com>
Date: Wed, 31 Jan 2024 00:41:33 -0800
Subject: [PATCH 209/268] anitasv 3.8s vs 3m 19s : Improved using custom
 hashmap.  (#672)

* Some optimizations while staying safe

* bug fix not caught on tests
---
 .../onebrc/CalculateAverage_anitasv.java      | 163 +++++++++++++-----
 1 file changed, 118 insertions(+), 45 deletions(-)

diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_anitasv.java b/src/main/java/dev/morling/onebrc/CalculateAverage_anitasv.java
index c15250d99..7d3d6af7b 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_anitasv.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_anitasv.java
@@ -25,7 +25,6 @@
 import java.nio.file.Path;
 import java.nio.file.StandardOpenOption;
 import java.util.*;
-import java.util.stream.Collectors;
 import java.util.stream.IntStream;
 
 public class CalculateAverage_anitasv {
@@ -44,14 +43,14 @@ long indexOf(long position, byte ch) {
                     .asByteBuffer();
             while (buf.hasRemaining()) {
                 if (buf.get() == ch) {
-                    return position + buf.position() - 1;
+                    return position + (buf.position() - 1);
                 }
             }
             return -1;
         }
 
-        byte[] getRange(long start, long end) {
-            return mmapMemory.asSlice(start, end - start).toArray(ValueLayout.JAVA_BYTE);
+        MemorySegment getRange(long start, long end) {
+            return mmapMemory.asSlice(start, end - start);
         }
 
         int parseDouble(long start, long end) {
@@ -86,22 +85,122 @@ public int computeHash(long position, long stationEnd) {
             return buf2.hashCode();
         }
 
-        public boolean matches(byte[] existingStation, long start, long end) {
-            ByteBuffer buf1 = ByteBuffer.wrap(existingStation);
-            ByteBuffer buf2 = mmapMemory.asSlice(start, end - start).asByteBuffer();
-            return buf1.equals(buf2);
+        public long truncate(long index) {
+            return Math.min(index, mmapMemory.byteSize());
+        }
+
+        public long getLong(long position) {
+            return mmapMemory.get(ValueLayout.JAVA_LONG_UNALIGNED, position);
         }
     }
 
-    private record ResultRow(byte[] station, IntSummaryStatistics statistics) {
+    private record ResultRow(IntSummaryStatistics statistics, int keyLength, int next) {
+    }
+
+    private static class FastHashMap {
+        private final byte[] keys;
+        private final ResultRow[] values;
+
+        private final int capacityMinusOne;
 
-        public String toString() {
-            return STR."\{new String(station, StandardCharsets.UTF_8)} : \{statToString(statistics)}";
+        private final MemorySegment keySegment;
+
+        private int next = -1;
+
+        private FastHashMap(int capacity) {
+            this.capacityMinusOne = capacity - 1;
+            this.keys = new byte[capacity << 7];
+            this.keySegment = MemorySegment.ofArray(keys);
+            this.values = new ResultRow[capacity];
         }
+
+        IntSummaryStatistics find(int hash, Shard shard, long stationStart, long stationEnd) {
+            int initialIndex = hash & capacityMinusOne;
+            int lookupLength = (int) (stationEnd - stationStart);
+            int lookupAligned = ((lookupLength + 7) & (-8));
+            int i = initialIndex;
+
+            lookupAligned = (int) (shard.truncate(stationStart + lookupAligned) - stationStart) - 7;
+
+            do {
+                int keyIndex = i << 7;
+
+                if (keys[keyIndex] != 0 && keys[keyIndex + lookupLength] == 0) {
+
+                    int mismatch = -1, j;
+                    for (j = 0; j < lookupAligned; j += 8) {
+                        long entryLong = keySegment.get(ValueLayout.JAVA_LONG_UNALIGNED, keyIndex + j);
+                        long lookupLong = shard.getLong(stationStart + j);
+                        if (entryLong != lookupLong) {
+                            int diff = Long.numberOfTrailingZeros(entryLong ^ lookupLong);
+                            mismatch = j + (diff >> 3);
+                            break;
+                        }
+                    }
+                    if (mismatch == -1) {
+                        for (; j < lookupLength; j++) {
+                            byte entryByte = keys[keyIndex + j];
+                            byte lookupByte = shard.getByte(stationStart + j);
+                            if (entryByte != lookupByte) {
+                                mismatch = j;
+                                break;
+                            }
+                        }
+                    }
+                    if (mismatch == -1 || mismatch >= lookupLength) {
+                        return this.values[i].statistics;
+                    }
+                }
+                if (keys[keyIndex] == 0) {
+                    MemorySegment fullLookup = shard.getRange(stationStart, stationEnd);
+
+                    keySegment.asSlice(keyIndex, lookupLength)
+                            .copyFrom(fullLookup);
+
+                    keys[keyIndex + lookupLength] = 0;
+                    IntSummaryStatistics stats = new IntSummaryStatistics();
+                    ResultRow resultRow = new ResultRow(stats, lookupLength, this.next);
+                    this.next = i;
+                    this.values[i] = resultRow;
+                    return stats;
+                }
+
+                if (i == capacityMinusOne) {
+                    i = 0;
+                }
+                else {
+                    i++;
+                }
+            } while (i != initialIndex);
+            throw new IllegalStateException("Hash size too small");
+        }
+
+        Iterable<Map.Entry<String, IntSummaryStatistics>> values() {
+            return () -> new Iterator<>() {
+
+                int scan = FastHashMap.this.next;
+
+                @Override
+                public boolean hasNext() {
+                    return scan != -1;
+                }
+
+                @Override
+                public Map.Entry<String, IntSummaryStatistics> next() {
+                    ResultRow resultRow = values[scan];
+                    IntSummaryStatistics stats = resultRow.statistics;
+                    String key = new String(keys, scan << 7, resultRow.keyLength,
+                            StandardCharsets.UTF_8);
+                    scan = resultRow.next;
+                    return new AbstractMap.SimpleEntry<>(key, stats);
+                }
+            };
+        }
+
     }
 
-    private static Map<String, IntSummaryStatistics> process(Shard shard) {
-        HashMap<Integer, List<ResultRow>> result = new HashMap<>(1 << 14);
+    private static Iterable<Map.Entry<String, IntSummaryStatistics>> process(Shard shard) {
+        FastHashMap result = new FastHashMap(1 << 14);
 
         boolean skip = shard.chunkStart != 0;
         for (long position = shard.chunkStart; position < shard.chunkEnd; position++) {
@@ -116,45 +215,19 @@ private static Map<String, IntSummaryStatistics> process(Shard shard) {
                 long temperatureEnd = shard.indexOf(stationEnd + 1, (byte) '\n');
                 int temperature = shard.parseDouble(stationEnd + 1, temperatureEnd);
 
-                List<ResultRow> collisions = result.get(hash);
-                if (collisions == null) {
-                    collisions = new ArrayList<>();
-                    result.put(hash, collisions);
-                }
-
-                boolean found = false;
-                for (ResultRow existing : collisions) {
-                    byte[] existingStation = existing.station();
-                    if (shard.matches(existingStation, position, stationEnd)) {
-                        existing.statistics.accept(temperature);
-                        found = true;
-                        break;
-                    }
-                }
-                if (!found) {
-                    IntSummaryStatistics stats = new IntSummaryStatistics();
-                    stats.accept(temperature);
-                    ResultRow rr = new ResultRow(shard.getRange(position, stationEnd), stats);
-                    collisions.add(rr);
-                }
+                IntSummaryStatistics stats = result.find(hash, shard, position, stationEnd);
+                stats.accept(temperature);
                 position = temperatureEnd;
             }
         }
 
-        return result.values()
-                .stream()
-                .flatMap(Collection::stream)
-                .map(rr -> new AbstractMap.SimpleImmutableEntry<>(
-                        new String(rr.station, StandardCharsets.UTF_8),
-                        rr.statistics))
-                .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
+        return result.values();
     }
 
-    private static Map<String, IntSummaryStatistics> combineResults(List<Map<String, IntSummaryStatistics>> list) {
-
+    private static Map<String, IntSummaryStatistics> combineResults(List<Iterable<Map.Entry<String, IntSummaryStatistics>>> list) {
         Map<String, IntSummaryStatistics> output = HashMap.newHashMap(1024);
-        for (Map<String, IntSummaryStatistics> map : list) {
-            for (Map.Entry<String, IntSummaryStatistics> entry : map.entrySet()) {
+        for (Iterable<Map.Entry<String, IntSummaryStatistics>> map : list) {
+            for (Map.Entry<String, IntSummaryStatistics> entry : map) {
                 output.compute(entry.getKey(), (ignore, val) -> {
                     if (val == null) {
                         return entry.getValue();

From 1a4ac0d2496e9329534370eaf01b28e77658b073 Mon Sep 17 00:00:00 2001
From: Quan Anh Mai <49088128+merykitty@users.noreply.github.com>
Date: Wed, 31 Jan 2024 16:47:48 +0800
Subject: [PATCH 210/268] Final submission (#669)

* more efficient max, min

* optimize pipeline

* apply parallel to both submissions

* fix bug
---
 .../onebrc/CalculateAverage_merykitty.java    | 83 +++++++++++++-----
 .../CalculateAverage_merykittyunsafe.java     | 85 ++++++++++++++-----
 2 files changed, 125 insertions(+), 43 deletions(-)

diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_merykitty.java b/src/main/java/dev/morling/onebrc/CalculateAverage_merykitty.java
index 1f5acf376..502002f09 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_merykitty.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_merykitty.java
@@ -75,8 +75,12 @@ private static class PoorManMap {
         }
 
         void observe(Aggregator node, long value) {
-            node.min = Math.min(node.min, value);
-            node.max = Math.max(node.max, value);
+            if (node.min > value) {
+                node.min = value;
+            }
+            if (node.max < value) {
+                node.max = value;
+            }
             node.sum += value;
             node.count++;
         }
@@ -109,7 +113,7 @@ Aggregator insertInto(int bucket, MemorySegment data, long offset, int size) {
             var node = new Aggregator();
             node.keySize = size;
             this.nodes[bucket] = node;
-            MemorySegment.copy(data, offset, MemorySegment.ofArray(this.keyData), (long) bucket * KEY_SIZE, size);
+            MemorySegment.copy(data, offset, MemorySegment.ofArray(this.keyData), (long) bucket * KEY_SIZE, size + 1);
             return node;
         }
 
@@ -222,11 +226,12 @@ private static long iterate(PoorManMap aggrMap, MemorySegment data, long offset)
         var line = ByteVector.fromMemorySegment(BYTE_SPECIES, data, offset, ByteOrder.nativeOrder());
 
         // Find the delimiter ';'
-        int keySize = line.compare(VectorOperators.EQ, ';').firstTrue();
+        long semicolons = line.compare(VectorOperators.EQ, ';').toLong();
 
         // If we cannot find the delimiter in the vector, that means the key is
         // longer than the vector, fall back to scalar processing
-        if (keySize == BYTE_SPECIES.vectorByteSize()) {
+        if (semicolons == 0) {
+            int keySize = BYTE_SPECIES.length();
             while (data.get(ValueLayout.JAVA_BYTE, offset + keySize) != ';') {
                 keySize++;
             }
@@ -235,6 +240,7 @@ private static long iterate(PoorManMap aggrMap, MemorySegment data, long offset)
         }
 
         // We inline the searching of the value in the hash map
+        int keySize = Long.numberOfTrailingZeros(semicolons);
         int x;
         int y;
         if (keySize >= Integer.BYTES) {
@@ -260,7 +266,7 @@ private static long iterate(PoorManMap aggrMap, MemorySegment data, long offset)
 
             var nodeKey = ByteVector.fromArray(BYTE_SPECIES, aggrMap.keyData, bucket * PoorManMap.KEY_SIZE);
             long eqMask = line.compare(VectorOperators.EQ, nodeKey).toLong();
-            long validMask = -1L >>> -keySize;
+            long validMask = semicolons ^ (semicolons - 1);
             if ((eqMask & validMask) == validMask) {
                 break;
             }
@@ -269,28 +275,63 @@ private static long iterate(PoorManMap aggrMap, MemorySegment data, long offset)
         return parseDataPoint(aggrMap, node, data, offset + keySize + 1);
     }
 
-    // Process all lines that start in [offset, limit)
-    private static PoorManMap processFile(MemorySegment data, long offset, long limit) {
-        var aggrMap = new PoorManMap();
-        // Find the start of a new line
-        if (offset != 0) {
-            offset--;
-            while (offset < limit) {
-                if (data.get(ValueLayout.JAVA_BYTE, offset++) == '\n') {
-                    break;
-                }
+    private static long findOffset(MemorySegment data, long offset, long limit) {
+        if (offset == 0) {
+            return offset;
+        }
+
+        offset--;
+        while (offset < limit) {
+            if (data.get(ValueLayout.JAVA_BYTE, offset++) == '\n') {
+                break;
             }
         }
+        return offset;
+    }
 
-        // If there is no line starting in this segment, just return
+    // Process all lines that start in [offset, limit)
+    private static PoorManMap processFile(MemorySegment data, long offset, long limit) {
+        var aggrMap = new PoorManMap();
         if (offset == limit) {
             return aggrMap;
         }
+        int batches = 2;
+        long batchSize = Math.ceilDiv(limit - offset, batches);
+        long offset0 = offset;
+        long offset1 = offset + batchSize;
+        long limit0 = Math.min(offset1, limit);
+        long limit1 = limit;
 
-        // The main loop, optimized for speed
-        while (offset < limit - Math.max(BYTE_SPECIES.vectorByteSize(),
-                Long.BYTES + 1 + KEY_MAX_SIZE)) {
-            offset = iterate(aggrMap, data, offset);
+        // Find the start of a new line
+        offset0 = findOffset(data, offset0, limit0);
+        offset1 = findOffset(data, offset1, limit1);
+
+        long mainLoopMinWidth = Math.max(BYTE_SPECIES.vectorByteSize(), KEY_MAX_SIZE + 1 + Long.BYTES);
+        if (limit1 - offset1 < mainLoopMinWidth) {
+            offset = findOffset(data, offset, limit);
+            while (offset < limit - mainLoopMinWidth) {
+                offset = iterate(aggrMap, data, offset);
+            }
+        }
+        else {
+            while (true) {
+                boolean finish = false;
+                if (offset0 < limit0) {
+                    offset0 = iterate(aggrMap, data, offset0);
+                }
+                else {
+                    finish = true;
+                }
+                if (offset1 < limit1 - mainLoopMinWidth) {
+                    offset1 = iterate(aggrMap, data, offset1);
+                }
+                else {
+                    if (finish) {
+                        break;
+                    }
+                }
+            }
+            offset = offset1;
         }
 
         // Now we are at the tail, just be simple
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_merykittyunsafe.java b/src/main/java/dev/morling/onebrc/CalculateAverage_merykittyunsafe.java
index 498369410..4b1fc0ddf 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_merykittyunsafe.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_merykittyunsafe.java
@@ -91,10 +91,12 @@ private static class PoorManMap {
 
         void observe(long entryOffset, long value) {
             long baseOffset = Unsafe.ARRAY_BYTE_BASE_OFFSET + entryOffset;
-            UNSAFE.putShort(this.data, baseOffset + MIN_OFFSET,
-                    (short) Math.min(value, UNSAFE.getShort(this.data, baseOffset + MIN_OFFSET)));
-            UNSAFE.putShort(this.data, baseOffset + MAX_OFFSET,
-                    (short) Math.max(value, UNSAFE.getShort(this.data, baseOffset + MAX_OFFSET)));
+            if (UNSAFE.getShort(this.data, baseOffset + MIN_OFFSET) > value) {
+                UNSAFE.putShort(this.data, baseOffset + MIN_OFFSET, (short) value);
+            }
+            if (UNSAFE.getShort(this.data, baseOffset + MAX_OFFSET) < value) {
+                UNSAFE.putShort(this.data, baseOffset + MAX_OFFSET, (short) value);
+            }
             UNSAFE.putLong(this.data, baseOffset + SUM_OFFSET,
                     value + UNSAFE.getLong(this.data, baseOffset + SUM_OFFSET));
             UNSAFE.putLong(this.data, baseOffset + COUNT_OFFSET,
@@ -307,31 +309,70 @@ private static long iterate(PoorManMap aggrMap, long address) {
         return parseDataPoint(aggrMap, entryOffset, address + keySize + 1);
     }
 
+    private static long findOffset(long base, long offset, long limit) {
+        if (offset == 0) {
+            return offset;
+        }
+
+        offset--;
+        while (offset < limit) {
+            if (UNSAFE.getByte(base + (offset++)) == '\n') {
+                break;
+            }
+        }
+        return offset;
+    }
+
     // Process all lines that start in [offset, limit)
     private static PoorManMap processFile(MemorySegment data, long offset, long limit) {
         var aggrMap = new PoorManMap();
+        if (offset == limit) {
+            return aggrMap;
+        }
         long base = data.address();
-        long begin = base + offset;
-        long end = base + limit;
+        int batches = 2;
+        long batchSize = Math.ceilDiv(limit - offset, batches);
+        long offset0 = offset;
+        long offset1 = offset + batchSize;
+        long limit0 = Math.min(offset1, limit);
+        long limit1 = limit;
+
         // Find the start of a new line
-        if (offset != 0) {
-            begin--;
-            while (begin < end) {
-                if (UNSAFE.getByte(begin++) == '\n') {
-                    break;
-                }
-            }
-        }
+        offset0 = findOffset(base, offset0, limit0);
+        offset1 = findOffset(base, offset1, limit1);
 
-        // If there is no line starting in this segment, just return
-        if (begin == end) {
-            return aggrMap;
+        long begin;
+        long end = base + limit;
+        long mainLoopMinWidth = Math.max(BYTE_SPECIES.vectorByteSize(), KEY_MAX_SIZE + 1 + Long.BYTES);
+        if (limit1 - offset1 < mainLoopMinWidth) {
+            begin = base + findOffset(base, offset, limit);
+            while (begin < end - mainLoopMinWidth) {
+                begin = iterate(aggrMap, begin);
+            }
         }
-
-        // The main loop, optimized for speed
-        while (begin < end - Math.max(BYTE_SPECIES.vectorByteSize(),
-                Long.BYTES + 1 + KEY_MAX_SIZE)) {
-            begin = iterate(aggrMap, begin);
+        else {
+            long begin0 = base + offset0;
+            long begin1 = base + offset1;
+            long end0 = base + limit0;
+            long end1 = base + limit1;
+            while (true) {
+                boolean finish = false;
+                if (begin0 < end0) {
+                    begin0 = iterate(aggrMap, begin0);
+                }
+                else {
+                    finish = true;
+                }
+                if (begin1 < end1 - mainLoopMinWidth) {
+                    begin1 = iterate(aggrMap, begin1);
+                }
+                else {
+                    if (finish) {
+                        break;
+                    }
+                }
+            }
+            begin = begin1;
         }
 
         // Now we are at the tail, just be simple

From f6aa09926c1f4cf30c47737f3fa37c56df89ea11 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Serkan=20=C3=96ZAL?= <sozal@catchpoint.com>
Date: Wed, 31 Jan 2024 11:56:11 +0300
Subject: [PATCH 211/268] serkan-ozal's 6th submission: (#667)

- process multiple lines at a time to better exploit ILP (Instruction Level Parallelism)
---
 .../onebrc/CalculateAverage_serkan_ozal.java  | 348 ++++++++++++------
 1 file changed, 230 insertions(+), 118 deletions(-)

diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_serkan_ozal.java b/src/main/java/dev/morling/onebrc/CalculateAverage_serkan_ozal.java
index 0ec485619..53258161e 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_serkan_ozal.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_serkan_ozal.java
@@ -59,7 +59,7 @@ public class CalculateAverage_serkan_ozal {
             ? ByteVector.SPECIES_128
             : ByteVector.SPECIES_64;
     private static final int BYTE_SPECIES_SIZE = BYTE_SPECIES.vectorByteSize();
-    private static final MemorySegment ALL = MemorySegment.NULL.reinterpret(Long.MAX_VALUE);
+    private static final MemorySegment NULL = MemorySegment.NULL.reinterpret(Long.MAX_VALUE);
     private static final ByteOrder NATIVE_BYTE_ORDER = ByteOrder.nativeOrder();
 
     private static final char NEW_LINE_SEPARATOR = '\n';
@@ -290,7 +290,7 @@ private void processRegion() throws Exception {
                     long regionStart = regionGiven ? (r.address() + task.start) : r.address();
                     long regionEnd = regionStart + task.size;
 
-                    doProcessRegion(r, r.address(), regionStart, regionEnd);
+                    doProcessRegion(regionStart, regionEnd);
                 }
 
                 if (VERBOSE) {
@@ -334,105 +334,204 @@ private void processRegion() throws Exception {
             }
         }
 
-        private void doProcessRegion(MemorySegment region, long regionAddress, long regionStart, long regionEnd) {
+        private long findClosestLineEnd(long endPos, long minPos) {
+            int i = 0;
+            int maxI = Math.min(MAX_LINE_LENGTH, (int) (endPos - minPos));
+            while (i <= maxI && U.getByte(endPos - i) != NEW_LINE_SEPARATOR) {
+                i++;
+            }
+            return endPos - i + 1;
+        }
+
+        // Credits: merykitty
+        private long extractValue(long regionPtr, long word, OpenMap map, int entryOffset) {
+            // Parse and extract value
+            int decimalSepPos = Long.numberOfTrailingZeros(~word & 0x10101000);
+            int shift = 28 - decimalSepPos;
+            long signed = (~word << 59) >> 63;
+            long designMask = ~(signed & 0xFF);
+            long digits = ((word & designMask) << shift) & 0x0F000F0F00L;
+            long absValue = ((digits * 0x640a0001) >>> 32) & 0x3FF;
+            int value = (int) ((absValue ^ signed) - signed);
+
+            // Put extracted value into map
+            map.putValue(entryOffset, value);
+
+            // Return new position
+            return regionPtr + (decimalSepPos >>> 3) + 3;
+        }
+
+        private void doProcessRegion(long regionStart, long regionEnd) {
             final int vectorSize = BYTE_SPECIES.vectorByteSize();
-            final long regionMainLimit = regionEnd - BYTE_SPECIES_SIZE;
 
-            long regionPtr;
+            final long size = regionEnd - regionStart;
+            final long segmentSize = size / 2;
+
+            final long regionStart1 = regionStart;
+            final long regionEnd1 = Math.max(regionStart1, findClosestLineEnd(regionStart1 + segmentSize, regionStart));
+
+            final long regionStart2 = regionEnd1;
+            final long regionEnd2 = regionEnd;
+
+            long regionPtr1, regionPtr2;
 
             // Read and process region - main
-            for (regionPtr = regionStart; regionPtr < regionMainLimit;) {
-                regionPtr = doProcessLine(regionPtr, vectorSize);
-            }
+            // Inspired by: @jerrinot
+            // - two lines at a time (according to my experiment, this is optimum value in terms of register spilling)
+            // - most of the implementation is inlined
+            // - so get the benefit of ILP (Instruction Level Parallelism) better
+            for (regionPtr1 = regionStart1, regionPtr2 = regionStart2; regionPtr1 < regionEnd1 && regionPtr2 < regionEnd2;) {
+                // Search key/value separators and find keys' start and end positions
+                ////////////////////////////////////////////////////////////////////////////////////////////////////////
+                long keyStartPtr1 = regionPtr1;
+                long keyStartPtr2 = regionPtr2;
+
+                ByteVector keyVector1 = ByteVector.fromMemorySegment(BYTE_SPECIES, NULL, regionPtr1, NATIVE_BYTE_ORDER);
+                ByteVector keyVector2 = ByteVector.fromMemorySegment(BYTE_SPECIES, NULL, regionPtr2, NATIVE_BYTE_ORDER);
+
+                int keyLength1 = keyVector1.compare(VectorOperators.EQ, KEY_VALUE_SEPARATOR).firstTrue();
+                int keyLength2 = keyVector2.compare(VectorOperators.EQ, KEY_VALUE_SEPARATOR).firstTrue();
+
+                if (keyLength1 != vectorSize && keyLength2 != vectorSize) {
+                    regionPtr1 += (keyLength1 + 1);
+                    regionPtr2 += (keyLength2 + 1);
+                }
+                else {
+                    if (keyLength1 != vectorSize) {
+                        regionPtr1 += (keyLength1 + 1);
+                    }
+                    else {
+                        regionPtr1 += vectorSize;
+                        for (; U.getByte(regionPtr1) != KEY_VALUE_SEPARATOR; regionPtr1++)
+                            ;
+                        keyLength1 = (int) (regionPtr1 - keyStartPtr1);
+                        regionPtr1++;
+                    }
+                    if (keyLength2 != vectorSize) {
+                        regionPtr2 += (keyLength2 + 1);
+                    }
+                    else {
+                        regionPtr2 += vectorSize;
+                        for (; U.getByte(regionPtr2) != KEY_VALUE_SEPARATOR; regionPtr2++)
+                            ;
+                        keyLength2 = (int) (regionPtr2 - keyStartPtr2);
+                        regionPtr2++;
+                    }
+                }
 
-            // Read and process region - tail
-            for (long i = regionPtr, j = regionPtr; i < regionEnd;) {
-                byte b = U.getByte(i);
-                if (b == KEY_VALUE_SEPARATOR) {
-                    long baseOffset = map.putKey(null, j, (int) (i - j));
-                    i = extractValue(i + 1, map, baseOffset);
-                    j = i;
+                // Read first words as they will be used while extracting values later
+                long word1 = U.getLong(regionPtr1);
+                long word2 = U.getLong(regionPtr2);
+                if (NATIVE_BYTE_ORDER == ByteOrder.BIG_ENDIAN) {
+                    word1 = Long.reverseBytes(word1);
+                    word2 = Long.reverseBytes(word2);
+                }
+                ////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+                // Calculate key hashes and find entry indexes
+                ////////////////////////////////////////////////////////////////////////////////////////////////////////
+                int x1, y1, x2, y2;
+                if (keyLength1 >= Integer.BYTES && keyLength2 >= Integer.BYTES) {
+                    x1 = U.getInt(keyStartPtr1);
+                    y1 = U.getInt(keyStartPtr1 + keyLength1 - Integer.BYTES);
+                    x2 = U.getInt(keyStartPtr2);
+                    y2 = U.getInt(keyStartPtr2 + keyLength2 - Integer.BYTES);
                 }
                 else {
-                    i++;
+                    if (keyLength1 >= Integer.BYTES) {
+                        x1 = U.getInt(keyStartPtr1);
+                        y1 = U.getInt(keyStartPtr1 + keyLength1 - Integer.BYTES);
+                    }
+                    else {
+                        x1 = U.getByte(keyStartPtr1);
+                        y1 = U.getByte(keyStartPtr1 + keyLength1 - Byte.BYTES);
+                    }
+                    if (keyLength2 >= Integer.BYTES) {
+                        x2 = U.getInt(keyStartPtr2);
+                        y2 = U.getInt(keyStartPtr2 + keyLength2 - Integer.BYTES);
+                    }
+                    else {
+                        x2 = U.getByte(keyStartPtr2);
+                        y2 = U.getByte(keyStartPtr2 + keyLength2 - Byte.BYTES);
+                    }
                 }
-            }
-        }
 
-        private long doProcessLine(long regionPtr, int vectorSize) {
-            // Find key/value separator
-            ////////////////////////////////////////////////////////////////////////////////////////////////////////
-            long keyStartPtr = regionPtr;
+                int keyHash1 = (Integer.rotateLeft(x1 * OpenMap.HASH_SEED, OpenMap.HASH_ROTATE) ^ y1) * OpenMap.HASH_SEED;
+                int keyHash2 = (Integer.rotateLeft(x2 * OpenMap.HASH_SEED, OpenMap.HASH_ROTATE) ^ y2) * OpenMap.HASH_SEED;
 
-            // Vectorized search for key/value separator
-            ByteVector keyVector = ByteVector.fromMemorySegment(BYTE_SPECIES, ALL, regionPtr, NATIVE_BYTE_ORDER);
+                int entryIdx1 = (keyHash1 & OpenMap.ENTRY_HASH_MASK) << OpenMap.ENTRY_SIZE_SHIFT;
+                int entryIdx2 = (keyHash2 & OpenMap.ENTRY_HASH_MASK) << OpenMap.ENTRY_SIZE_SHIFT;
+                ////////////////////////////////////////////////////////////////////////////////////////////////////////
 
-            int keyLength = keyVector.compare(VectorOperators.EQ, KEY_VALUE_SEPARATOR).firstTrue();
-            // Check whether key/value separator is found in the first vector (city name is <= vector size)
-            if (keyLength != vectorSize) {
-                regionPtr += (keyLength + 1);
-            }
-            else {
-                regionPtr += vectorSize;
-                for (; U.getByte(regionPtr) != KEY_VALUE_SEPARATOR; regionPtr++)
-                    ;
-                keyLength = (int) (regionPtr - keyStartPtr);
-                regionPtr++;
-                // I have tried vectorized search for key/value separator in the remaining part,
-                // but since majority (99%) of the city names <= 16 bytes
-                // and other a few longer city names (have length < 16 and <= 32) not close to 32 bytes,
-                // byte by byte search is better in terms of performance (according to my experiments) and simplicity.
-            }
-            ////////////////////////////////////////////////////////////////////////////////////////////////////////
+                // Put keys and calculate entry offsets to put values
+                ////////////////////////////////////////////////////////////////////////////////////////////////////////
+                int entryOffset1 = map.putKey(keyVector1, keyStartPtr1, keyLength1, entryIdx1);
+                int entryOffset2 = map.putKey(keyVector2, keyStartPtr2, keyLength2, entryIdx2);
+                ////////////////////////////////////////////////////////////////////////////////////////////////////////
 
-            // Put key and get map offset to put value
-            long entryOffset = map.putKey(keyVector, keyStartPtr, keyLength);
+                // Extract values by parsing and put them into map
+                ////////////////////////////////////////////////////////////////////////////////////////////////////////
+                regionPtr1 = extractValue(regionPtr1, word1, map, entryOffset1);
+                regionPtr2 = extractValue(regionPtr2, word2, map, entryOffset2);
+                ////////////////////////////////////////////////////////////////////////////////////////////////////////
+            }
 
-            // Extract value, put it into map and return next position in the region to continue processing from there
-            return extractValue(regionPtr, map, entryOffset);
+            // Read and process region - tail
+            doProcessTail(regionPtr1, regionEnd1, regionPtr2, regionEnd2, vectorSize);
         }
-    }
 
-    // Credits: merykitty
-    private static long extractValue(long regionPtr, OpenMap map, long entryOffset) {
-        long word = U.getLong(regionPtr);
-        if (NATIVE_BYTE_ORDER == ByteOrder.BIG_ENDIAN) {
-            word = Long.reverseBytes(word);
+        private void doProcessTail(long regionPtr1, long regionEnd1, long regionPtr2, long regionEnd2, int vectorSize) {
+            while (regionPtr1 < regionEnd1) {
+                long keyStartPtr1 = regionPtr1;
+                ByteVector keyVector1 = ByteVector.fromMemorySegment(BYTE_SPECIES, NULL, regionPtr1, NATIVE_BYTE_ORDER);
+                int keyLength1 = keyVector1.compare(VectorOperators.EQ, KEY_VALUE_SEPARATOR).firstTrue();
+                if (keyLength1 != vectorSize) {
+                    regionPtr1 += (keyLength1 + 1);
+                }
+                else {
+                    regionPtr1 += vectorSize;
+                    for (; U.getByte(regionPtr1) != KEY_VALUE_SEPARATOR; regionPtr1++)
+                        ;
+                    keyLength1 = (int) (regionPtr1 - keyStartPtr1);
+                    regionPtr1++;
+                }
+                int entryIdx1 = map.calculateEntryIndex(keyStartPtr1, keyLength1);
+                int entryOffset1 = map.putKey(keyVector1, keyStartPtr1, keyLength1, entryIdx1);
+                long word1 = U.getLong(regionPtr1);
+                if (NATIVE_BYTE_ORDER == ByteOrder.BIG_ENDIAN) {
+                    word1 = Long.reverseBytes(word1);
+                }
+                regionPtr1 = extractValue(regionPtr1, word1, map, entryOffset1);
+            }
+            while (regionPtr2 < regionEnd2) {
+                long keyStartPtr2 = regionPtr2;
+                ByteVector keyVector2 = ByteVector.fromMemorySegment(BYTE_SPECIES, NULL, regionPtr2, NATIVE_BYTE_ORDER);
+                int keyLength2 = keyVector2.compare(VectorOperators.EQ, KEY_VALUE_SEPARATOR).firstTrue();
+                if (keyLength2 != vectorSize) {
+                    regionPtr2 += (keyLength2 + 1);
+                }
+                else {
+                    regionPtr2 += vectorSize;
+                    for (; U.getByte(regionPtr2) != KEY_VALUE_SEPARATOR; regionPtr2++)
+                        ;
+                    keyLength2 = (int) (regionPtr2 - keyStartPtr2);
+                    regionPtr2++;
+                }
+                int entryIdx2 = map.calculateEntryIndex(keyStartPtr2, keyLength2);
+                int entryOffset2 = map.putKey(keyVector2, keyStartPtr2, keyLength2, entryIdx2);
+                long word2 = U.getLong(regionPtr2);
+                if (NATIVE_BYTE_ORDER == ByteOrder.BIG_ENDIAN) {
+                    word2 = Long.reverseBytes(word2);
+                }
+                regionPtr2 = extractValue(regionPtr2, word2, map, entryOffset2);
+            }
         }
 
-        // Parse and extract value
-        int decimalSepPos = Long.numberOfTrailingZeros(~word & 0x10101000);
-        int shift = 28 - decimalSepPos;
-        long signed = (~word << 59) >> 63;
-        long designMask = ~(signed & 0xFF);
-        long digits = ((word & designMask) << shift) & 0x0F000F0F00L;
-        long absValue = ((digits * 0x640a0001) >>> 32) & 0x3FF;
-        int value = (int) ((absValue ^ signed) - signed);
-
-        // Put extracted value into map
-        map.putValue(entryOffset, value);
-
-        // Return new position
-        return regionPtr + (decimalSepPos >>> 3) + 3;
     }
 
     /**
-     * Region processor request
+     * Region processor task
      */
-    private static final class Request {
-
-        private final Arena arena;
-        private final Queue<Task> sharedTasks;
-        private final Result result;
-
-        private Request(Arena arena, Queue<Task> sharedTasks, Result result) {
-            this.arena = arena;
-            this.sharedTasks = sharedTasks;
-            this.result = result;
-        }
-
-    }
-
     private static final class Task {
 
         private final FileChannel fileChannel;
@@ -451,6 +550,23 @@ private Task(FileChannel fileChannel, MemorySegment region, long start, long end
 
     }
 
+    /**
+     * Region processor request
+     */
+    private static final class Request {
+
+        private final Arena arena;
+        private final Queue<Task> sharedTasks;
+        private final Result result;
+
+        private Request(Arena arena, Queue<Task> sharedTasks, Result result) {
+            this.arena = arena;
+            this.sharedTasks = sharedTasks;
+            this.result = result;
+        }
+
+    }
+
     /**
      * Region processor response
      */
@@ -555,6 +671,9 @@ private void print() {
 
     }
 
+    /**
+     * Custom map implementation to store results
+     */
     private static final class OpenMap {
 
         // Layout
@@ -585,21 +704,22 @@ private static final class OpenMap {
         private static final int ENTRY_MASK = MAP_SIZE - 1;
         private static final int KEY_ARRAY_OFFSET = KEY_OFFSET - Unsafe.ARRAY_BYTE_BASE_OFFSET;
 
+        private static final int HASH_SEED = 0x9E3779B9;
+        private static final int HASH_ROTATE = 5;
+
         private final byte[] data;
-        private final long[] entryOffsets;
+        private final int[] entryOffsets;
         private int entryOffsetIdx;
 
         private OpenMap() {
             this.data = new byte[MAP_SIZE];
             // Max number of unique keys are 10K, so 1 << 14 (16384) is long enough to hold offsets for all of them
-            this.entryOffsets = new long[1 << 14];
+            this.entryOffsets = new int[1 << 14];
             this.entryOffsetIdx = 0;
         }
 
         // Credits: merykitty
-        private static int calculateKeyHash(long address, int keyLength) {
-            int seed = 0x9E3779B9;
-            int rotate = 5;
+        private int calculateEntryIndex(long address, int keyLength) {
             int x, y;
             if (keyLength >= Integer.BYTES) {
                 x = U.getInt(address);
@@ -609,19 +729,17 @@ private static int calculateKeyHash(long address, int keyLength) {
                 x = U.getByte(address);
                 y = U.getByte(address + keyLength - Byte.BYTES);
             }
-            return (Integer.rotateLeft(x * seed, rotate) ^ y) * seed;
+            // Calculate key hash
+            int keyHash = (Integer.rotateLeft(x * HASH_SEED, HASH_ROTATE) ^ y) * HASH_SEED;
+            // Get the position of the entry in the linear map based on calculated hash
+            return (keyHash & ENTRY_HASH_MASK) << ENTRY_SIZE_SHIFT;
         }
 
-        private long putKey(ByteVector keyVector, long keyStartAddress, int keyLength) {
-            // Calculate hash of key
-            int keyHash = calculateKeyHash(keyStartAddress, keyLength);
-            // and get the position of the entry in the linear map based on calculated hash
-            int idx = (keyHash & ENTRY_HASH_MASK) << ENTRY_SIZE_SHIFT;
-
+        private int putKey(ByteVector keyVector, long keyStartAddress, int keyLength, int entryIdx) {
             // Start searching from the calculated position
             // and continue until find an available slot in case of hash collision
             // TODO Prevent infinite loop if all the slots are in use for other keys
-            for (long entryOffset = Unsafe.ARRAY_BYTE_BASE_OFFSET + idx;; entryOffset = (entryOffset + ENTRY_SIZE) & ENTRY_MASK) {
+            for (int entryOffset = Unsafe.ARRAY_BYTE_BASE_OFFSET + entryIdx;; entryOffset = (entryOffset + ENTRY_SIZE) & ENTRY_MASK) {
                 int keySize = U.getInt(data, entryOffset + KEY_SIZE_OFFSET);
                 // Check whether current index is empty (no another key is inserted yet)
                 if (keySize == 0) {
@@ -633,32 +751,26 @@ private long putKey(ByteVector keyVector, long keyStartAddress, int keyLength) {
                     entryOffsets[entryOffsetIdx++] = entryOffset;
                     return entryOffset;
                 }
-                int keyStartArrayOffset = (int) entryOffset + KEY_ARRAY_OFFSET;
                 // Check for hash collision (hashes are same, but keys are different).
                 // If there is no collision (both hashes and keys are equals), return current slot's offset.
                 // Otherwise, continue iterating until find an available slot.
-                if (keySize == keyLength && keysEqual(keyVector, keyStartAddress, keyLength, keyStartArrayOffset)) {
+                if (keySize == keyLength && keysEqual(keyVector, keyStartAddress, keyLength, entryOffset + KEY_ARRAY_OFFSET)) {
                     return entryOffset;
                 }
             }
         }
 
         private boolean keysEqual(ByteVector keyVector, long keyStartAddress, int keyLength, int keyStartArrayOffset) {
-            int keyCheckIdx = 0;
-            if (keyVector != null) {
-                // Use vectorized search for the comparison of keys.
-                // Since majority of the city names >= 8 bytes and <= 16 bytes,
-                // this way is more efficient (according to my experiments) than any other comparisons (byte by byte or 2 longs).
-                ByteVector entryKeyVector = ByteVector.fromArray(BYTE_SPECIES, data, keyStartArrayOffset);
-                long eqMask = keyVector.compare(VectorOperators.EQ, entryKeyVector).toLong();
-                int eqCount = Long.numberOfTrailingZeros(~eqMask);
-                if (eqCount >= keyLength) {
-                    return true;
-                }
-                else if (keyLength <= BYTE_SPECIES_SIZE) {
-                    return false;
-                }
-                keyCheckIdx = BYTE_SPECIES_SIZE;
+            // Use vectorized search for the comparison of keys.
+            // Since majority of the city names >= 8 bytes and <= 16 bytes,
+            // this way is more efficient (according to my experiments) than any other comparisons (byte by byte or 2 longs).
+            ByteVector entryKeyVector = ByteVector.fromArray(BYTE_SPECIES, data, keyStartArrayOffset);
+            int eqCount = keyVector.compare(VectorOperators.EQ, entryKeyVector).trueCount();
+            if (eqCount == keyLength) {
+                return true;
+            }
+            else if (keyLength <= BYTE_SPECIES_SIZE) {
+                return false;
             }
 
             // Compare remaining parts of the keys
@@ -671,7 +783,7 @@ else if (keyLength <= BYTE_SPECIES_SIZE) {
             long keyStartOffset = keyStartArrayOffset + Unsafe.ARRAY_BYTE_BASE_OFFSET;
             int alignedKeyLength = normalizedKeyLength & 0xFFFFFFF8;
             int i;
-            for (i = keyCheckIdx; i < alignedKeyLength; i += Long.BYTES) {
+            for (i = BYTE_SPECIES_SIZE; i < alignedKeyLength; i += Long.BYTES) {
                 if (U.getLong(keyStartAddress + i) != U.getLong(data, keyStartOffset + i)) {
                     return false;
                 }
@@ -690,18 +802,18 @@ else if (keyLength <= BYTE_SPECIES_SIZE) {
             return wordA == wordB;
         }
 
-        private void putValue(long entryOffset, int value) {
-            long countOffset = entryOffset + COUNT_OFFSET;
+        private void putValue(int entryOffset, int value) {
+            int countOffset = entryOffset + COUNT_OFFSET;
             U.putInt(data, countOffset, U.getInt(data, countOffset) + 1);
-            long minValueOffset = entryOffset + MIN_VALUE_OFFSET;
+            int minValueOffset = entryOffset + MIN_VALUE_OFFSET;
             if (value < U.getShort(data, minValueOffset)) {
                 U.putShort(data, minValueOffset, (short) value);
             }
-            long maxValueOffset = entryOffset + MAX_VALUE_OFFSET;
+            int maxValueOffset = entryOffset + MAX_VALUE_OFFSET;
             if (value > U.getShort(data, maxValueOffset)) {
                 U.putShort(data, maxValueOffset, (short) value);
             }
-            long sumOffset = entryOffset + VALUE_SUM_OFFSET;
+            int sumOffset = entryOffset + VALUE_SUM_OFFSET;
             U.putLong(data, sumOffset, U.getLong(data, sumOffset) + value);
         }
 
@@ -709,13 +821,13 @@ private void merge(Map<String, KeyResult> resultMap) {
             // Merge this local map into global result map
             Arrays.sort(entryOffsets, 0, entryOffsetIdx);
             for (int i = 0; i < entryOffsetIdx; i++) {
-                long entryOffset = entryOffsets[i];
+                int entryOffset = entryOffsets[i];
                 int keyLength = U.getInt(data, entryOffset + KEY_SIZE_OFFSET);
                 if (keyLength == 0) {
                     // No entry is available for this index, so continue iterating
                     continue;
                 }
-                int entryArrayIdx = (int) (entryOffset + KEY_OFFSET - Unsafe.ARRAY_BYTE_BASE_OFFSET);
+                int entryArrayIdx = entryOffset + KEY_OFFSET - Unsafe.ARRAY_BYTE_BASE_OFFSET;
                 String key = new String(data, entryArrayIdx, keyLength, StandardCharsets.UTF_8);
                 int count = U.getInt(data, entryOffset + COUNT_OFFSET);
                 short minValue = U.getShort(data, entryOffset + MIN_VALUE_OFFSET);

From bee7197ce939f430c035d25257c8b7e75e244042 Mon Sep 17 00:00:00 2001
From: Gunnar Morling <gunnar.morling@googlemail.com>
Date: Wed, 31 Jan 2024 10:08:56 +0100
Subject: [PATCH 212/268] Leaderboard update

---
 README.md | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index 699fd5683..792642129 100644
--- a/README.md
+++ b/README.md
@@ -41,19 +41,19 @@ These are the results from running all entries into the challenge on eight cores
 
 | # | Result (m:s.ms) | Implementation     | JDK | Submitter     | Notes     |
 |---|-----------------|--------------------|-----|---------------|-----------|
-| 1 | 00:01.878 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_thomaswue.java)| 21.0.2-graal | [Thomas Wuerthinger](https://github.com/thomaswue), [Quan Anh Mai](https://github.com/merykitty), [Alfonso² Peterssen](https://github.com/mukel) | GraalVM native binary, uses Unsafe |
+| 1 | 00:01.832 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_thomaswue.java)| 21.0.2-graal | [Thomas Wuerthinger](https://github.com/thomaswue), [Quan Anh Mai](https://github.com/merykitty), [Alfonso² Peterssen](https://github.com/mukel) | GraalVM native binary, uses Unsafe |
 | 2 | 00:01.926 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java)| 21.0.2-graal | [Artsiom Korzun](https://github.com/artsiomkorzun) | GraalVM native binary, uses Unsafe |
-| 3 | 00:01.970 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java)| 21.0.2-graal | [Van Phu DO](https://github.com/abeobk) | GraalVM native binary, uses Unsafe |
+| 3 | 00:01.948 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_serkan_ozal.java)| 21.0.1-open | [Serkan ÖZAL](https://github.com/serkan-ozal) | uses Unsafe |
+|   | 00:01.970 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java)| 21.0.2-graal | [Van Phu DO](https://github.com/abeobk) | GraalVM native binary, uses Unsafe |
 |   | 00:02.081 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jerrinot.java)| 21.0.2-graal | [Jaromir Hamala](https://github.com/jerrinot) | GraalVM native binary, uses Unsafe |
 |   | 00:02.157 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_royvanrijn.java)| 21.0.2-graal | [Roy van Rijn](https://github.com/royvanrijn) | GraalVM native binary, uses Unsafe |
-|   | 00:02.188 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_serkan_ozal.java)| 21.0.1-open | [Serkan ÖZAL](https://github.com/serkan-ozal) | uses Unsafe |
 |   | 00:02.205 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_tivrfoa.java)| 21.0.2-graal | [tivrfoa](https://github.com/tivrfoa) | GraalVM native binary, uses Unsafe |
 |   | 00:02.319 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_yavuztas.java)| 21.0.2-graal | [Yavuz Tas](https://github.com/yavuztas) | GraalVM native binary, uses Unsafe |
 |   | 00:02.332 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_mtopolnik.java)| 21.0.2-graal | [Marko Topolnik](https://github.com/mtopolnik) | GraalVM native binary, uses Unsafe |
-|   | 00:02.575 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_merykittyunsafe.java)| 21.0.1-open | [Quan Anh Mai](https://github.com/merykitty) | uses Unsafe |
+|   | 00:02.367 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_merykittyunsafe.java)| 21.0.1-open | [Quan Anh Mai](https://github.com/merykitty) | uses Unsafe |
 |   | 00:02.984 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_yourwass.java)| 21.0.1-open | [yourwass](https://github.com/yourwass) | uses Unsafe |
 |   | 00:03.013 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_linl33.java)| 22.ea.31-open | [Li Lin](https://github.com/linl33) | uses Unsafe |
-|   | 00:03.258 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_merykitty.java)| 21.0.1-open | [Quan Anh Mai](https://github.com/merykitty) |  |
+|   | 00:03.210 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_merykitty.java)| 21.0.1-open | [Quan Anh Mai](https://github.com/merykitty) |  |
 |   | 00:03.298 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_vemanaNonIdiomatic.java)| 21.0.1-graal | [Subrahmanyam (non-idiomatic)](https://github.com/vemana) | uses Unsafe |
 |   | 00:03.431 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_roman_r_m.java)| 21.0.1-graal | [Roman Musin](https://github.com/roman-r-m) | GraalVM native binary, uses Unsafe |
 |   | 00:03.518 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_JamalMulla.java)| 21.0.1-graal | [Jamal Mulla](https://github.com/JamalMulla) | GraalVM native binary, uses Unsafe |
@@ -77,9 +77,9 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:05.180 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ianopolousfast.java)| 21.0.1-open | [Dr Ian Preston](https://github.com/ianopolousfast) |  |
 |   | 00:05.235 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_unbounded.java)| 21.0.1-open | [unbounded](https://github.com/unbounded) |  |
 |   | 00:05.336 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_plevart.java)| 21.0.1-tem | [Peter Levart](https://github.com/plevart) |  |
+|   | 00:05.354 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_armandino.java)| 21.0.2-graal | [Arman Sharif](https://github.com/armandino) | GraalVM native binary, uses Unsafe |
 |   | 00:05.478 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_obourgain.java)| 21.0.1-open | [Olivier Bourgain](https://github.com/obourgain) | uses Unsafe |
 |   | 00:05.705 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_gamlerhart.java)| 21.0.1-open | [Roman Stoffel](https://github.com/gamlerhart) |  |
-|   | 00:05.709 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_armandino.java)| 21.0.2-graal | [Arman Sharif](https://github.com/armandino) | GraalVM native binary, uses Unsafe |
 |   | 00:05.887 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_charlibot.java)| 21.0.1-graal | [Charlie Evans](https://github.com/charlibot) | uses Unsafe |
 |   | 00:05.960 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_vaidhy.java)| 21.0.1-graal | [Vaidhy Mayilrangam](https://github.com/vaidhy) | uses Unsafe |
 |   | 00:05.971 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_melgenek.java)| 21.0.2-open | [Yevhenii Melnyk](https://github.com/melgenek) |  |
@@ -93,6 +93,7 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:06.884 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_rcasteltrione.java)| 21.0.1-graal | [rcasteltrione](https://github.com/rcasteltrione) |  |
 |   | 00:07.563 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_3j5a.java)| 21.0.1-graal | [3j5a](https://github.com/3j5a) |  |
 |   | 00:07.680 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_C5H12O5.java)| 21.0.1-graal | [Xylitol](https://github.com/C5H12O5) | uses Unsafe |
+|   | 00:07.712 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_anitasv.java)| 21.0.1-graal | [Anita SV](https://github.com/anitasv) |  |
 |   | 00:07.730 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jotschi.java)| 21.0.1-open | [Johannes Schüth](https://github.com/jotschi) |  |
 |   | 00:07.894 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_tonivade.java)| 21.0.2-tem | [Antonio Muñoz](https://github.com/tonivade) |  |
 |   | 00:07.925 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ricardopieper.java)| 21.0.1-graal | [Ricardo Pieper](https://github.com/ricardopieper) |  |
@@ -104,7 +105,6 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:08.517 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ags313.java)| 21.0.1-graal | [ags](https://github.com/ags313) | uses Unsafe |
 |   | 00:08.557 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_adriacabeza.java)| 21.0.1-graal | [Adrià Cabeza](https://github.com/adriacabeza) |  |
 |   | 00:08.622 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_kuduwa_keshavram.java)| 21.0.1-graal | [Keshavram Kuduwa](https://github.com/kuduwa-keshavram) | uses Unsafe |
-|   | 00:08.752 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_anitasv.java)| 21.0.1-graal | [Anita SV](https://github.com/anitasv) |  |
 |   | 00:08.892 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_fatroom.java)| 21.0.1-open | [Roman Romanchuk](https://github.com/fatroom) |  |
 |   | 00:09.020 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_yemreinci.java)| 21.0.1-open | [yemreinci](https://github.com/yemreinci) |  |
 |   | 00:09.071 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_gabrielreid.java)| 21.0.1-open | [Gabriel Reid](https://github.com/gabrielreid) |  |

From afb7e2c390a5124f64d6fa1b7bf2afd50979f068 Mon Sep 17 00:00:00 2001
From: Gunnar Morling <gunnar.morling@googlemail.com>
Date: Wed, 31 Jan 2024 11:59:00 +0100
Subject: [PATCH 213/268] Update README.md

---
 README.md | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/README.md b/README.md
index 792642129..586c60eb5 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,11 @@
 # 1️⃣🐝🏎️ The One Billion Row Challenge
 
+_Status Jan 31: The challenge will close today at midnight CET. No new pull requests will be accepted after that time. Pending PRs will be evaluated over the next few days. Please don't push any changes to pending PRs after today, unless being asked to do so.
+This will be the case if I spot an issue during evaluation (failing tests, etc.). In this case, I will comment on the PR, and you are allowed to push one update.
+I will re-evaluate the entry, and if there are still remaining issues, you'll get one more and last opportunity to update the PR.
+If it still is not valid at this point, it will be closed.
+The final leader board will be published on Monday Feb 5._
+
 _Status Jan 12: As there has been such a large number of entries to this challenge so far (100+), and this is becoming hard to manage, please only create new submissions if you expect them to run in 10 seconds or less on the evaluation machine._
 
 _Status Jan 1: This challenge is [open for submissions](https://www.morling.dev/blog/one-billion-row-challenge/)!_

From ab2a9a6fe5218dc89da96dbc6e79c9f147376e67 Mon Sep 17 00:00:00 2001
From: Gunnar Morling <gunnar.morling@googlemail.com>
Date: Wed, 31 Jan 2024 16:15:50 +0100
Subject: [PATCH 214/268] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 586c60eb5..27f6e9c4d 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 # 1️⃣🐝🏎️ The One Billion Row Challenge
 
-_Status Jan 31: The challenge will close today at midnight CET. No new pull requests will be accepted after that time. Pending PRs will be evaluated over the next few days. Please don't push any changes to pending PRs after today, unless being asked to do so.
+_Status Jan 31: The challenge will close today at midnight UTC. No new pull requests will be accepted after that time. Pending PRs will be evaluated over the next few days. Please don't push any changes to pending PRs after today, unless being asked to do so.
 This will be the case if I spot an issue during evaluation (failing tests, etc.). In this case, I will comment on the PR, and you are allowed to push one update.
 I will re-evaluate the entry, and if there are still remaining issues, you'll get one more and last opportunity to update the PR.
 If it still is not valid at this point, it will be closed.

From 0c5c22882b6750b0e3603e327694fbf4401c15d8 Mon Sep 17 00:00:00 2001
From: Dr Ian Preston <157221403+ianopolousfast@users.noreply.github.com>
Date: Wed, 31 Jan 2024 16:06:05 +0000
Subject: [PATCH 215/268] Process two consecutive lines at a time (#651)

Use a better hash function

Don't return index from temperature parsing
extra JVM args

Co-authored-by: Ian Preston <ianopolous@protonmail.com>
---
 calculate_average_ianopolousfast.sh           |   2 +
 .../CalculateAverage_ianopolousfast.java      | 150 ++++++++----------
 2 files changed, 65 insertions(+), 87 deletions(-)

diff --git a/calculate_average_ianopolousfast.sh b/calculate_average_ianopolousfast.sh
index 06c31d9e5..4ed77c70a 100755
--- a/calculate_average_ianopolousfast.sh
+++ b/calculate_average_ianopolousfast.sh
@@ -16,4 +16,6 @@
 #
 
 JAVA_OPTS="--enable-preview --add-modules=jdk.incubator.vector"
+#-Djdk.incubator.vector.VECTOR_ACCESS_OOB_CHECK=0 -XX:-UseTransparentHugePages"
+
 java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_ianopolousfast
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_ianopolousfast.java b/src/main/java/dev/morling/onebrc/CalculateAverage_ianopolousfast.java
index 417abcfbe..92e2f6ecb 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_ianopolousfast.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_ianopolousfast.java
@@ -19,7 +19,6 @@
 import jdk.incubator.vector.VectorOperators;
 import jdk.incubator.vector.VectorSpecies;
 
-import java.io.IOException;
 import java.lang.foreign.Arena;
 import java.lang.foreign.MemorySegment;
 import java.nio.ByteOrder;
@@ -39,10 +38,7 @@
  * * read chunks in parallel
  * * minimise allocation
  * * no unsafe
- *
- * Timings on 4 core i7-7500U CPU @ 2.70GHz:
- * average_baseline: 4m48s
- * ianopolous:         13.8s
+ * * process multiple lines in each thread for better ILP
 */
 public class CalculateAverage_ianopolousfast {
 
@@ -91,11 +87,22 @@ public static boolean matchingStationBytes(long start, long end, MemorySegment b
         return true;
     }
 
-    private static int hashToIndex(long hash, int len) {
-        // From Thomas Wuerthinger's entry
-        int hashAsInt = (int) (hash ^ (hash >>> 28));
-        int finalHash = (hashAsInt ^ (hashAsInt >>> 15));
-        return (finalHash & (len - 1));
+    private static final int GOLDEN_RATIO = 0x9E3779B9;
+    private static final int HASH_LROTATE = 5;
+
+    // hash from giovannicuccu
+    private static int hash(MemorySegment memorySegment, long start, int len) {
+        int x;
+        int y;
+        if (len >= Integer.BYTES) {
+            x = memorySegment.get(JAVA_INT_UNALIGNED, start);
+            y = memorySegment.get(JAVA_INT_UNALIGNED, start + len - Integer.BYTES);
+        }
+        else {
+            x = memorySegment.get(JAVA_BYTE, start);
+            y = memorySegment.get(JAVA_BYTE, start + len - Byte.BYTES);
+        }
+        return (Integer.rotateLeft(x * GOLDEN_RATIO, HASH_LROTATE) ^ y) * GOLDEN_RATIO;
     }
 
     public static Stat createStation(long start, long end, MemorySegment buffer) {
@@ -105,8 +112,9 @@ public static Stat createStation(long start, long end, MemorySegment buffer) {
         return new Stat(stationBuffer);
     }
 
-    public static Stat dedupeStation(long start, long end, long hash, MemorySegment buffer, Stat[] stations) {
-        int index = hashToIndex(hash, MAX_STATIONS);
+    public static Stat dedupeStation(long start, long end, MemorySegment buffer, Stat[] stations) {
+        int hash = hash(buffer, start, (int) (end - start));
+        int index = hash & (MAX_STATIONS - 1);
         Stat match = stations[index];
         while (match != null) {
             if (matchingStationBytes(start, end, buffer, match))
@@ -119,37 +127,11 @@ public static Stat dedupeStation(long start, long end, long hash, MemorySegment
         return res;
     }
 
-    static long maskHighBytes(long d, int nbytes) {
-        return d & (-1L << ((8 - nbytes) * 8));
-    }
-
-    public static Stat parseStation(long lineStart, MemorySegment buffer, Stat[] stations) {
-        ByteVector line = ByteVector.fromMemorySegment(BYTE_SPECIES, buffer, lineStart, ByteOrder.nativeOrder());
-        int keySize = line.compare(VectorOperators.EQ, ';').firstTrue();
-
-        long first8 = buffer.get(LONG_LAYOUT, lineStart);
-        long second8 = 0;
-        if (keySize <= 8) {
-            first8 = maskHighBytes(first8, keySize & 0x07);
-        }
-        else if (keySize < 16) {
-            second8 = maskHighBytes(buffer.get(LONG_LAYOUT, lineStart + 8), keySize & 0x07);
-        }
-        else if (keySize == BYTE_SPECIES.vectorByteSize()) {
-            while (buffer.get(JAVA_BYTE, lineStart + keySize) != ';') {
-                keySize++;
-            }
-            second8 = maskHighBytes(buffer.get(LONG_LAYOUT, lineStart + 8), keySize & 0x07);
-        }
-        long hash = first8 ^ second8; // todo include later bytes
-        return dedupeStation(lineStart, lineStart + keySize, hash, buffer, stations);
-    }
-
     public static short getMinus(long d) {
         return ((d & 0xff00000000000000L) ^ 0x2d00000000000000L) != 0 ? 0 : (short) -1;
     }
 
-    public static long processTemperature(long lineSplit, int size, MemorySegment buffer, Stat station) {
+    public static void processTemperature(long lineSplit, int size, MemorySegment buffer, Stat station) {
         long d = buffer.get(LONG_LAYOUT, lineSplit);
         // negative is either 0 or -1
         short negative = getMinus(d);
@@ -162,10 +144,9 @@ public static long processTemperature(long lineSplit, int size, MemorySegment bu
                 100 * (((byte) (d >> 24)) - '0'));
         temperature = (short) ((temperature ^ negative) - negative); // negative treatment inspired by merkitty
         station.add(temperature);
-        return lineSplit + size + 1;
     }
 
-    private static long parseLine(long lineStart, MemorySegment buffer, Stat[] stations) {
+    private static int lineSize(long lineStart, MemorySegment buffer) {
         ByteVector line = ByteVector.fromMemorySegment(BYTE_SPECIES, buffer, lineStart, ByteOrder.nativeOrder());
         int lineSize = line.compare(VectorOperators.EQ, '\n').firstTrue();
         int index = lineSize;
@@ -174,33 +155,19 @@ private static long parseLine(long lineStart, MemorySegment buffer, Stat[] stati
                     ByteOrder.nativeOrder()).compare(VectorOperators.EQ, '\n').firstTrue();
             lineSize += index;
         }
-        int keySize = lineSize - 6 + ByteVector.fromMemorySegment(BYTE_SPECIES, buffer, lineStart + lineSize - 6,
-                ByteOrder.nativeOrder()).compare(VectorOperators.EQ, ';').firstTrue();
+        return lineSize;
+    }
 
-        long first8 = buffer.get(LONG_LAYOUT, lineStart);
-        long second8 = 0;
-        if (keySize <= 8) {
-            first8 = maskHighBytes(first8, keySize & 0x07);
-        }
-        else if (keySize < 16) {
-            second8 = maskHighBytes(buffer.get(LONG_LAYOUT, lineStart + 8), keySize & 0x07);
-        }
-        else if (keySize == BYTE_SPECIES.vectorByteSize()) {
-            while (buffer.get(JAVA_BYTE, lineStart + keySize) != ';') {
-                keySize++;
-            }
-            second8 = maskHighBytes(buffer.get(LONG_LAYOUT, lineStart + 8), keySize & 0x07);
-        }
-        long hash = first8 ^ second8; // todo include later bytes
-        Stat station = dedupeStation(lineStart, lineStart + keySize, hash, buffer, stations);
-        return processTemperature(lineStart + keySize + 1, lineSize - keySize - 1, buffer, station);
+    private static int keySize(int lineSize, long lineStart, MemorySegment buffer) {
+        return lineSize - 6 + ByteVector.fromMemorySegment(BYTE_SPECIES, buffer, lineStart + lineSize - 6,
+                ByteOrder.nativeOrder()).compare(VectorOperators.EQ, ';').firstTrue();
     }
 
-    public static Stat[] parseStats(long startByte, long endByte, MemorySegment buffer) {
+    public static Stat[] parseStats(long start1, long end2, MemorySegment buffer) {
         // read first partial line
-        if (startByte > 0) {
+        if (start1 > 0) {
             for (int i = 0; i < MAX_LINE_LENGTH; i++) {
-                byte b = buffer.get(JAVA_BYTE, startByte++);
+                byte b = buffer.get(JAVA_BYTE, start1++);
                 if (b == '\n') {
                     break;
                 }
@@ -213,38 +180,47 @@ public static Stat[] parseStats(long startByte, long endByte, MemorySegment buff
         // this allows us to not worry about reading beyond the end
         // in the inner loop (reducing branches)
         // We need at least the vector lane size bytes back
-        if (endByte == buffer.byteSize()) {
+        if (end2 == buffer.byteSize()) {
             // reverse at least vector lane width
-            endByte = Math.max(buffer.byteSize() - BYTE_SPECIES.vectorByteSize(), 0);
-            while (endByte > 0 && buffer.get(JAVA_BYTE, endByte) != '\n')
-                endByte--;
+            end2 = Math.max(buffer.byteSize() - 2 * BYTE_SPECIES.vectorByteSize(), 0);
+            while (end2 > 0 && buffer.get(JAVA_BYTE, end2) != '\n')
+                end2--;
 
-            if (endByte > 0)
-                endByte++;
+            if (end2 > 0)
+                end2++;
             // copy into a larger buffer to avoid reading off end
-            MemorySegment end = Arena.global().allocate(MAX_LINE_LENGTH + BYTE_SPECIES.vectorByteSize());
-            for (long i = endByte; i < buffer.byteSize(); i++)
-                end.set(JAVA_BYTE, i - endByte, buffer.get(JAVA_BYTE, i));
+            MemorySegment end = Arena.global().allocate(MAX_LINE_LENGTH + 2 * BYTE_SPECIES.vectorByteSize());
+            for (long i = end2; i < buffer.byteSize(); i++)
+                end.set(JAVA_BYTE, i - end2, buffer.get(JAVA_BYTE, i));
             int index = 0;
-            while (endByte + index < buffer.byteSize()) {
-                Stat station = parseStation(index, end, stations);
-                int tempSize = 3;
-                if (end.get(JAVA_BYTE, index + station.namelen + 5) == '\n')
-                    tempSize = 4;
-                if (end.get(JAVA_BYTE, index + station.namelen + 6) == '\n')
-                    tempSize = 5;
-                index = (int) processTemperature(index + station.namelen + 1, tempSize, end, station);
+            while (end2 + index < buffer.byteSize()) {
+                int lineSize1 = lineSize(index, end);
+                int semiSearchStart = index + Math.max(0, lineSize1 - 6);
+                int keySize1 = semiSearchStart - index + ByteVector.fromMemorySegment(BYTE_SPECIES, end, semiSearchStart,
+                        ByteOrder.nativeOrder()).compare(VectorOperators.EQ, ';').firstTrue();
+                Stat station1 = dedupeStation(index, index + keySize1, end, stations);
+                processTemperature(index + keySize1 + 1, lineSize1 - keySize1 - 1, end, station1);
+                index += lineSize1 + 1;
             }
         }
 
-        innerloop(startByte, endByte, buffer, stations);
-        return stations;
-    }
-
-    private static void innerloop(long startByte, long endByte, MemorySegment buffer, Stat[] stations) {
-        while (startByte < endByte) {
-            startByte = parseLine(startByte, buffer, stations);
+        while (start1 < end2) {
+            int lineSize1 = lineSize(start1, buffer);
+            long start2 = start1 + lineSize1 + 1;
+            int lineSize2 = start2 < end2 ? lineSize(start2, buffer) : 0;
+            int keySize1 = keySize(lineSize1, start1, buffer);
+            int keySize2 = keySize(lineSize2, start2, buffer);
+            Stat station1 = dedupeStation(start1, start1 + keySize1, buffer, stations);
+            processTemperature(start1 + keySize1 + 1, lineSize1 - keySize1 - 1, buffer, station1);
+            if (start2 < end2) {
+                Stat station2 = dedupeStation(start2, start2 + keySize2, buffer, stations);
+                processTemperature(start2 + keySize2 + 1, lineSize2 - keySize2 - 1, buffer, station2);
+                start1 = start2 + lineSize2 + 1;
+            }
+            else
+                start1 += lineSize1 + 1;
         }
+        return stations;
     }
 
     public static class Stat {

From f55317973c1a134daea9e8225cf6631345e6cada Mon Sep 17 00:00:00 2001
From: nicky <nickyreinert@gmail.com>
Date: Wed, 31 Jan 2024 17:07:29 +0100
Subject: [PATCH 216/268] batched writing to disk (#659)

Instead of writing the result line by line, use random.choices to pick multiple stations at once and write large batches to the disk. Also, instead of "round", just use :.1f, which is probably quicker on a large scale because it's not a mathematical function.
---
 src/main/python/create_measurements.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/src/main/python/create_measurements.py b/src/main/python/create_measurements.py
index 4125828e8..26ec768a0 100755
--- a/src/main/python/create_measurements.py
+++ b/src/main/python/create_measurements.py
@@ -110,15 +110,18 @@ def build_test_data(weather_station_names, num_rows_to_create):
     coldest_temp = -99.9
     hottest_temp = 99.9
     station_names_10k_max = random.choices(weather_station_names, k=10_000)
-    progress_step = max(1, int(num_rows_to_create / 100))
+    batch_size = 10000 # instead of writing line by line to file, process a batch of stations and put it to disk
+    progress_step = max(1, (num_rows_to_create // batch_size) // 100)
     print('Building test data...')
 
     try:
         with open("../../../data/measurements.txt", 'w') as file:
-            for s in range(0,num_rows_to_create):
-                random_station = random.choice(station_names_10k_max)
-                random_temp = round(random.uniform(coldest_temp, hottest_temp), 1)
-                file.write(f"{random_station};{random_temp}\n")
+            for s in range(0,num_rows_to_create // batch_size):
+                
+                batch = random.choices(station_names_10k_max, k=batch_size)
+                prepped_deviated_batch = '\n'.join([f"{station};{random.uniform(coldest_temp, hottest_temp):.1f}" for station in batch]) # :.1f should quicker than round on a large scale, because round utilizes mathematical operation
+                file.write(prepped_deviated_batch + '\n')
+                
                 # Update progress bar every 1%
                 if s % progress_step == 0 or s == num_rows_to_create - 1:
                     sys.stdout.write('\r')

From 5728ca9482dbd4257d6cf23197bede18ab1f1db1 Mon Sep 17 00:00:00 2001
From: Andrzej Nestoruk <and.nestoruk@gmail.com>
Date: Wed, 31 Jan 2024 17:26:22 +0100
Subject: [PATCH 217/268] copy city byte array only when creating a new record
 (#653)

---
 .../onebrc/CalculateAverage_anestoruk.java    | 24 ++++++++++++++-----
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_anestoruk.java b/src/main/java/dev/morling/onebrc/CalculateAverage_anestoruk.java
index 293087e2f..07440d7b0 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_anestoruk.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_anestoruk.java
@@ -49,7 +49,7 @@ public static void main(String[] args) throws IOException {
 
         try (FileChannel channel = FileChannel.open(Path.of(path))) {
             final long fileSize = channel.size();
-            final long chunkSize = fileSize > 10_000 ? min(Integer.MAX_VALUE - 256, fileSize / cpus) : fileSize;
+            final long chunkSize = calculateChunkSize(fileSize);
             final int chunks = (int) ceil((double) fileSize / chunkSize);
             segment = channel.map(READ_ONLY, 0, fileSize, Arena.global());
             long startOffset = 0;
@@ -85,6 +85,18 @@ public static void main(String[] args) throws IOException {
         System.out.println(result);
     }
 
+    private static long calculateChunkSize(long fileSize) {
+        int divisor = cpus;
+        long chunkSize;
+        if (fileSize > 10_000) {
+            while ((chunkSize = fileSize / divisor) > Integer.MAX_VALUE - 512) {
+                divisor *= 2;
+            }
+            return chunkSize;
+        }
+        return fileSize;
+    }
+
     private static Record[] process(SegmentRange range, MemorySegment segment) {
         Record[] records = new Record[1024 * 100];
         byte[] cityBuffer = new byte[100];
@@ -113,23 +125,23 @@ private static Record[] process(SegmentRange range, MemorySegment segment) {
                 }
             }
             int temperature = negative ? -value : value;
-            byte[] city = new byte[cityLength];
-            System.arraycopy(cityBuffer, 0, city, 0, cityLength);
-            addResult(records, hash, city, temperature);
+            addRecord(records, hash, cityBuffer, cityLength, temperature);
         }
         return records;
     }
 
-    private static void addResult(Record[] records, int hash, byte[] city, int temperature) {
+    private static void addRecord(Record[] records, int hash, byte[] cityBuffer, int cityLength, int temperature) {
         int idx = hash % records.length;
         Record record;
         while ((record = records[idx]) != null) {
-            if (record.hash == hash && Arrays.equals(record.city, city)) {
+            if (record.hash == hash && Arrays.equals(record.city, 0, record.city.length, cityBuffer, 0, cityLength)) {
                 record.add(temperature);
                 return;
             }
             idx = (idx + 1) % records.length;
         }
+        byte[] city = new byte[cityLength];
+        System.arraycopy(cityBuffer, 0, city, 0, cityLength);
         records[idx] = new Record(hash, city, temperature);
     }
 

From e2ee4cbc30fb010097ab4fd3100d72e3e0d47239 Mon Sep 17 00:00:00 2001
From: John Ziamos <iziamos@gmail.com>
Date: Wed, 31 Jan 2024 16:41:40 +0000
Subject: [PATCH 218/268] give in to the graal (#660)

---
 calculate_average_iziamos.sh                  | 23 +++--
 prepare_iziamos.sh                            | 24 ++++++
 .../onebrc/CalculateAverage_iziamos.java      | 85 ++++++++-----------
 3 files changed, 74 insertions(+), 58 deletions(-)
 create mode 100755 prepare_iziamos.sh

diff --git a/calculate_average_iziamos.sh b/calculate_average_iziamos.sh
index 755dddc77..0f9178797 100755
--- a/calculate_average_iziamos.sh
+++ b/calculate_average_iziamos.sh
@@ -15,11 +15,18 @@
 #  limitations under the License.
 #
 
-JAVA_OPTS="--enable-preview
-  -XX:+UnlockExperimentalVMOptions \
-  -XX:+UseEpsilonGC -Xms16m -Xmx16m -XX:-AlwaysPreTouch \
-  -XX:-TieredCompilation -XX:CICompilerCount=1 -XX:CompilationMode=high-only \
-  -XX:C1MaxTrivialSize=500 -XX:-UseCountedLoopSafepoints -XX:+UseCMoveUnconditionally -XX:+DisableAttachMechanism \
-  -XX:-PreserveFramePointer -Xnoclassgc -disablesystemassertions -XX:-UsePerfData  \
-  -XX:-UseTransparentHugePages -XX:-UseCompressedOops"
-java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_iziamos
+
+if [ -f target/CalculateAverage_iziamos_image ]; then
+    echo "Using graal" 1>&2
+    target/CalculateAverage_iziamos_image
+else
+    echo "Using openjdk" 1>&2
+    JAVA_OPTS="--enable-preview
+      -XX:+UnlockExperimentalVMOptions \
+      -XX:+UseEpsilonGC -Xms16m -Xmx16m -XX:-AlwaysPreTouch \
+      -XX:-TieredCompilation -XX:CICompilerCount=1 -XX:CompilationMode=high-only \
+      -XX:C1MaxTrivialSize=500 -XX:-UseCountedLoopSafepoints -XX:+UseCMoveUnconditionally -XX:+DisableAttachMechanism \
+      -XX:-PreserveFramePointer -Xnoclassgc -disablesystemassertions -XX:-UsePerfData  \
+      -XX:-UseTransparentHugePages -XX:-UseCompressedOops"
+    java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_iziamos
+fi
diff --git a/prepare_iziamos.sh b/prepare_iziamos.sh
new file mode 100755
index 000000000..621937ca2
--- /dev/null
+++ b/prepare_iziamos.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+source "$HOME/.sdkman/bin/sdkman-init.sh"
+sdk use java 21.0.2-graal 1>&2
+
+if [ ! -f target/CalculateAverage_iziamos_image ]; then
+    NATIVE_IMAGE_OPTS="-H:+UnlockExperimentalVMOptions --gc=epsilon -O3 -march=native -R:MaxHeapSize=64m -H:-GenLoopSafepoints --enable-preview -H:InlineAllBonus=10 -H:-ParseRuntimeOptions --initialize-at-build-time=dev.morling.onebrc.CalculateAverage_iziamos"
+    native-image $NATIVE_IMAGE_OPTS -cp target/average-1.0.0-SNAPSHOT.jar -o target/CalculateAverage_iziamos_image dev.morling.onebrc.CalculateAverage_iziamos
+fi
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_iziamos.java b/src/main/java/dev/morling/onebrc/CalculateAverage_iziamos.java
index faa60547d..f4ca68f36 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_iziamos.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_iziamos.java
@@ -15,12 +15,9 @@
  */
 package dev.morling.onebrc;
 
-import sun.misc.Unsafe;
-
 import java.io.IOException;
+import java.io.UncheckedIOException;
 import java.lang.foreign.Arena;
-import java.lang.foreign.MemorySegment;
-import java.lang.reflect.Field;
 import java.nio.ByteOrder;
 import java.nio.channels.FileChannel;
 import java.nio.file.Files;
@@ -34,45 +31,42 @@
 import static java.nio.file.StandardOpenOption.READ;
 
 public class CalculateAverage_iziamos {
-    private static final Unsafe UNSAFE;
+    private static final sun.misc.Unsafe UNSAFE = initUnsafe();
 
-    private static final String FILE = "./measurements.txt";
-    private static final Arena GLOBAL_ARENA = Arena.global();
-    private final static MemorySegment WHOLE_FILE_SEGMENT;
-    private final static long FILE_SIZE;
-    private final static long BASE_POINTER;
-    private final static long END_POINTER;
-
-    static {
+    private static sun.misc.Unsafe initUnsafe() {
         try {
-            final Field theUnsafe = Unsafe.class.getDeclaredField("theUnsafe");
+            java.lang.reflect.Field theUnsafe = sun.misc.Unsafe.class.getDeclaredField("theUnsafe");
             theUnsafe.setAccessible(true);
-            UNSAFE = (Unsafe) theUnsafe.get(Unsafe.class);
-
-            final var fileChannel = (FileChannel) Files.newByteChannel(Path.of(FILE), READ);
-            WHOLE_FILE_SEGMENT = fileChannel.map(READ_ONLY, 0, fileChannel.size(), GLOBAL_ARENA);
-
+            return (sun.misc.Unsafe) theUnsafe.get(sun.misc.Unsafe.class);
         }
-        catch (final NoSuchFieldException | IllegalAccessException | IOException e) {
+        catch (NoSuchFieldException | IllegalAccessException e) {
             throw new RuntimeException(e);
         }
-
-        FILE_SIZE = WHOLE_FILE_SEGMENT.byteSize();
-        BASE_POINTER = WHOLE_FILE_SEGMENT.address();
-        END_POINTER = BASE_POINTER + FILE_SIZE;
     }
 
-    private static final long CHUNK_SIZE = 64 * 1024 * 1024;
-    // private static final long CHUNK_SIZE = Long.MAX_VALUE;
+    private static final String FILE = "./measurements.txt";
+    private static final Arena GLOBAL_ARENA = Arena.global();
 
     public static void main(String[] args) throws Exception {
-        // Thread.sleep(10_000);
+        // final long chunkSize = Long.MAX_VALUE;
+        final long chunkSize = 64 * 1024 * 1024;
+
+        final FileChannel fileChannel;
+        try {
+            fileChannel = (FileChannel) Files.newByteChannel(Path.of(FILE), READ);
+        }
+        catch (final IOException e) {
+            throw new UncheckedIOException(e);
+        }
 
-        final long threadCount = 1 + FILE_SIZE / CHUNK_SIZE;
+        final var seg = fileChannel.map(READ_ONLY, 0, fileChannel.size(), GLOBAL_ARENA);
+
+        final long fileSize = seg.byteSize();
+        final long threadCount = 1 + fileSize / chunkSize;
 
         final var processingFutures = new CompletableFuture[(int) threadCount];
         for (int i = 0; i < threadCount; ++i) {
-            processingFutures[i] = processSegment(i, CHUNK_SIZE);
+            processingFutures[i] = processSegment(seg.address(), seg.address() + fileSize, i, chunkSize);
         }
 
         final long aggregate = (long) processingFutures[0].get();
@@ -102,15 +96,18 @@ private double round(double value) {
         }
     }
 
-    private static CompletableFuture<Long> processSegment(final long chunkNumber, final long chunkSize) {
+    private static CompletableFuture<Long> processSegment(final long basePointer,
+                                                          final long endPointer,
+                                                          final long chunkNumber,
+                                                          final long chunkSize) {
         final var ret = new CompletableFuture<Long>();
 
         Thread.ofVirtual().start(() -> {
             final long relativeStart = chunkNumber * chunkSize;
-            final long absoluteStart = BASE_POINTER + relativeStart;
+            final long absoluteStart = basePointer + relativeStart;
 
-            final long absoluteEnd = computeAbsoluteEndWithSlack(absoluteStart + chunkSize);
-            final long startOffsetAfterSkipping = skipIncomplete(WHOLE_FILE_SEGMENT.address(), absoluteStart);
+            final long absoluteEnd = computeAbsoluteEndWithSlack(absoluteStart + chunkSize, endPointer);
+            final long startOffsetAfterSkipping = skipIncomplete(basePointer, absoluteStart);
 
             final long result = processEvents(startOffsetAfterSkipping, absoluteEnd);
             ret.complete(result);
@@ -119,8 +116,8 @@ private static CompletableFuture<Long> processSegment(final long chunkNumber, fi
         return ret;
     }
 
-    private static long computeAbsoluteEndWithSlack(final long chunk) {
-        return Long.compareUnsigned(END_POINTER, chunk) > 0 ? chunk : END_POINTER;
+    private static long computeAbsoluteEndWithSlack(final long chunk, final long endPointer) {
+        return Long.compareUnsigned(endPointer, chunk) > 0 ? chunk : endPointer;
     }
 
     private static long skipIncomplete(final long basePointer, final long start) {
@@ -142,7 +139,7 @@ private static long processEvents(final long start, final long limit) {
     }
 
     private static void scalarLoop(final long start, final long limit, final long result) {
-        final LoopCursor cursor = new ScalarLoopCursor(start, limit);
+        final LoopCursor cursor = new LoopCursor(start, limit);
         while (cursor.hasMore()) {
             final long address = cursor.getCurrentAddress();
             final int length = cursor.getStringLength();
@@ -152,25 +149,13 @@ private static void scalarLoop(final long start, final long limit, final long re
         }
     }
 
-    public interface LoopCursor {
-        long getCurrentAddress();
-
-        int getStringLength();
-
-        int getHash();
-
-        int getCurrentValue();
-
-        boolean hasMore();
-    }
-
-    public static class ScalarLoopCursor implements LoopCursor {
+    public static class LoopCursor {
         private long pointer;
         private final long limit;
 
         private int hash = 0;
 
-        public ScalarLoopCursor(final long pointer, final long limit) {
+        public LoopCursor(final long pointer, final long limit) {
             this.pointer = pointer;
             this.limit = limit;
         }

From 11a89d6cb81c513a9c1624740a97b43fc737417b Mon Sep 17 00:00:00 2001
From: zerninv <zerninvasilii@yandex.ru>
Date: Wed, 31 Jan 2024 16:44:50 +0000
Subject: [PATCH 219/268] Attempt to fix segfault CalculateAverage_zerninv.java
 (#635)

* attempt to fix segfault, graal native

* fix last bytes for last line handler

* fix typo

* one more attempt
---
 calculate_average_zerninv.sh                  | 10 ++-
 prepare_zerninv.sh                            |  7 +-
 .../onebrc/CalculateAverage_zerninv.java      | 72 ++++++++++++-------
 3 files changed, 59 insertions(+), 30 deletions(-)

diff --git a/calculate_average_zerninv.sh b/calculate_average_zerninv.sh
index 2b76c7d7d..6dbda3022 100755
--- a/calculate_average_zerninv.sh
+++ b/calculate_average_zerninv.sh
@@ -15,5 +15,11 @@
 #  limitations under the License.
 #
 
-JAVA_OPTS="--enable-preview"
-java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_zerninv
\ No newline at end of file
+if [ -f target/CalculateAverage_zerninv_image ]; then
+    echo "Picking up existing native image 'target/CalculateAverage_zerninv_image', delete the file to select JVM mode." 1>&2
+    target/CalculateAverage_zerninv_image
+else
+    JAVA_OPTS="--enable-preview -Xmx512m -XX:+UseSerialGC -XX:-TieredCompilation"
+    echo "Chosing to run the app in JVM mode as no native image was found, use prepare_zerninv.sh to generate." 1>&2
+    java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_zerninv
+fi
\ No newline at end of file
diff --git a/prepare_zerninv.sh b/prepare_zerninv.sh
index cd3641e0e..ae7343301 100755
--- a/prepare_zerninv.sh
+++ b/prepare_zerninv.sh
@@ -17,4 +17,9 @@
 
 
 source "$HOME/.sdkman/bin/sdkman-init.sh"
-sdk use java 21.0.1-graal 1>&2
\ No newline at end of file
+sdk use java 21.0.2-graal 1>&2
+
+if [ ! -f target/CalculateAverage_zerninv_image ]; then
+    NATIVE_IMAGE_OPTS="--gc=epsilon -O3 -march=native -R:MaxHeapSize=512m -H:-GenLoopSafepoints --enable-preview --initialize-at-build-time=dev.morling.onebrc.CalculateAverage_zerninv"
+    native-image $NATIVE_IMAGE_OPTS -cp target/average-1.0.0-SNAPSHOT.jar -o target/CalculateAverage_zerninv_image dev.morling.onebrc.CalculateAverage_zerninv
+fi
\ No newline at end of file
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_zerninv.java b/src/main/java/dev/morling/onebrc/CalculateAverage_zerninv.java
index b28750f77..47974ce67 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_zerninv.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_zerninv.java
@@ -56,8 +56,7 @@ public static void main(String[] args) throws IOException, InterruptedException
                 tasks[i] = new TaskThread((int) (fileSize / minChunkSize / CORES + 1));
             }
 
-            var results = new HashMap<String, TemperatureAggregation>();
-            var chunks = splitByChunks(segment.address(), segment.address() + fileSize, minChunkSize, results);
+            var chunks = splitByChunks(segment.address(), segment.address() + fileSize, minChunkSize);
             for (int i = 0; i < chunks.size() - 1; i++) {
                 var task = tasks[i % tasks.length];
                 task.addChunk(chunks.get(i), chunks.get(i + 1));
@@ -67,6 +66,7 @@ public static void main(String[] args) throws IOException, InterruptedException
                 task.start();
             }
 
+            var results = new HashMap<String, TemperatureAggregation>();
             for (var task : tasks) {
                 task.join();
                 task.collectTo(results);
@@ -79,31 +79,8 @@ public static void main(String[] args) throws IOException, InterruptedException
         }
     }
 
-    private static List<Long> splitByChunks(long address, long end, long minChunkSize, Map<String, TemperatureAggregation> results) {
-        // handle last line
-        long offset = end - 1;
-        int temperature = 0;
-        byte b;
-        int multiplier = 1;
-        while ((b = UNSAFE.getByte(offset--)) != ';') {
-            if (b >= '0' && b <= '9') {
-                temperature += (b - '0') * multiplier;
-                multiplier *= 10;
-            }
-            else if (b == '-') {
-                temperature = -temperature;
-            }
-        }
-        long cityNameEnd = offset;
-        while (UNSAFE.getByte(offset - 1) != '\n' && offset > address) {
-            offset--;
-        }
-        var cityName = new byte[(int) (cityNameEnd - offset + 1)];
-        UNSAFE.copyMemory(null, offset, cityName, Unsafe.ARRAY_BYTE_BASE_OFFSET, cityName.length);
-        results.put(new String(cityName, StandardCharsets.UTF_8), new TemperatureAggregation(temperature, 1, (short) temperature, (short) temperature));
-
+    private static List<Long> splitByChunks(long address, long end, long minChunkSize) {
         // split by chunks
-        end = offset;
         List<Long> result = new ArrayList<>((int) ((end - address) / minChunkSize + 1));
         result.add(address);
         while (address < end) {
@@ -278,8 +255,49 @@ public void addChunk(long begin, long end) {
         @Override
         public void run() {
             for (int i = 0; i < begins.size(); i++) {
-                calcForChunk(begins.get(i), ends.get(i));
+                var begin = begins.get(i);
+                var end = ends.get(i) - 1;
+                while (end > begin && UNSAFE.getByte(end - 1) != '\n') {
+                    end--;
+                }
+                calcForChunk(begin, end);
+                calcLastLine(end);
+            }
+        }
+
+        private void calcLastLine(long offset) {
+            long cityOffset = offset;
+            long lastBytes = 0;
+            int hashCode = 0;
+            byte cityNameSize = 0;
+
+            byte b;
+            while ((b = UNSAFE.getByte(offset++)) != ';') {
+                lastBytes = (lastBytes << 8) | b;
+                hashCode = hashCode * 31 + b;
+                cityNameSize++;
+            }
+
+            int temperature;
+            int word = UNSAFE.getInt(offset);
+            offset += 4;
+
+            if ((word & TWO_NEGATIVE_DIGITS_MASK) == TWO_NEGATIVE_DIGITS_MASK) {
+                word >>>= 8;
+                temperature = ZERO * 11 - ((word & BYTE_MASK) * 10 + ((word >>> 16) & BYTE_MASK));
+            }
+            else if ((word & THREE_DIGITS_MASK) == THREE_DIGITS_MASK) {
+                temperature = (word & BYTE_MASK) * 100 + ((word >>> 8) & BYTE_MASK) * 10 + ((word >>> 24) & BYTE_MASK) - ZERO * 111;
+            }
+            else if ((word & TWO_DIGITS_MASK) == TWO_DIGITS_MASK) {
+                temperature = (word & BYTE_MASK) * 10 + ((word >>> 16) & BYTE_MASK) - ZERO * 11;
+            }
+            else {
+                // #.##-
+                word = (word >>> 8) | (UNSAFE.getByte(offset) << 24);
+                temperature = ZERO * 111 - ((word & BYTE_MASK) * 100 + ((word >>> 8) & BYTE_MASK) * 10 + ((word >>> 24) & BYTE_MASK));
             }
+            container.put(cityOffset, cityNameSize, hashCode, lastBytes, (short) temperature);
         }
 
         private void calcForChunk(long offset, long end) {

From 711aa2395a5ba8433ce3fdbe77d07b4e8a23a114 Mon Sep 17 00:00:00 2001
From: Judekeyser <44140459+Judekeyser@users.noreply.github.com>
Date: Wed, 31 Jan 2024 17:59:50 +0100
Subject: [PATCH 220/268] Justin's implementation (#666)

* Justin's implementation

* Rename justin to Judekeyser

* Back to previous implementation of vectors

* Reading names as sequences of integers

* Fixing tests

* Scale down the number of NIO workers

---------

Co-authored-by: Justin Dekeyser <justin.dekeyser@Justins-MacBook-Pro.local>
---
 calculate_average_Judekeyser.sh               |  19 +
 prepare_Judekeyser.sh                         |  20 +
 .../onebrc/CalculateAverage_Judekeyser.java   | 414 ++++++++++++++++++
 3 files changed, 453 insertions(+)
 create mode 100755 calculate_average_Judekeyser.sh
 create mode 100755 prepare_Judekeyser.sh
 create mode 100644 src/main/java/dev/morling/onebrc/CalculateAverage_Judekeyser.java

diff --git a/calculate_average_Judekeyser.sh b/calculate_average_Judekeyser.sh
new file mode 100755
index 000000000..9490c15c5
--- /dev/null
+++ b/calculate_average_Judekeyser.sh
@@ -0,0 +1,19 @@
+#!/bin/sh
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+JAVA_OPTS="--enable-preview --add-modules jdk.incubator.vector"
+java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_Judekeyser
diff --git a/prepare_Judekeyser.sh b/prepare_Judekeyser.sh
new file mode 100755
index 000000000..4cda7b411
--- /dev/null
+++ b/prepare_Judekeyser.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+# Uncomment below to use sdk
+# source "$HOME/.sdkman/bin/sdkman-init.sh"
+# sdk use java 21.0.1-graal 1>&2
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_Judekeyser.java b/src/main/java/dev/morling/onebrc/CalculateAverage_Judekeyser.java
new file mode 100644
index 000000000..40bcf08e0
--- /dev/null
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_Judekeyser.java
@@ -0,0 +1,414 @@
+/*
+ *  Copyright 2023 The original authors
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package dev.morling.onebrc;
+
+import jdk.incubator.vector.ByteVector;
+import jdk.incubator.vector.VectorSpecies;
+
+import java.io.IOException;
+import java.io.RandomAccessFile;
+import java.io.UncheckedIOException;
+import java.lang.foreign.Arena;
+import java.lang.foreign.MemorySegment;
+import java.nio.ByteOrder;
+import java.nio.channels.FileChannel;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Paths;
+import java.text.DecimalFormat;
+import java.text.DecimalFormatSymbols;
+import java.util.*;
+import java.util.concurrent.CompletableFuture;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+
+import static java.lang.foreign.ValueLayout.OfByte.JAVA_BYTE;
+import static java.lang.foreign.ValueLayout.OfByte.JAVA_INT_UNALIGNED;
+
+public class CalculateAverage_Judekeyser {
+    private static final String FILE = "./measurements.txt";
+    private static final int chunkSize = (1 << 7) << 12; // This can't go beyond 2^21, because otherwise we might exceed int capacity
+
+    private static final int numberOfIOWorkers = 1 << 8; // We are going to need (numberOfIOWorkers-1) * chunkSize capacity
+    private static final int numberOfParallelWorkers = Math.max(1, Runtime.getRuntime().availableProcessors() - 1); // At least 1: Executors.newFixedThreadPool rejects 0, which a single-core host would produce
+
+    private static final VectorSpecies<Byte> SPECIES = ByteVector.SPECIES_PREFERRED;
+
+    public static void main(String[] args) throws Exception {
+        // Accumulator for the values of one station within a single chunk; it works on the
+        // integer values produced by the line parser (the decimal value scaled by ten).
+        class SimpleStatistics {
+            int min = Integer.MAX_VALUE;
+            int max = Integer.MIN_VALUE;
+            int sum;
+            int count;
+
+            // Folds one measurement into the running extrema, sum and count.
+            void accept(int value) {
+                if (value < min) min = value;
+                if (value > max) max = value;
+                sum += value;
+                count += 1;
+            }
+        }
+        // Statistics of one station merged over all chunks. The totals stay integers (values
+        // scaled by ten), so merging is exact; they are only scaled back when printed.
+        class Statistics {
+            int min, max;
+            long sum, count;
+
+            Statistics(SimpleStatistics simple) {
+                min = simple.min;
+                max = simple.max;
+                sum = simple.sum;
+                count = simple.count;
+            }
+
+            // Merges the statistics of one more chunk into the totals of this station.
+            void accept(SimpleStatistics simple) {
+                min = Math.min(min, simple.min);
+                max = Math.max(max, simple.max);
+                sum += simple.sum;
+                count += simple.count;
+            }
+
+            // Locale.ROOT keeps the digits, minus sign and decimal separator independent of the host locale.
+            static final DecimalFormat format = new DecimalFormat("#0.0", DecimalFormatSymbols.getInstance(Locale.ROOT));
+            @Override
+            public String toString() {
+                return STR."\{format.format(min/10.)}/\{format.format(mean())}/\{format.format(max/10.)}";
+            }
+
+            // Mean rounded to one decimal; the exact integer totals are divided only once.
+            double mean() {
+                return Math.round((double) sum/count)/10.;
+            }
+        }
+        // Station name packed into ints, four bytes of the name per int with the least
+        // significant byte first (see toString); the last int may be zero-padded. Instances
+        // are used as HashMap keys, hence the precomputed hash.
+        class Name {
+            final int[] data;
+            final int hash;
+
+            Name(int[] data) {
+                this.data = data;
+                var hash = 0;
+                for (var d : data) {
+                    hash = 31 * hash + d;
+                }
+                this.hash = hash;
+            }
+
+            @Override
+            public int hashCode() {
+                return hash;
+            }
+
+            @Override
+            public boolean equals(Object obj) {
+                if (obj == this) {
+                    return true;
+                }
+                return obj instanceof Name name && Arrays.equals(data, name.data);
+            }
+
+            // Unpacks the ints into bytes, drops the trailing zero padding and decodes UTF-8.
+            @Override
+            public String toString() {
+                var bdata = new byte[data.length * 4];
+                int j = 0;
+                for (int d : data) {
+                    bdata[j++] = (byte) d;
+                    bdata[j++] = (byte) (d >>> 8);
+                    bdata[j++] = (byte) (d >>> 16);
+                    bdata[j++] = (byte) (d >>> 24);
+                }
+                // Bounded by j > 0 so that an empty or all-zero name yields "" instead of indexing bdata[-1].
+                while (j > 0 && bdata[j - 1] == 0) {
+                    j--;
+                }
+                return new String(bdata, 0, j, StandardCharsets.UTF_8);
+            }
+        }
+
+        record Line(Name name, int value) {}
+
+        var results = new HashMap<Name, Statistics>();
+        try(var file = new RandomAccessFile(Paths.get(FILE).toFile(), "r")) {
+            class Ls implements Iterator<MemorySegment> { // splits the file into consecutive chunks of about chunkSize bytes, each ending right after a newline
+                final int M = chunkSize;
+                final Arena arena = Arena.ofShared(); // owns every mapping created by next(); nothing in this class closes it
+                final long length; // file size, read once in the constructor
+
+                long offset; // file position at which the next chunk starts
+
+                Ls() throws IOException {
+                    offset = 0L;
+                    length = file.length();
+                }
+
+                @Override
+                public MemorySegment next() {
+                    MemorySegment memorySegment;
+                    try {
+                        memorySegment = file.getChannel().map( // maps M bytes plus up to 128 bytes of slack for the line that straddles the chunk boundary
+                                FileChannel.MapMode.READ_ONLY,
+                                offset, Math.min(M + 128L, file.getChannel().size() - offset),
+                                arena
+                        );
+                    } catch (IOException e) {
+                        throw new UncheckedIOException(e);
+                    }
+
+                    var size = M;
+                    if (offset + M < length) {
+                        b:
+                        {
+                            for (int N = 0; N < 128; N++) { // grow the chunk past M, byte by byte, until it includes a newline
+                                var b = memorySegment.get(JAVA_BYTE, size);
+                                size += 1;
+                                if (b == '\n') {
+                                    break b;
+                                }
+                            }
+                            assert false : "Lines are smaller than 128 bytes";
+                        }
+                        offset += size;
+                    } else { // last chunk: take whatever is left of the file
+                        size = (int) (length - offset);
+                        offset = length;
+                    }
+
+                    return memorySegment.asSlice(0, size);
+                }
+
+                @Override
+                public boolean hasNext() {
+                    return offset < length;
+                }
+            }
+
+            class It implements Iterator<Line> { // iterates over the lines of one chunk, parsing each of them into a Line
+                int offset; // position, within the chunk, of the first byte of the next line
+                final int length;
+                final MemorySegment memorySegment;
+                final ByteOrder endian;
+
+                It(MemorySegment memorySegment) {
+                    offset = 0;
+                    endian = ByteOrder.nativeOrder();
+                    this.memorySegment = memorySegment;
+                    length = (int) memorySegment.byteSize();
+                    assert '\n' == memorySegment.get(JAVA_BYTE, length - 1);
+                }
+
+                @Override
+                public boolean hasNext() {
+                    return offset < length;
+                }
+
+                @Override
+                public Line next() {
+                    int size; // number of bytes of the current line, newline excluded
+                    b: {
+                        /*
+                         * Vectorization does not seem to bring anything interesting.
+                         * This is a bit disappointing. What am I doing wrong?
+                         */
+
+                        size = 0;
+
+                        while (offset+size+SPECIES.length() <= length) { // vector scan, as long as a whole vector still fits in the chunk
+                            var vector = ByteVector.fromMemorySegment(
+                                    SPECIES, memorySegment,
+                                    offset+size, endian
+                            );
+                            var j = vector.eq((byte) '\n').firstTrue(); // lane of the first newline, or SPECIES.length() when there is none
+                            if (j < SPECIES.length()) {
+                                assert j >= 0;
+                                size += j;
+                                assert memorySegment.get(JAVA_BYTE, offset+size) == '\n';
+                                break b;
+                            } else {
+                                assert j == SPECIES.length();
+                                size += SPECIES.length();
+                            }
+                        }
+                        { // scalar scan for the end of the chunk, where a whole vector no longer fits
+                            byte b;
+                            for (; size < 128; size++) {
+                                b = memorySegment.get(JAVA_BYTE, offset+size);
+                                if (b == '\n') break b;
+                            }
+                            assert false : "Lines are smaller than 128 bytes";
+                        }
+                        assert memorySegment.get(JAVA_BYTE, offset+size) == '\n';
+                        assert size < 128;
+                    }
+
+                    Name name;
+                    int value;
+                    {
+                        long cursor = offset+size - 1L; // last byte of the line, i.e. the digit after the '.'
+                        { // the value is parsed backwards and scaled by ten: "-12.3" becomes -123
+                            value = memorySegment.get(JAVA_BYTE, cursor) - '0';
+                            value += (memorySegment.get(JAVA_BYTE, cursor-2L) - '0') * 10; // cursor-1 is the '.', cursor-2 the units digit
+                            cursor -= 3L;
+                            if (memorySegment.get(JAVA_BYTE, cursor) == '-') {
+                                value *= -1;
+                                cursor -= 1L;
+                            } else if (memorySegment.get(JAVA_BYTE, cursor) != ';') { // a tens digit is present
+                                value += (memorySegment.get(JAVA_BYTE, cursor) - '0') * 100;
+                                cursor -= 1L;
+                                if (memorySegment.get(JAVA_BYTE, cursor) == '-') {
+                                    value *= -1;
+                                    cursor -= 1L;
+                                }
+                            }
+                        }
+                        // On a well-formed line, cursor is now the position of the ';' separator, so
+                        // cursor-offset is the byte length of the name. The name is read as whole ints:
+                        // the slice is rounded up to a multiple of four bytes, and the bytes read past
+                        // the name (the ';' and the start of the value) are masked out of the last int.
+                        {
+                            int mod4StringSize = ((int)(cursor-offset+3))/4 * 4;
+                            var data = memorySegment.asSlice(offset, mod4StringSize).toArray(JAVA_INT_UNALIGNED); // NOTE(review): the masks below keep the low-order bytes, which are the first bytes in memory only on little-endian hosts - confirm the target platform
+                            switch(((int)(cursor - offset)) % 4) { // number of name bytes in the last int; 0 means the last int is full
+                                case 0: break;
+                                case 1: {
+                                    data[data.length - 1] &= 255;
+                                } break;
+                                case 2: {
+                                    data[data.length - 1] &= 65535;
+                                } break;
+                                case 3: {
+                                    data[data.length - 1] &= 16777215;
+                                } break;
+                            }
+                            name = new Name(data);
+                        }
+                    }
+                    offset += size + 1; // skip the line and its newline
+                    return new Line(name, value);
+                }
+            }
+
+            // Couples a chunk with the statistics computed from it, so that the consumer of the
+            // task can both merge the statistics and unload the chunk.
+            record Pair(MemorySegment segment, Map<Name, SimpleStatistics> simple) {
+                // Parses the chunk eagerly, on whichever thread invokes this constructor.
+                Pair(MemorySegment segment) {
+                    this(segment, apply(segment));
+                }
+
+                // Same as call(), with the checked IOException rethrown unchecked so that it can
+                // be used in the constructor delegation above.
+                private static Map<Name, SimpleStatistics> apply(MemorySegment memorySegment) {
+                    try {
+                        return call(memorySegment);
+                    } catch (IOException failure) {
+                        throw new UncheckedIOException(failure);
+                    }
+                }
+
+                // Groups the values of every line of the chunk by station name.
+                private static Map<Name, SimpleStatistics> call(MemorySegment memorySegment) throws IOException {
+                    Map<Name, SimpleStatistics> perStation = new HashMap<>();
+                    var lines = new It(memorySegment);
+                    while (lines.hasNext()) {
+                        var line = lines.next();
+                        perStation
+                                .computeIfAbsent(line.name(), ignored -> new SimpleStatistics())
+                                .accept(line.value());
+                    }
+                    return perStation;
+                }
+            }
+
+            var ls = new Ls();
+
+            try(
+                    var nioService = Executors.newVirtualThreadPerTaskExecutor(); // runs the memseg.load() step of every task
+                    var parallelService =Executors.newFixedThreadPool(numberOfParallelWorkers) // runs the parsing step (Pair::new)
+            ) {
+                var tasksQueue = new ArrayList<Future<Pair>>(); // chunks submitted but not merged yet
+                for(;;) { // each pass submits at most one new chunk, then merges whatever has completed
+                    assert tasksQueue.size() <= numberOfIOWorkers;
+                    if(tasksQueue.size() < numberOfIOWorkers) {
+                        if(ls.hasNext()) {
+                            var memseg = ls.next();
+                            var task = CompletableFuture.supplyAsync(
+                                    () -> {
+                                        memseg.load();
+                                        return memseg;
+                                    }, nioService
+                            ).thenApplyAsync(Pair::new, parallelService);
+
+                            tasksQueue.add(task);
+                        } else if(tasksQueue.isEmpty()) break; // input exhausted and every task merged
+                    }
+                    /*
+                     * Merge the tasks that are already done; isDone() does not block, the other tasks are kept for a later pass
+                     */
+                    {
+                        var copy = new ArrayList<Future<Pair>>(tasksQueue.size());
+                        for(var worker: tasksQueue) {
+                            if(worker.isDone()) {
+                                /*
+                                 * Merge the per-chunk map into the global results (only this thread touches results)
+                                 */
+                                var p = worker.get(); // the task is done, so this returns at once or throws if the task failed
+                                var simple = p.simple();
+                                p.segment().unload();
+                                for (var entry : simple.entrySet()) {
+                                    var name = entry.getKey();
+
+                                    var statistics = results.get(name);
+                                    if (statistics == null) {
+                                        statistics = new Statistics(entry.getValue());
+                                        results.put(name, statistics);
+                                    } else {
+                                        statistics.accept(entry.getValue());
+                                    }
+                                }
+                            } else copy.add(worker);
+                        }
+                        tasksQueue.clear(); // keep only the unfinished tasks
+                        tasksQueue.addAll(copy);
+                    }
+                }
+            }
+        }
+
+        /*
+         * Print
+         */
+        {
+            // A TreeMap keyed by the decoded name gives the sorted order of the report.
+            var byStationName = new TreeMap<String, Statistics>();
+            results.forEach(
+                    (name, statistics) -> byStationName.put(name.toString(), statistics)
+            );
+
+            // Render the report as {name=min/mean/max, ...}.
+            var report = new StringJoiner(", ", "{", "}");
+            byStationName.forEach(
+                    (name, statistics) -> report.add(STR."\{name}=\{statistics}")
+            );
+            System.out.println(report);
+        }
+    }
+}

From 6013760c7865e25e915651c89fec344632aaffbc Mon Sep 17 00:00:00 2001
From: Gunnar Morling <gunnar.morling@googlemail.com>
Date: Wed, 31 Jan 2024 18:00:25 +0100
Subject: [PATCH 221/268] Leaderboard update

---
 README.md | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 27f6e9c4d..4c29c38de 100644
--- a/README.md
+++ b/README.md
@@ -64,7 +64,8 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:03.431 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_roman_r_m.java)| 21.0.1-graal | [Roman Musin](https://github.com/roman-r-m) | GraalVM native binary, uses Unsafe |
 |   | 00:03.518 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_JamalMulla.java)| 21.0.1-graal | [Jamal Mulla](https://github.com/JamalMulla) | GraalVM native binary, uses Unsafe |
 |   | 00:03.698 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_hundredwatt.java)| 21.0.1-graal | [Jason Nochlin](https://github.com/hundredwatt) |  |
-|   | 00:03.718 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_zerninv.java)| 21.0.1-graal | [zerninv](https://github.com/zerninv) | uses Unsafe |
+|   | 00:03.785 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_zerninv.java)| 21.0.2-graal | [zerninv](https://github.com/zerninv) | GraalVM native binary, uses Unsafe |
+|   | 00:03.820 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_iziamos.java)| 21.0.2-graal | [John Ziamos](https://github.com/iziamos) | GraalVM native binary, uses Unsafe |
 |   | 00:03.824 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_gonix.java)| 21.0.1-open | [gonix](https://github.com/gonix) |  |
 |   | 00:03.854 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ebarlas.java)| 21.0.1-graal | [Elliot Barlas](https://github.com/ebarlas) | uses Unsafe |
 |   | 00:03.902 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jparera.java)| 21.0.1-open | [Juan Parera](https://github.com/jparera) |  |
@@ -72,15 +73,14 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:04.066 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_JesseVanRooy.java)| 21.0.1-open | [JesseVanRooy](https://github.com/JesseVanRooy) | uses Unsafe |
 |   | 00:04.101 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_JaimePolidura.java)| 21.0.2-graal | [Jaime Polidura](https://github.com/JaimePolidura) | GraalVM native binary, uses Unsafe |
 |   | 00:04.209 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_giovannicuccu.java)| 21.0.1-open | [Giovanni Cuccu](https://github.com/giovannicuccu) |  |
-|   | 00:04.230 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_iziamos.java)| 21.0.1-open | [John Ziamos](https://github.com/iziamos) | uses Unsafe |
 |   | 00:04.684 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_gigiblender.java)| 21.0.1-open | [Florin Blanaru](https://github.com/gigiblender) | uses Unsafe |
 |   | 00:04.741 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_cliffclick.java)| 21.0.1-open | [Cliff Click](https://github.com/cliffclick) | uses Unsafe |
 |   | 00:04.800 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_parkertimmins.java)| 21.0.1-open | [Parker Timmins](https://github.com/parkertimmins) |  |
 |   | 00:04.884 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_shipilev.java)| 21.0.1-open | [Aleksey Shipilëv](https://github.com/shipilev) |  |
+|   | 00:04.886 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ianopolousfast.java)| 21.0.1-open | [Dr Ian Preston](https://github.com/ianopolousfast) |  |
 |   | 00:04.920 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_vemana.java)| 21.0.1-graal | [Subrahmanyam](https://github.com/vemana) |  |
 |   | 00:05.077 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jonathanaotearoa.java)| 21.0.2-graal | [Jonathan Wright](https://github.com/jonathan-aotearoa) | GraalVM native binary, uses Unsafe |
 |   | 00:05.142 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_arjenw.java)| 21.0.1-open | [Arjen Wisse](https://github.com/arjenw) |  |
-|   | 00:05.180 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ianopolousfast.java)| 21.0.1-open | [Dr Ian Preston](https://github.com/ianopolousfast) |  |
 |   | 00:05.235 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_unbounded.java)| 21.0.1-open | [unbounded](https://github.com/unbounded) |  |
 |   | 00:05.336 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_plevart.java)| 21.0.1-tem | [Peter Levart](https://github.com/plevart) |  |
 |   | 00:05.354 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_armandino.java)| 21.0.2-graal | [Arman Sharif](https://github.com/armandino) | GraalVM native binary, uses Unsafe |
@@ -112,6 +112,7 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:08.557 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_adriacabeza.java)| 21.0.1-graal | [Adrià Cabeza](https://github.com/adriacabeza) |  |
 |   | 00:08.622 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_kuduwa_keshavram.java)| 21.0.1-graal | [Keshavram Kuduwa](https://github.com/kuduwa-keshavram) | uses Unsafe |
 |   | 00:08.892 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_fatroom.java)| 21.0.1-open | [Roman Romanchuk](https://github.com/fatroom) |  |
+|   | 00:08.896 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_anestoruk.java)| 21.0.1-open | [Andrzej Nestoruk](https://github.com/anestoruk) |  |
 |   | 00:09.020 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_yemreinci.java)| 21.0.1-open | [yemreinci](https://github.com/yemreinci) |  |
 |   | 00:09.071 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_gabrielreid.java)| 21.0.1-open | [Gabriel Reid](https://github.com/gabrielreid) |  |
 |   | 00:09.352 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_filiphr.java)| 21.0.1-graal | [Filip Hrisafov](https://github.com/filiphr) |  |
@@ -134,7 +135,6 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:12.220 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_richardstartin.java)| 21.0.1-open | [Richard Startin](https://github.com/richardstartin) |  |
 |   | 00:12.495 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_SamuelYvon.java)| 21.0.1-graal | [Samuel Yvon](https://github.com/SamuelYvon) | GraalVM native binary |
 |   | 00:12.568 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_MeanderingProgrammer.java)| 21.0.1-graal | [Vlad](https://github.com/MeanderingProgrammer) |  |
-|   | 00:12.736 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_anestoruk.java)| 21.0.1-open | [Andrzej Nestoruk](https://github.com/anestoruk) |  |
 |   | 00:12.800 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_yonatang.java)| java | [Yonatan Graber](https://github.com/yonatang) |  |
 |   | 00:13.013 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_thanhtrinity.java)| 21.0.1-graal | [Thanh Duong](https://github.com/thanhtrinity) |  |
 |   | 00:13.071 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ianopolous.java)| 21.0.1-open | [Dr Ian Preston](https://github.com/ianopolous) |  |
@@ -142,6 +142,7 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:14.502 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_eriklumme.java)| 21.0.1-graal | [eriklumme](https://github.com/eriklumme) |  |
 |   | 00:14.772 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_kevinmcmurtrie.java)| 21.0.1-open | [Kevin McMurtrie](https://github.com/kevinmcmurtrie) |  |
 |   | 00:14.867 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_berry120.java)| 21.0.1-open | [Michael Berry](https://github.com/berry120) |  |
+|   | 00:14.900 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_Judekeyser.java)| java | [Judekeyser](https://github.com/Judekeyser) |  |
 |   | 00:15.006 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_PawelAdamski.java)| java | [Paweł Adamski](https://github.com/PawelAdamski) |  |
 |   | 00:15.662 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_semotpan.java)| 21.0.1-open | [Serghei Motpan](https://github.com/semotpan) |  |
 |   | 00:16.063 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_makohn.java)| 21.0.1-open | [Marek Kohn](https://github.com/makohn) |  |

From c5b7b19e57624d2c510acc9efe7d5b2884dec9e8 Mon Sep 17 00:00:00 2001
From: Yevhenii Melnyk <melnyk.yevhenii@gmail.com>
Date: Wed, 31 Jan 2024 18:05:09 +0100
Subject: [PATCH 222/268] melgenek: minor improvements (#655)

* melgenek: minor improvements

* More memory
---
 calculate_average_melgenek.sh                 |   4 +-
 .../onebrc/CalculateAverage_melgenek.java     | 110 +++++++++---------
 2 files changed, 58 insertions(+), 56 deletions(-)

diff --git a/calculate_average_melgenek.sh b/calculate_average_melgenek.sh
index e0a88a352..ad709c31b 100755
--- a/calculate_average_melgenek.sh
+++ b/calculate_average_melgenek.sh
@@ -29,8 +29,8 @@ logicalCpuCount=$([ $(uname) = 'Darwin' ] &&
                        sysctl -n hw.logicalcpu_max ||
                        lscpu -p | egrep -v '^#' | wc -l)
 # The required heap is proportional to the number of cores.
-# There's roughly 3.5MB heap per thread required for the 10k problem.
-requiredMemory=$(echo "(l(15 + 3.5 * $logicalCpuCount)/l(2))" | bc -l)
+# There's roughly 6MB heap per thread required for the 10k problem.
+requiredMemory=$(echo "(l(15 + 6 * $logicalCpuCount)/l(2))" | bc -l)
 heapSize=$(echo "scale=0; 2^(($requiredMemory+1)/1)" | bc)
 
 JAVA_OPTS="$JAVA_OPTS -Xms${heapSize}m -Xmx${heapSize}m"
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_melgenek.java b/src/main/java/dev/morling/onebrc/CalculateAverage_melgenek.java
index 133573186..924cf15d8 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_melgenek.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_melgenek.java
@@ -15,12 +15,8 @@
  */
 package dev.morling.onebrc;
 
-import jdk.incubator.vector.ByteVector;
-import jdk.incubator.vector.Vector;
-import jdk.incubator.vector.VectorOperators;
-import jdk.incubator.vector.VectorSpecies;
+import jdk.incubator.vector.*;
 
-import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.io.RandomAccessFile;
 import java.lang.invoke.MethodHandles;
@@ -30,7 +26,7 @@
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.util.TreeMap;
-import java.util.concurrent.Executors;
+import java.util.concurrent.*;
 
 /**
  * The implementation:
@@ -47,7 +43,6 @@ public class CalculateAverage_melgenek {
     private static final int CORES_COUNT = Runtime.getRuntime().availableProcessors();
 
     private static final String FILE = "./measurements.txt";
-
     /**
      * This is a prime number that gives pretty
      * <a href="https://vanilla-java.github.io/2018/08/15/Looking-at-randomness-and-performance-for-hash-codes.html">good hash distributions</a>
@@ -63,28 +58,46 @@ public class CalculateAverage_melgenek {
     private static final Vector<Byte> NEWLINE_VECTOR = BYTE_SPECIES.broadcast(NEWLINE);
     private static final Vector<Byte> SEMICOLON_VECTOR = BYTE_SPECIES.broadcast(SEMICOLON);
     private static final int MAX_LINE_LENGTH = 107; // 100 + len(";-11.1\n") = 100+7
-    private static final TreeMap<String, ResultRow> RESULT = new TreeMap<>();
 
     public static void main(String[] args) throws Throwable {
         long totalSize = Files.size(Path.of(FILE));
-        try (var executor = Executors.newFixedThreadPool(CORES_COUNT - 1)) {
-            long chunkSize = Math.max(1, totalSize / CORES_COUNT);
-            long offset = 0;
+        long chunkSize = Math.max(MAX_LINE_LENGTH, totalSize / CORES_COUNT);
+        var result = new TreeMap<String, ResultRow>();
+        try (var executor = Executors.newFixedThreadPool(CORES_COUNT)) {
+            var service = new ExecutorCompletionService<CompositeTable>(executor);
             int i = 0;
-            for (; offset < totalSize && i < CORES_COUNT - 1; i++) {
-                long currentOffset = offset;
+            for (; i * chunkSize < totalSize; i++) {
+                long currentOffset = Math.max(0, i * chunkSize - 1);
                 long maxOffset = Math.min((i + 1) * chunkSize, totalSize);
-                executor.submit(() -> processRange(currentOffset, maxOffset));
-                offset = (i + 1) * chunkSize - 1;
+                service.submit(() -> processRange(currentOffset, maxOffset));
             }
-            if (offset < totalSize) {
-                processRange(offset, totalSize);
+            for (; i > 0; i--) {
+                service.take().get().addRows(result);
             }
         }
-        System.out.println(RESULT);
+        System.out.println(printTree(result));
     }
 
-    private static void processRange(long startOffset, long maxOffset) {
+    private static String printTree(TreeMap<String, ResultRow> result) {
+        var sb = new StringBuilder(50 * result.size());
+        sb.append("{");
+        boolean first = true;
+        for (var entry : result.entrySet()) {
+            if (first) {
+                first = false;
+            }
+            else {
+                sb.append(", ");
+            }
+            sb.append(entry.getKey());
+            sb.append('=');
+            entry.getValue().appendToStringBuffer(sb);
+        }
+        sb.append("}");
+        return sb.toString();
+    }
+
+    private static CompositeTable processRange(long startOffset, long maxOffset) {
         final var table = new CompositeTable();
         try (var file = new BufferedFile(startOffset, maxOffset)) {
             processChunk(file, table);
@@ -92,12 +105,10 @@ private static void processRange(long startOffset, long maxOffset) {
         catch (Exception e) {
             throw new RuntimeException(e);
         }
-        synchronized (RESULT) {
-            table.addRows(RESULT);
-        }
+        return table;
     }
 
-    private static void processChunk(BufferedFile file, CompositeTable table) {
+    private static void processChunk(BufferedFile file, CompositeTable table) throws IOException {
         if (file.offset != 0) {
             file.refillBuffer();
             int newlinePosition = findDelimiter(file, 0, NEWLINE_VECTOR, NEWLINE);
@@ -223,18 +234,18 @@ private static int calculateHash(byte[] buffer, int startPosition, int endPositi
         long hash = 0;
 
         int position = startPosition;
-        for (; position + Long.BYTES <= endPosition; position += Long.BYTES) {
+        for (; position + Long.BYTES < endPosition; position += Long.BYTES) {
             long value = (long) LONG_VIEW.get(buffer, position);
             hash = hash * RANDOM_PRIME + value;
         }
 
-        if (position + Integer.BYTES <= endPosition) {
+        if (position + Integer.BYTES < endPosition) {
             int value = (int) INT_VIEW.get(buffer, position);
             hash = hash * RANDOM_PRIME + value;
             position += Integer.BYTES;
         }
 
-        for (; position <= endPosition; position++) {
+        for (; position < endPosition; position++) {
             hash = hash * RANDOM_PRIME + buffer[position];
         }
         hash = hash * RANDOM_PRIME;
@@ -261,8 +272,6 @@ private static final class LongTable {
          */
         private final long[] buckets = new long[TABLE_CAPACITY * 3];
 
-        int keysCount = 0;
-
         public void add(long str, short value) {
             int hash = calculateLongHash(str);
             int bucketIdx = hash & TABLE_CAPACITY_MASK;
@@ -273,7 +282,6 @@ public void add(long str, short value) {
             }
             else if (bucketStr == 0L) {
                 createBucket(bucketIdx, str, value);
-                keysCount++;
             }
             else {
                 addWithProbing(str, value, (bucketIdx + 1) & TABLE_CAPACITY_MASK);
@@ -290,7 +298,6 @@ private void addWithProbing(long str, short value, int bucketIdx) {
                 }
                 else if (bucketStr == 0L) {
                     createBucket(bucketIdx, str, value);
-                    keysCount++;
                     break;
                 }
                 else {
@@ -367,15 +374,12 @@ private static final class RegularTable {
         private static final int TABLE_CAPACITY_MASK = TABLE_CAPACITY - 1;
         private final Bucket[] buckets = new Bucket[TABLE_CAPACITY];
 
-        int keysCount = 0;
-
         public void add(byte[] data, int start, int stringLength, int hash, short value) {
             int bucketIdx = hash & TABLE_CAPACITY_MASK;
 
             var bucket = buckets[bucketIdx];
             if (bucket == null) {
                 buckets[bucketIdx] = new Bucket(data, start, stringLength, hash, value);
-                keysCount++;
             }
             else if (hash == bucket.hash && bucket.isEqual(data, start, stringLength)) {
                 bucket.update(value);
@@ -391,7 +395,6 @@ private void addWithProbing(byte[] data, int start, int stringLength, int hash,
                 var bucket = buckets[bucketIdx];
                 if (bucket == null) {
                     buckets[bucketIdx] = new Bucket(data, start, stringLength, hash, value);
-                    keysCount++;
                     break;
                 }
                 else if (hash == bucket.hash && bucket.isEqual(data, start, stringLength)) {
@@ -449,6 +452,14 @@ public boolean isEqual(byte[] data, int start, int length) {
                 if (str.length != length)
                     return false;
                 int i = 0;
+                int vectorLoopBound = BYTE_SPECIES.loopBound(str.length);
+                for (; i < vectorLoopBound; i += BYTE_SPECIES_BYTE_SIZE) {
+                    var vector1 = ByteVector.fromArray(BYTE_SPECIES, str, i);
+                    var vector2 = ByteVector.fromArray(BYTE_SPECIES, data, start + i);
+                    var comparisonResult = vector1.compare(VectorOperators.NE, vector2);
+                    if (comparisonResult.anyTrue())
+                        return false;
+                }
                 for (; i + Long.BYTES < str.length; i += Long.BYTES) {
                     long value1 = (long) LONG_VIEW.get(str, i);
                     long value2 = (long) LONG_VIEW.get(data, start + i);
@@ -493,10 +504,12 @@ public void add(long anotherSum, int anotherCount, short anotherMin, short anoth
                 min = anotherMin;
         }
 
-        public String toString() {
-            return Math.round((double) min) / 10.0 + "/"
-                    + Math.round((double) sum / count) / 10.0 + "/"
-                    + Math.round((double) max) / 10.0;
+        public void appendToStringBuffer(StringBuilder sb) {
+            sb.append(Math.round((double) min) / 10.0);
+            sb.append('/');
+            sb.append(Math.round((double) sum / count) / 10.0);
+            sb.append('/');
+            sb.append(Math.round((double) max) / 10.0);
         }
     }
 
@@ -513,30 +526,19 @@ private static final class BufferedFile implements AutoCloseable {
         private final RandomAccessFile file;
         private long offset;
 
-        private BufferedFile(long startOffset, long maxOffset) throws FileNotFoundException {
+        private BufferedFile(long startOffset, long maxOffset) throws IOException {
             this.offset = startOffset;
             this.maxOffset = maxOffset;
             this.file = new RandomAccessFile(FILE, "r");
         }
 
-        private void refillBuffer() {
+        private void refillBuffer() throws IOException {
             int remainingBytes = bufferLimit - bufferPosition;
             if (remainingBytes < MAX_LINE_LENGTH) {
                 bufferPosition = 0;
-                int bytesRead;
-                try {
-                    file.seek(offset);
-                    bytesRead = file.read(buffer, 0, BUFFER_SIZE);
-                }
-                catch (IOException e) {
-                    throw new RuntimeException(e);
-                }
-                if (bytesRead > 0) {
-                    bufferLimit = bytesRead;
-                }
-                else {
-                    bufferLimit = 0;
-                }
+                file.seek(offset);
+                int bytesRead = file.read(buffer, 0, BUFFER_SIZE);
+                bufferLimit = Math.max(bytesRead, 0);
             }
         }
 

From 3cc4fc85d83122eba8944036691d00e195b6aa57 Mon Sep 17 00:00:00 2001
From: Peter Levart <peter.levart@gmail.com>
Date: Wed, 31 Jan 2024 18:07:56 +0100
Subject: [PATCH 223/268] update1: restructuring for better compilation (#661)

---
 calculate_average_plevart.sh                  |   1 +
 prepare_plevart.sh                            |   2 +-
 .../onebrc/CalculateAverage_plevart.java      | 165 +++++++++---------
 3 files changed, 83 insertions(+), 85 deletions(-)

diff --git a/calculate_average_plevart.sh b/calculate_average_plevart.sh
index be195ac08..32cee488a 100755
--- a/calculate_average_plevart.sh
+++ b/calculate_average_plevart.sh
@@ -17,6 +17,7 @@
 
 JAVA_OPTS="--enable-preview --add-modules=jdk.incubator.vector"
 JAVA_OPTS="$JAVA_OPTS -XX:-TieredCompilation"
+JAVA_OPTS="$JAVA_OPTS -XX:+UnlockExperimentalVMOptions -XX:+TrustFinalNonStaticFields"
 JAVA_OPTS="$JAVA_OPTS -XX:InlineSmallCode=15000 -XX:FreqInlineSize=400 -XX:MaxInlineSize=400"
 #JAVA_OPTS="$JAVA_OPTS -XX:+PrintCompilation -XX:+UnlockDiagnosticVMOptions -XX:+PrintInlining"
 java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_plevart $*
diff --git a/prepare_plevart.sh b/prepare_plevart.sh
index d2a3c6ba1..5259fbe65 100755
--- a/prepare_plevart.sh
+++ b/prepare_plevart.sh
@@ -16,4 +16,4 @@
 #
 
 source "$HOME/.sdkman/bin/sdkman-init.sh"
-sdk use java 21.0.1-tem 1>&2
+sdk use java 21.0.2-tem 1>&2
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_plevart.java b/src/main/java/dev/morling/onebrc/CalculateAverage_plevart.java
index fd42d454f..80c9e892a 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_plevart.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_plevart.java
@@ -29,6 +29,7 @@
 import java.nio.file.Path;
 import java.nio.file.StandardOpenOption;
 import java.util.Comparator;
+import java.util.Objects;
 import java.util.stream.Collectors;
 import java.util.stream.IntStream;
 import java.util.stream.Stream;
@@ -43,9 +44,10 @@ public class CalculateAverage_plevart {
     private static final int INITIAL_TABLE_CAPACITY = 8192;
 
     public static void main(String[] args) throws IOException {
-        var arena = Arena.global();
+        System.setProperty("jdk.incubator.vector.VECTOR_ACCESS_OOB_CHECK", "0");
         try (
-                var channel = (FileChannel) Files.newByteChannel(FILE, StandardOpenOption.READ)) {
+                var channel = (FileChannel) Files.newByteChannel(FILE, StandardOpenOption.READ);
+                var arena = Arena.ofShared()) {
             var segment = channel.map(FileChannel.MapMode.READ_ONLY, 0, Files.size(FILE), arena);
             int regions = Runtime.getRuntime().availableProcessors();
             IntStream
@@ -54,7 +56,6 @@ public static void main(String[] args) throws IOException {
                     .mapToObj(r -> calculateRegion(segment, regions, r))
                     .reduce(StatsTable::reduce)
                     .ifPresent(System.out::println);
-            segment.unload();
         }
     }
 
@@ -68,14 +69,12 @@ private static StatsTable calculateRegion(MemorySegment segment, int regions, in
             end = skipPastNl(segment, end);
         }
 
-        var stats = new StatsTable(segment, INITIAL_TABLE_CAPACITY);
-        calculateAdjustedRegion(segment, start, end, stats);
-        return stats;
+        return calculateAdjustedRegion(segment, start, end);
     }
 
     private static long skipPastNl(MemorySegment segment, long i) {
         int skipped = 0;
-        while (skipped++ < MAX_LINE_LEN && getByte(segment, i++) != '\n') {
+        while (skipped++ < MAX_LINE_LEN && segment.get(ValueLayout.JAVA_BYTE, i++) != '\n') {
         }
         if (skipped > MAX_LINE_LEN) {
             throw new IllegalArgumentException(
@@ -84,27 +83,28 @@ private static long skipPastNl(MemorySegment segment, long i) {
         return i;
     }
 
-    private static void calculateAdjustedRegion(MemorySegment segment, long start, long end, StatsTable stats) {
+    private static StatsTable calculateAdjustedRegion(MemorySegment segment, long start, long end) {
+        var stats = new StatsTable(segment, INITIAL_TABLE_CAPACITY);
+
         var species = ByteVector.SPECIES_PREFERRED;
-        long speciesByteSize = species.vectorByteSize();
 
         long cityStart = start, numberStart = 0;
         int cityLen = 0;
 
         for (long i = start, j = i; i < end; j = i) {
             long semiNlSet;
-            if (end - i >= speciesByteSize) {
+            if (end - i >= species.vectorByteSize()) {
                 var vec = ByteVector.fromMemorySegment(species, segment, i, ByteOrder.nativeOrder());
                 semiNlSet = vec.compare(VectorOperators.EQ, (byte) ';')
                         .or(vec.compare(VectorOperators.EQ, (byte) '\n'))
                         .toLong();
-                i += speciesByteSize;
+                i += species.vectorByteSize();
             }
             else { // tail, smaller than speciesByteSize
                 semiNlSet = 0;
                 long mask = 1;
                 while (i < end && mask != 0) {
-                    int c = getByte(segment, i++);
+                    int c = segment.get(ValueLayout.JAVA_BYTE, i++);
                     if (c == '\n' || c == ';') {
                         semiNlSet |= mask;
                     }
@@ -120,63 +120,17 @@ private static void calculateAdjustedRegion(MemorySegment segment, long start, l
                 }
                 else { // nl
                     int numberLen = (int) (j - numberStart);
-                    calculateEntry(segment, cityStart, cityLen, numberStart, numberLen, stats);
+                    stats.calculateEntry(cityStart, cityLen, numberStart, numberLen);
                     cityStart = ++j;
                     numberStart = 0;
                 }
             }
         }
-    }
 
-    private static void calculateEntry(MemorySegment segment, long cityStart, int cityLen, long numberStart, int numberLen, StatsTable stats) {
-        int hash = StatsTable.hash(segment, cityStart, cityLen);
-        int number = parseNumber(segment, numberStart, numberLen);
-        stats.aggregate(cityStart, cityLen, hash, 1, number, number, number);
-    }
-
-    private static int parseNumber(MemorySegment segment, long off, int len) {
-        int c0 = getByte(segment, off);
-        int d0;
-        int sign;
-        if (c0 == '-') {
-            off++;
-            len--;
-            d0 = getByte(segment, off) - '0';
-            sign = -1;
-        } else {
-            d0 = c0 - '0';
-            sign = 1;
-        }
-        return sign * switch (len) {
-            case 1 -> d0 * 10;                  // 9
-            case 2 -> {
-                int d1 = getByte(segment, off + 1) - '0';
-                yield d0 * 100 + d1 * 10;       // 99
-            }
-            case 3 -> {
-                int d2 = getByte(segment, off + 2) - '0';
-                yield d0 * 10 + d2;             // 9.9
-            }
-            case 4 -> {
-                int d1 = getByte(segment, off + 1) - '0';
-                int d3 = getByte(segment, off + 3) - '0';
-                yield d0 * 100 + d1 * 10 + d3;  // 99.9
-            }
-            default -> {
-                throw new IllegalArgumentException("Invalid number: " + getString(segment, off, len));
-            }
-        };
-    }
-
-    private static int getByte(MemorySegment segment, long off) {
-        return segment.get(ValueLayout.JAVA_BYTE, off);
-    }
-
-    private static String getString(MemorySegment segment, long off, int len) {
-        return new String(segment.asSlice(off, len).toArray(ValueLayout.JAVA_BYTE), StandardCharsets.UTF_8);
+        return stats;
     }
 
-    final static class StatsTable implements Cloneable {
+    final static class StatsTable {
         private static final int LOAD_FACTOR = 16;
         // offsets of fields
         private static final int _lenHash = 0,
@@ -190,7 +144,7 @@ final static class StatsTable implements Cloneable {
         private long[] table;
 
         StatsTable(MemorySegment segment, int capacity) {
-            this.segment = segment;
+            this.segment = Objects.requireNonNull(segment);
             int pow2cap = Integer.highestOneBit(capacity);
             if (pow2cap < capacity) {
                 pow2cap <<= 1;
@@ -199,6 +153,13 @@ final static class StatsTable implements Cloneable {
             this.table = new long[idx(pow2cap)];
         }
 
+        private StatsTable(StatsTable st) {
+            this.segment = st.segment;
+            this.pow2cap = st.pow2cap;
+            this.loadedSize = st.loadedSize;
+            this.table = st.table;
+        }
+
         private static int idx(int i) {
             return i << 3;
         }
@@ -237,7 +198,49 @@ private static int hash(long lenHash) {
             }
         }
 
-        static int hash(MemorySegment segment, long off, int len) {
+        void calculateEntry(long cityStart, int cityLen, long numberStart, int numberLen) {
+            int hash = hash(cityStart, cityLen);
+            int number = parseNumber(numberStart, numberLen);
+            aggregate(cityStart, cityLen, hash, 1, number, number, number);
+        }
+
+        int parseNumber(long off, int len) {
+            int c0 = segment.get(ValueLayout.JAVA_BYTE, off);
+            int d0;
+            int sign;
+            if (c0 == '-') {
+                off++;
+                len--;
+                d0 = segment.get(ValueLayout.JAVA_BYTE, off) - '0';
+                sign = -1;
+            } else {
+                d0 = c0 - '0';
+                sign = 1;
+            }
+            return sign * switch (len) {
+                case 1 -> d0 * 10;                  // 9
+                case 2 -> {
+                    int d1 = segment.get(ValueLayout.JAVA_BYTE, off + 1) - '0';
+                    yield d0 * 100 + d1 * 10;       // 99
+                }
+                case 3 -> {
+                    int d2 = segment.get(ValueLayout.JAVA_BYTE, off + 2) - '0';
+                    yield d0 * 10 + d2;             // 9.9
+                }
+                case 4 -> {
+                    int d1 = segment.get(ValueLayout.JAVA_BYTE, off + 1) - '0';
+                    int d3 = segment.get(ValueLayout.JAVA_BYTE, off + 3) - '0';
+                    yield d0 * 100 + d1 * 10 + d3;  // 99.9
+                }
+                default ->
+                    throw new IllegalArgumentException(
+                        "Invalid number: " +
+                        new String(segment.asSlice(off, len).toArray(ValueLayout.JAVA_BYTE), StandardCharsets.UTF_8)
+                    );
+            };
+        }
+
+        int hash(long off, int len) {
             if (len > Integer.BYTES) {
                 int head = segment.get(ValueLayout.JAVA_INT_UNALIGNED, off);
                 int tail = segment.get(ValueLayout.JAVA_INT_UNALIGNED, off + len - Integer.BYTES);
@@ -251,7 +254,11 @@ static int hash(MemorySegment segment, long off, int len) {
             }
         }
 
-        static boolean equals(MemorySegment segment, long off1, long off2, int len) {
+        private static boolean bothLessThan(long a, long b, long threshold) {
+            return (a < threshold) && (b < threshold);
+        }
+
+        boolean equals(long off1, long off2, int len) {
             while (len >= Long.BYTES) {
                 if (segment.get(ValueLayout.JAVA_LONG_UNALIGNED, off1) != segment.get(ValueLayout.JAVA_LONG_UNALIGNED, off2)) {
                     return false;
@@ -261,16 +268,16 @@ static boolean equals(MemorySegment segment, long off1, long off2, int len) {
                 len -= Long.BYTES;
             }
             // still enough memory to compare two longs, but masked?
-            if (Math.max(off1, off2) + Long.BYTES <= segment.byteSize()) {
+            if (bothLessThan(off1, off2, segment.byteSize() - Long.BYTES + 1)) {
                 long mask = LEN_LONG_MASK[len];
                 return (segment.get(ValueLayout.JAVA_LONG_UNALIGNED, off1) & mask) == (segment.get(ValueLayout.JAVA_LONG_UNALIGNED, off2) & mask);
             }
             else {
-                return equalsAtBorder(segment, off1, off2, len);
+                return equalsAtBorder(off1, off2, len);
             }
         }
 
-        private static boolean equalsAtBorder(MemorySegment segment, long off1, long off2, int len) {
+        private boolean equalsAtBorder(long off1, long off2, int len) {
             if (len > Integer.BYTES) {
                 if (segment.get(ValueLayout.JAVA_INT_UNALIGNED, off1) != segment.get(ValueLayout.JAVA_INT_UNALIGNED, off2)) {
                     return false;
@@ -290,7 +297,7 @@ void aggregate(
                        // key
                        long off, int len, int hash,
                        // value
-                       long count, long sum, long min, long max) {
+                       long count, long sum, int min, int max) {
             long lenHash = lenHash(len, hash);
             int mask = pow2cap - 1;
             for (int i = hash & mask, probe = 0; probe < pow2cap; i = (i + 1) & mask, probe++) {
@@ -309,11 +316,11 @@ void aggregate(
                     }
                     return;
                 }
-                if (lenHash_i == lenHash && equals(segment, table[idx + _off], off, len)) {
+                if (lenHash_i == lenHash && equals(off, table[idx + _off], len)) {
                     table[idx + _count] += count;
                     table[idx + _sum] += sum;
-                    table[idx + _min] = Math.min(min, table[idx + _min]);
-                    table[idx + _max] = Math.max(max, table[idx + _max]);
+                    table[idx + _min] = Math.min(min, (int) table[idx + _min]);
+                    table[idx + _max] = Math.max(max, (int) table[idx + _max]);
                     return;
                 }
             }
@@ -325,7 +332,7 @@ private void grow() {
                 throw new OutOfMemoryError("StatsTable capacity exceeded");
             }
             else {
-                var oldStats = clone();
+                var oldStats = new StatsTable(this);
                 pow2cap <<= 1;
                 table = new long[idx(pow2cap)];
                 loadedSize = 0;
@@ -333,16 +340,6 @@ private void grow() {
             }
         }
 
-        @Override
-        protected StatsTable clone() {
-            try {
-                return (StatsTable) super.clone();
-            }
-            catch (CloneNotSupportedException e) {
-                throw new InternalError(e);
-            }
-        }
-
         StatsTable reduce(StatsTable other) {
             other
                     .idxStream()
@@ -353,8 +350,8 @@ StatsTable reduce(StatsTable other) {
                                     hash(other.table[idx + _lenHash]),
                                     other.table[idx + _count],
                                     other.table[idx + _sum],
-                                    other.table[idx + _min],
-                                    other.table[idx + _max]));
+                                    (int) other.table[idx + _min],
+                                    (int) other.table[idx + _max]));
             return this;
         }
 

From b529ef2a59c7df8f435ff7b1fc91362457180498 Mon Sep 17 00:00:00 2001
From: Roman Stoffel <roman.stoffel@gamlor.info>
Date: Wed, 31 Jan 2024 18:13:08 +0100
Subject: [PATCH 224/268] Gamlerhart Last Update: Disabling GC (#636)

* Disable The GC

Sometimes cuts up to 1 second
off the runtime on my machine.

* Remove Confusing Byte-Order Parameter

Bytes have no Byte-Order ;)

* Provide More Memory to Run the 10K set

* Fix Comparison Function
---
 calculate_average_gamlerhart.sh                          | 2 +-
 .../dev/morling/onebrc/CalculateAverage_gamlerhart.java  | 9 ++++-----
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/calculate_average_gamlerhart.sh b/calculate_average_gamlerhart.sh
index c52a25bfb..7427ea657 100755
--- a/calculate_average_gamlerhart.sh
+++ b/calculate_average_gamlerhart.sh
@@ -15,5 +15,5 @@
 #  limitations under the License.
 #
 
-JAVA_OPTS="--enable-preview --add-modules=jdk.incubator.vector"
+JAVA_OPTS="--enable-preview --add-modules=jdk.incubator.vector -XX:+UnlockExperimentalVMOptions -XX:+UseEpsilonGC -Xmx512m -Xlog:all=error"
 java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_gamlerhart
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_gamlerhart.java b/src/main/java/dev/morling/onebrc/CalculateAverage_gamlerhart.java
index 5d0a4bdbb..4d44494b9 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_gamlerhart.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_gamlerhart.java
@@ -31,8 +31,7 @@
 
 import static java.lang.Double.doubleToRawLongBits;
 import static java.lang.Double.longBitsToDouble;
-import static java.lang.foreign.ValueLayout.JAVA_BYTE;
-import static java.lang.foreign.ValueLayout.JAVA_LONG_UNALIGNED;
+import static java.lang.foreign.ValueLayout.*;
 
 /**
  * Broad experiments in this implementation:
@@ -242,7 +241,7 @@ else if (isSameEntry(file, slotEntry, pos, len)) {
         private boolean isSameEntry(MemorySegment file, long slotEntry, long pos, int len) {
             long keyPos = (slotEntry & MASK_POS) >> SHIFT_POS;
             int keyLen = (int) (slotEntry & MASK_LEN);
-            var isSame = isSame(file, keyPos, pos, len);
+            var isSame = len == keyLen && isSame(file, keyPos, pos, len);
             return isSame;
         }
 
@@ -251,8 +250,8 @@ private static boolean isSame(MemorySegment file, long i1, long i2, int len) {
             var i1len = i1 + vecLen;
             var i2len = i2 + vecLen;
             if (len < vecLen && i1len <= file.byteSize() && i2len <= file.byteSize()) {
-                var v1 = byteVec.fromMemorySegment(file, i1, ByteOrder.BIG_ENDIAN);
-                var v2 = byteVec.fromMemorySegment(file, i2, ByteOrder.BIG_ENDIAN);
+                var v1 = byteVec.fromMemorySegment(file, i1, ByteOrder.nativeOrder());
+                var v2 = byteVec.fromMemorySegment(file, i2, ByteOrder.nativeOrder());
                 var isTrue = v1.compare(VectorOperators.EQ, v2, allTrue.indexInRange(0, len));
                 return isTrue.trueCount() == len;
             }

From 9b9bb8ed3f6ed7f754895e1754a3383c3379a9e2 Mon Sep 17 00:00:00 2001
From: Jaromir Hamala <jaromir.hamala@gmail.com>
Date: Wed, 31 Jan 2024 17:17:34 +0000
Subject: [PATCH 225/268] jerrinot - final(?) improvements (#690)

* decrease instruction level parallelism

it turns out doing 2 things was too much. perf annotate showed spilling.

* more trickery with latency hiding

* work-stealing, lookup tables, credits

* do not assume gender
---
 .../onebrc/CalculateAverage_jerrinot.java     | 347 +++++++++---------
 1 file changed, 182 insertions(+), 165 deletions(-)

diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_jerrinot.java b/src/main/java/dev/morling/onebrc/CalculateAverage_jerrinot.java
index df5defe71..6997f4896 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_jerrinot.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_jerrinot.java
@@ -24,6 +24,7 @@
 import java.lang.reflect.Field;
 import java.nio.channels.FileChannel.MapMode;
 import java.util.*;
+import java.util.concurrent.atomic.AtomicLong;
 
 /**
  * I figured out it would be very hard to win the main competition of the One Billion Rows Challenge.
@@ -31,17 +32,59 @@
  *
  * Anyway, if you can make sense out of not exactly idiomatic Java code, and you enjoy pushing performance limits
  * then QuestDB - the fastest open-source time-series database - is hiring: https://questdb.io/careers/core-database-engineer/
- *
+ * <p>
+ * <b>Credit</b>
+ * <p>
+ * I stand on the shoulders of giants. I wouldn't be able to code this without analyzing and borrowing from the solutions of others.
+ * People who helped me the most:
+ * <ul>
+ * <li>Thomas Wuerthinger (thomaswue): The munmap() trick and work-stealing. In both cases, I shamelessly copy-pasted their code.
+ *     Including SWAR for detecting new lines. Thomas also gave me helpful hints on how to detect register spilling issues.</li>
+ * <li>Quan Anh Mai (merykitty): I borrowed their phenomenal branch-free parser.</li>
+ * <li>Marko Topolnik (mtopolnik): I use a hashing function I saw in his code. It seems to produce good quality hashes
+ *     and it's next-level in speed. Marko joined the challenge before me and our discussions made me join too!</li>
+ * <li>Van Phu DO (abeobk): I saw the idea with simple lookup tables instead of complicated bit-twiddling in their code first.</li>
+ * <li>Roy van Rijn (royvanrijn): I borrowed their SWAR code and initially their hash code impl</li>
+ * <li>Francesco Nigro (franz1981): For our online discussions about performance. Both before and during this challenge.
+ *     Francesco gave me the idea to check register spilling.</li>
+ * </ul>
  */
 public class CalculateAverage_jerrinot {
     private static final Unsafe UNSAFE = unsafe();
     private static final String MEASUREMENTS_TXT = "measurements.txt";
     // todo: with hyper-threading enable we would be better of with availableProcessors / 2;
     // todo: validate the testing env. params.
-    private static final int THREAD_COUNT = Runtime.getRuntime().availableProcessors();
-    // private static final int THREAD_COUNT = 4;
+    private static final int EXTRA_THREAD_COUNT = Runtime.getRuntime().availableProcessors() - 1;
+    // private static final int THREAD_COUNT = 1;
 
     private static final long SEPARATOR_PATTERN = 0x3B3B3B3B3B3B3B3BL;
+    private static final long NEW_LINE_PATTERN = 0x0A0A0A0A0A0A0A0AL;
+    private static final int SEGMENT_SIZE = 4 * 1024 * 1024;
+
+    // credits for the idea with lookup tables instead of bit-shifting: abeobk
+    private static final long[] HASH_MASKS = new long[]{
+            0x0000000000000000L, // semicolon is the first char
+            0x00000000000000ffL,
+            0x000000000000ffffL,
+            0x0000000000ffffffL,
+            0x00000000ffffffffL,
+            0x000000ffffffffffL,
+            0x0000ffffffffffffL,
+            0x00ffffffffffffffL, // semicolon is the last char
+            0xffffffffffffffffL // there is no semicolon at all
+    };
+
+    private static final long[] ADVANCE_MASKS = new long[]{
+            0x0000000000000000L,
+            0x0000000000000000L,
+            0x0000000000000000L,
+            0x0000000000000000L,
+            0x0000000000000000L,
+            0x0000000000000000L,
+            0x0000000000000000L,
+            0x0000000000000000L,
+            0xffffffffffffffffL,
+    };
 
     private static Unsafe unsafe() {
         try {
@@ -81,56 +124,29 @@ private static void spawnWorker() throws IOException {
     static void calculate() throws Exception {
         final File file = new File(MEASUREMENTS_TXT);
         final long length = file.length();
-        // final int chunkCount = Runtime.getRuntime().availableProcessors();
-        int chunkPerThread = 3;
-        final int chunkCount = THREAD_COUNT * chunkPerThread;
-        final var chunkStartOffsets = new long[chunkCount + 1];
         try (var raf = new RandomAccessFile(file, "r")) {
-            // credit - chunking code: mtopolnik
-            final var inputBase = raf.getChannel().map(MapMode.READ_ONLY, 0, length, Arena.global()).address();
-            for (int i = 1; i < chunkStartOffsets.length - 1; i++) {
-                var start = length * i / (chunkStartOffsets.length - 1);
-                raf.seek(start);
-                while (raf.read() != (byte) '\n') {
-                }
-                start = raf.getFilePointer();
-                chunkStartOffsets[i] = start + inputBase;
-            }
-            chunkStartOffsets[0] = inputBase;
-            chunkStartOffsets[chunkCount] = inputBase + length;
+            long fileStart = raf.getChannel().map(MapMode.READ_ONLY, 0, length, Arena.global()).address();
+            long fileEnd = fileStart + length;
+            var globalCursor = new AtomicLong(fileStart);
 
-            Processor[] processors = new Processor[THREAD_COUNT];
-            Thread[] threads = new Thread[THREAD_COUNT];
+            Processor[] processors = new Processor[EXTRA_THREAD_COUNT];
+            Thread[] threads = new Thread[EXTRA_THREAD_COUNT];
 
-            for (int i = 0; i < THREAD_COUNT - 1; i++) {
-                long startA = chunkStartOffsets[i * chunkPerThread];
-                long endA = chunkStartOffsets[i * chunkPerThread + 1];
-                long startB = chunkStartOffsets[i * chunkPerThread + 1];
-                long endB = chunkStartOffsets[i * chunkPerThread + 2];
-                long startC = chunkStartOffsets[i * chunkPerThread + 2];
-                long endC = chunkStartOffsets[i * chunkPerThread + 3];
-
-                Processor processor = new Processor(startA, endA, startB, endB, startC, endC);
-                processors[i] = processor;
+            for (int i = 0; i < EXTRA_THREAD_COUNT; i++) {
+                Processor processor = new Processor(fileStart, fileEnd, globalCursor);
                 Thread thread = new Thread(processor);
+                processors[i] = processor;
                 threads[i] = thread;
                 thread.start();
             }
 
-            int ownIndex = THREAD_COUNT - 1;
-            long startA = chunkStartOffsets[ownIndex * chunkPerThread];
-            long endA = chunkStartOffsets[ownIndex * chunkPerThread + 1];
-            long startB = chunkStartOffsets[ownIndex * chunkPerThread + 1];
-            long endB = chunkStartOffsets[ownIndex * chunkPerThread + 2];
-            long startC = chunkStartOffsets[ownIndex * chunkPerThread + 2];
-            long endC = chunkStartOffsets[ownIndex * chunkPerThread + 3];
-            Processor processor = new Processor(startA, endA, startB, endB, startC, endC);
+            Processor processor = new Processor(fileStart, fileEnd, globalCursor);
             processor.run();
 
-            var accumulator = new TreeMap<String, Processor.StationStats>();
+            var accumulator = new TreeMap<String, StationStats>();
             processor.accumulateStatus(accumulator);
 
-            for (int i = 0; i < THREAD_COUNT - 1; i++) {
+            for (int i = 0; i < EXTRA_THREAD_COUNT; i++) {
                 Thread t = threads[i];
                 t.join();
                 processors[i].accumulateStatus(accumulator);
@@ -140,10 +156,10 @@ static void calculate() throws Exception {
         }
     }
 
-    private static void printResults(TreeMap<String, Processor.StationStats> accumulator) {
+    private static void printResults(TreeMap<String, StationStats> accumulator) {
         var sb = new StringBuilder(10000);
         boolean first = true;
-        for (Map.Entry<String, Processor.StationStats> statsEntry : accumulator.entrySet()) {
+        for (Map.Entry<String, StationStats> statsEntry : accumulator.entrySet()) {
             if (first) {
                 sb.append("{");
                 first = false;
@@ -210,20 +226,17 @@ private static class Processor implements Runnable {
         private static final int FAST_MAP_SIZE_BYTES = MAPS_SLOT_COUNT * FAST_MAP_ENTRY_SIZE_BYTES;
         private static final int SLOW_MAP_MAP_NAMES_BYTES = MAX_UNIQUE_KEYS * STATION_MAX_NAME_BYTES;
         private static final int MAP_MASK = MAPS_SLOT_COUNT - 1;
+        private final AtomicLong globalCursor;
 
         private long slowMap;
         private long slowMapNamesPtr;
-        private long slowMapNamesLo;
-        // private long fastMap;
         private long cursorA;
         private long endA;
         private long cursorB;
         private long endB;
-        private long cursorC;
-        private long endC;
-        private HashMap<String, StationStats> stats = new HashMap<>(1000);
-
-        // private long maxClusterLen;
+        private HashMap<String, CalculateAverage_jerrinot.StationStats> stats = new HashMap<>(1000);
+        private final long fileEnd;
+        private final long fileStart;
 
         // credit: merykitty
         private long parseAndStoreTemperature(long startCursor, long baseEntryPtr, long word) {
@@ -264,20 +277,12 @@ private static long getDelimiterMask(final long word) {
             return (match - 0x0101010101010101L) & (~match & 0x8080808080808080L);
         }
 
-        // todo: immutability cost us in allocations, but that's probably peanuts in the grand scheme of things. still worth checking
-        // maybe JVM trusting Final in Records offsets it ..a test is needed
-        record StationStats(int min, int max, int count, long sum) {
-            StationStats mergeWith(StationStats other) {
-                return new StationStats(Math.min(min, other.min), Math.max(max, other.max), count + other.count, sum + other.sum);
-            }
-        }
-
-        void accumulateStatus(TreeMap<String, StationStats> accumulator) {
-            for (Map.Entry<String, StationStats> entry : stats.entrySet()) {
+        void accumulateStatus(TreeMap<String, CalculateAverage_jerrinot.StationStats> accumulator) {
+            for (Map.Entry<String, CalculateAverage_jerrinot.StationStats> entry : stats.entrySet()) {
                 String name = entry.getKey();
-                StationStats localStats = entry.getValue();
+                CalculateAverage_jerrinot.StationStats localStats = entry.getValue();
 
-                StationStats globalStats = accumulator.get(name);
+                CalculateAverage_jerrinot.StationStats globalStats = accumulator.get(name);
                 if (globalStats == null) {
                     accumulator.put(name, localStats);
                 }
@@ -287,24 +292,10 @@ void accumulateStatus(TreeMap<String, StationStats> accumulator) {
             }
         }
 
-        Processor(long startA, long endA, long startB, long endB, long startC, long endC) {
-            this.cursorA = startA;
-            this.cursorB = startB;
-            this.cursorC = startC;
-            this.endA = endA;
-            this.endB = endB;
-            this.endC = endC;
-        }
-
-        private void doTail(long fastMAp) {
-            doOne(cursorA, endA);
-            doOne(cursorB, endB);
-            doOne(cursorC, endC);
-
-            transferToHeap(fastMAp);
-            // UNSAFE.freeMemory(fastMap);
-            // UNSAFE.freeMemory(slowMap);
-            // UNSAFE.freeMemory(slowMapNamesLo);
+        Processor(long fileStart, long fileEnd, AtomicLong globalCursor) {
+            this.globalCursor = globalCursor;
+            this.fileEnd = fileEnd;
+            this.fileStart = fileStart;
         }
 
         private void transferToHeap(long fastMap) {
@@ -324,7 +315,7 @@ private void transferToHeap(long fastMap) {
                 int count = UNSAFE.getInt(baseAddress + MAP_COUNT_OFFSET);
                 long sum = UNSAFE.getLong(baseAddress + MAP_SUM_OFFSET);
 
-                stats.put(name, new StationStats(min, max, count, sum));
+                stats.put(name, new CalculateAverage_jerrinot.StationStats(min, max, count, sum));
             }
 
             for (long baseAddress = fastMap; baseAddress < fastMap + FAST_MAP_SIZE_BYTES; baseAddress += FAST_MAP_ENTRY_SIZE_BYTES) {
@@ -345,16 +336,21 @@ private void transferToHeap(long fastMap) {
 
                 var v = stats.get(name);
                 if (v == null) {
-                    stats.put(name, new StationStats(min, max, count, sum));
+                    stats.put(name, new CalculateAverage_jerrinot.StationStats(min, max, count, sum));
                 }
                 else {
-                    stats.put(name, new StationStats(Math.min(v.min, min), Math.max(v.max, max), v.count + count, v.sum + sum));
+                    stats.put(name, new CalculateAverage_jerrinot.StationStats(Math.min(v.min, min), Math.max(v.max, max), v.count + count, v.sum + sum));
                 }
             }
         }
 
-        private void doOne(long cursor, long endA) {
-            while (cursor < endA) {
+        private void doOne(long cursor, long end) {
+            while (cursor < end) {
+                // it seems that when pulling just from a single chunk
+                // then bit-twiddling is faster than lookup tables
+                // hypothesis: when processing multiple things at once then LOAD latency is partially hidden
+                // but when processing just one thing then it's better to keep things local as much as possible? maybe:)
+
                 long start = cursor;
                 long currentWord = UNSAFE.getLong(cursor);
                 long mask = getDelimiterMask(currentWord);
@@ -392,135 +388,139 @@ private static int hash(long word) {
             return (int) hash;
         }
 
+        private static long nextNewLine(long prev) {
+            // again: credits to @thomaswue for this code, literally copy'n'paste
+            while (true) {
+                long currentWord = UNSAFE.getLong(prev);
+                long input = currentWord ^ NEW_LINE_PATTERN;
+                long pos = (input - 0x0101010101010101L) & ~input & 0x8080808080808080L;
+                if (pos != 0) {
+                    prev += Long.numberOfTrailingZeros(pos) >>> 3;
+                    break;
+                }
+                else {
+                    prev += 8;
+                }
+            }
+            return prev;
+        }
+
         @Override
         public void run() {
+            long fastMap = allocateMem();
+            for (;;) {
+                long startingPtr = globalCursor.addAndGet(SEGMENT_SIZE) - SEGMENT_SIZE;
+                if (startingPtr >= fileEnd) {
+                    break;
+                }
+                setCursors(startingPtr);
+                mainLoop(fastMap);
+                doOne(cursorA, endA);
+                doOne(cursorB, endB);
+            }
+            transferToHeap(fastMap);
+        }
+
+        private long allocateMem() {
             this.slowMap = UNSAFE.allocateMemory(SLOW_MAP_SIZE_BYTES);
             this.slowMapNamesPtr = UNSAFE.allocateMemory(SLOW_MAP_MAP_NAMES_BYTES);
-            this.slowMapNamesLo = slowMapNamesPtr;
             long fastMap = UNSAFE.allocateMemory(FAST_MAP_SIZE_BYTES);
             UNSAFE.setMemory(slowMap, SLOW_MAP_SIZE_BYTES, (byte) 0);
             UNSAFE.setMemory(fastMap, FAST_MAP_SIZE_BYTES, (byte) 0);
             UNSAFE.setMemory(slowMapNamesPtr, SLOW_MAP_MAP_NAMES_BYTES, (byte) 0);
+            return fastMap;
+        }
 
-            while (cursorA < endA && cursorB < endB && cursorC < endC) {
+        private void mainLoop(long fastMap) {
+            while (cursorA < endA && cursorB < endB) {
                 long currentWordA = UNSAFE.getLong(cursorA);
                 long currentWordB = UNSAFE.getLong(cursorB);
-                long currentWordC = UNSAFE.getLong(cursorC);
 
-                long startA = cursorA;
-                long startB = cursorB;
-                long startC = cursorC;
+                long delimiterMaskA = getDelimiterMask(currentWordA);
+                long delimiterMaskB = getDelimiterMask(currentWordB);
 
-                long maskA = getDelimiterMask(currentWordA);
-                long maskB = getDelimiterMask(currentWordB);
-                long maskC = getDelimiterMask(currentWordC);
+                long candidateWordA = UNSAFE.getLong(cursorA + 8);
+                long candidateWordB = UNSAFE.getLong(cursorB + 8);
 
-                long maskComplementA = -maskA;
-                long maskComplementB = -maskB;
-                long maskComplementC = -maskC;
+                long startA = cursorA;
+                long startB = cursorB;
 
-                long maskWithDelimiterA = (maskA ^ (maskA - 1));
-                long maskWithDelimiterB = (maskB ^ (maskB - 1));
-                long maskWithDelimiterC = (maskC ^ (maskC - 1));
+                int trailingZerosA = Long.numberOfTrailingZeros(delimiterMaskA) >> 3;
+                int trailingZerosB = Long.numberOfTrailingZeros(delimiterMaskB) >> 3;
 
-                long isMaskZeroA = (((maskA | maskComplementA) >>> 63) ^ 1);
-                long isMaskZeroB = (((maskB | maskComplementB) >>> 63) ^ 1);
-                long isMaskZeroC = (((maskC | maskComplementC) >>> 63) ^ 1);
+                long advanceMaskA = ADVANCE_MASKS[trailingZerosA];
+                long advanceMaskB = ADVANCE_MASKS[trailingZerosB];
 
-                cursorA += isMaskZeroA << 3;
-                cursorB += isMaskZeroB << 3;
-                cursorC += isMaskZeroC << 3;
+                long wordMaskA = HASH_MASKS[trailingZerosA];
+                long wordMaskB = HASH_MASKS[trailingZerosB];
 
-                long nextWordA = UNSAFE.getLong(cursorA);
-                long nextWordB = UNSAFE.getLong(cursorB);
-                long nextWordC = UNSAFE.getLong(cursorC);
+                long negAdvanceMaskA = ~advanceMaskA;
+                long negAdvanceMaskB = ~advanceMaskB;
 
-                long firstWordMaskA = maskWithDelimiterA >>> 8;
-                long firstWordMaskB = maskWithDelimiterB >>> 8;
-                long firstWordMaskC = maskWithDelimiterC >>> 8;
+                cursorA += advanceMaskA & 8;
+                cursorB += advanceMaskB & 8;
 
-                long nextMaskA = getDelimiterMask(nextWordA);
-                long nextMaskB = getDelimiterMask(nextWordB);
-                long nextMaskC = getDelimiterMask(nextWordC);
+                long nextWordA = (advanceMaskA & candidateWordA) | (negAdvanceMaskA & currentWordA);
+                long nextWordB = (advanceMaskB & candidateWordB) | (negAdvanceMaskB & currentWordB);
 
-                boolean slowA = nextMaskA == 0;
-                boolean slowB = nextMaskB == 0;
-                boolean slowC = nextMaskC == 0;
-                boolean slowSome = (slowA || slowB || slowC);
+                long nextDelimiterMaskA = getDelimiterMask(nextWordA);
+                long nextDelimiterMaskB = getDelimiterMask(nextWordB);
 
-                long extA = -isMaskZeroA;
-                long extB = -isMaskZeroB;
-                long extC = -isMaskZeroC;
+                boolean slowA = nextDelimiterMaskA == 0;
+                boolean slowB = nextDelimiterMaskB == 0;
+                boolean slowSome = (slowA || slowB);
 
-                long maskedFirstWordA = (extA | firstWordMaskA) & currentWordA;
-                long maskedFirstWordB = (extB | firstWordMaskB) & currentWordB;
-                long maskedFirstWordC = (extC | firstWordMaskC) & currentWordC;
+                long maskedFirstWordA = wordMaskA & currentWordA;
+                long maskedFirstWordB = wordMaskB & currentWordB;
 
                 int hashA = hash(maskedFirstWordA);
                 int hashB = hash(maskedFirstWordB);
-                int hashC = hash(maskedFirstWordC);
 
                 currentWordA = nextWordA;
                 currentWordB = nextWordB;
-                currentWordC = nextWordC;
 
-                maskA = nextMaskA;
-                maskB = nextMaskB;
-                maskC = nextMaskC;
+                delimiterMaskA = nextDelimiterMaskA;
+                delimiterMaskB = nextDelimiterMaskB;
                 if (slowSome) {
-                    while (maskA == 0) {
+                    while (delimiterMaskA == 0) {
                         cursorA += 8;
                         currentWordA = UNSAFE.getLong(cursorA);
-                        maskA = getDelimiterMask(currentWordA);
+                        delimiterMaskA = getDelimiterMask(currentWordA);
                     }
 
-                    while (maskB == 0) {
+                    while (delimiterMaskB == 0) {
                         cursorB += 8;
                         currentWordB = UNSAFE.getLong(cursorB);
-                        maskB = getDelimiterMask(currentWordB);
-                    }
-                    while (maskC == 0) {
-                        cursorC += 8;
-                        currentWordC = UNSAFE.getLong(cursorC);
-                        maskC = getDelimiterMask(currentWordC);
+                        delimiterMaskB = getDelimiterMask(currentWordB);
                     }
                 }
 
-                final int delimiterByteA = Long.numberOfTrailingZeros(maskA);
-                final int delimiterByteB = Long.numberOfTrailingZeros(maskB);
-                final int delimiterByteC = Long.numberOfTrailingZeros(maskC);
+                trailingZerosA = Long.numberOfTrailingZeros(delimiterMaskA) >> 3;
+                trailingZerosB = Long.numberOfTrailingZeros(delimiterMaskB) >> 3;
 
-                final long semicolonA = cursorA + (delimiterByteA >> 3);
-                final long semicolonB = cursorB + (delimiterByteB >> 3);
-                final long semicolonC = cursorC + (delimiterByteC >> 3);
+                final long semicolonA = cursorA + trailingZerosA;
+                final long semicolonB = cursorB + trailingZerosB;
 
                 long digitStartA = semicolonA + 1;
                 long digitStartB = semicolonB + 1;
-                long digitStartC = semicolonC + 1;
+
+                long lastWordMaskA = HASH_MASKS[trailingZerosA];
+                long lastWordMaskB = HASH_MASKS[trailingZerosB];
 
                 long temperatureWordA = UNSAFE.getLong(digitStartA);
                 long temperatureWordB = UNSAFE.getLong(digitStartB);
-                long temperatureWordC = UNSAFE.getLong(digitStartC);
-
-                long lastWordMaskA = ((maskA - 1) ^ maskA) >>> 8;
-                long lastWordMaskB = ((maskB - 1) ^ maskB) >>> 8;
-                long lastWordMaskC = ((maskC - 1) ^ maskC) >>> 8;
 
                 final long maskedLastWordA = currentWordA & lastWordMaskA;
                 final long maskedLastWordB = currentWordB & lastWordMaskB;
-                final long maskedLastWordC = currentWordC & lastWordMaskC;
 
                 int lenA = (int) (semicolonA - startA);
                 int lenB = (int) (semicolonB - startB);
-                int lenC = (int) (semicolonC - startC);
 
                 int mapIndexA = hashA & MAP_MASK;
                 int mapIndexB = hashB & MAP_MASK;
-                int mapIndexC = hashC & MAP_MASK;
 
                 long baseEntryPtrA;
                 long baseEntryPtrB;
-                long baseEntryPtrC;
 
                 if (slowSome) {
                     if (slowA) {
@@ -537,25 +537,37 @@ public void run() {
                         baseEntryPtrB = getOrCreateEntryBaseOffsetFast(mapIndexB, lenB, maskedLastWordB, maskedFirstWordB, fastMap);
                     }
 
-                    if (slowC) {
-                        baseEntryPtrC = getOrCreateEntryBaseOffsetSlow(lenC, startC, hashC, maskedLastWordC);
-                    }
-                    else {
-                        baseEntryPtrC = getOrCreateEntryBaseOffsetFast(mapIndexC, lenC, maskedLastWordC, maskedFirstWordC, fastMap);
-                    }
                 }
                 else {
                     baseEntryPtrA = getOrCreateEntryBaseOffsetFast(mapIndexA, lenA, maskedLastWordA, maskedFirstWordA, fastMap);
                     baseEntryPtrB = getOrCreateEntryBaseOffsetFast(mapIndexB, lenB, maskedLastWordB, maskedFirstWordB, fastMap);
-                    baseEntryPtrC = getOrCreateEntryBaseOffsetFast(mapIndexC, lenC, maskedLastWordC, maskedFirstWordC, fastMap);
                 }
 
                 cursorA = parseAndStoreTemperature(digitStartA, baseEntryPtrA, temperatureWordA);
                 cursorB = parseAndStoreTemperature(digitStartB, baseEntryPtrB, temperatureWordB);
-                cursorC = parseAndStoreTemperature(digitStartC, baseEntryPtrC, temperatureWordC);
             }
-            doTail(fastMap);
-            // System.out.println("Longest chain: " + longestChain);
+        }
+
+        private void setCursors(long current) {
+            // Credit for the whole work-stealing scheme: @thomaswue
+            // I have totally stolen it from him. I changed the order a bit to suit my taste better,
+            // but it's his code
+            long segmentStart;
+            if (current == fileStart) {
+                segmentStart = current;
+            }
+            else {
+                segmentStart = nextNewLine(current) + 1;
+            }
+            long segmentEnd = nextNewLine(Math.min(fileEnd - 1, current + SEGMENT_SIZE));
+
+            long size = (segmentEnd - segmentStart) / 2;
+            long mid = nextNewLine(segmentStart + size);
+
+            cursorA = segmentStart;
+            endA = mid;
+            cursorB = mid + 1;
+            endB = segmentEnd;
         }
 
         private static long getOrCreateEntryBaseOffsetFast(int mapIndexA, int lenA, long maskedLastWord, long maskedFirstWord, long fastMap) {
@@ -625,4 +637,9 @@ private static boolean nameMatchSlow(long start, long namePtr, long fullLen, lon
         }
     }
 
+    record StationStats(int min, int max, int count, long sum) {
+        StationStats mergeWith(StationStats other) {
+            return new StationStats(Math.min(min, other.min), Math.max(max, other.max), count + other.count, sum + other.sum);
+        }
+    }
 }

From 6a2e5058af0954a79068000bd1a0eb1d26e21cf3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Serkan=20=C3=96ZAL?= <sozal@catchpoint.com>
Date: Wed, 31 Jan 2024 20:21:25 +0300
Subject: [PATCH 226/268] serkan-ozal's 7th submission: (#679)

- use smaller regions (increased region count) so there will be less idle time for the workers who completed their tasks
- get rid of some configuration related stuff during initialization which might save a few tens of milliseconds hopefully
- reorder the temperature value parsing instructions to benefit more from ILP (hopefully)
---
 calculate_average_serkan-ozal.sh              |  6 +-
 .../onebrc/CalculateAverage_serkan_ozal.java  | 77 ++++++++++---------
 2 files changed, 43 insertions(+), 40 deletions(-)

diff --git a/calculate_average_serkan-ozal.sh b/calculate_average_serkan-ozal.sh
index cce366fca..3cfbb661d 100755
--- a/calculate_average_serkan-ozal.sh
+++ b/calculate_average_serkan-ozal.sh
@@ -18,7 +18,7 @@
 JAVA_OPTS="--enable-preview --enable-native-access=ALL-UNNAMED --add-modules=jdk.incubator.vector "
 JAVA_OPTS="$JAVA_OPTS -XX:+UnlockExperimentalVMOptions -XX:+UnlockDiagnosticVMOptions"
 JAVA_OPTS="$JAVA_OPTS -XX:-TieredCompilation -XX:MaxInlineSize=10000 -XX:InlineSmallCode=10000 -XX:FreqInlineSize=10000"
-JAVA_OPTS="$JAVA_OPTS -XX:-UseCountedLoopSafepoints -XX:GuaranteedSafepointInterval=0"
+JAVA_OPTS="$JAVA_OPTS -XX:-UseCountedLoopSafepoints -XX:LoopStripMiningIter=0 -XX:GuaranteedSafepointInterval=0"
 JAVA_OPTS="$JAVA_OPTS -XX:+TrustFinalNonStaticFields -da -dsa -XX:+UseNUMA -XX:-EnableJVMCI"
 JAVA_OPTS="$JAVA_OPTS -XX:SharedArchiveFile=target/CalculateAverage_serkan_ozal_cds.jsa"
 JAVA_OPTS="$JAVA_OPTS -Djdk.incubator.vector.VECTOR_ACCESS_OOB_CHECK=0"
@@ -26,10 +26,8 @@ if [[ ! "$(uname -s)" = "Darwin" ]]; then
   JAVA_OPTS="$JAVA_OPTS -XX:+UseTransparentHugePages"
 fi
 
-CONFIGS="USE_SHARED_ARENA=true USE_SHARED_REGION=true CLOSE_STDOUT_ON_RESULT=true REGION_COUNT=128"
-
 #echo "Process started at $(date +%s%N | cut -b1-13)"
-eval "exec 3< <({ $CONFIGS java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_serkan_ozal; })"
+eval "exec 3< <({ java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_serkan_ozal; })"
 read <&3 result
 echo -e "$result"
 #echo "Process finished at $(date +%s%N | cut -b1-13)"
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_serkan_ozal.java b/src/main/java/dev/morling/onebrc/CalculateAverage_serkan_ozal.java
index 53258161e..e4f5aaa82 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_serkan_ozal.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_serkan_ozal.java
@@ -68,15 +68,15 @@ public class CalculateAverage_serkan_ozal {
 
     // Get configurations
     ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-    private static final boolean VERBOSE = getBooleanConfig("VERBOSE", false);
-    private static final int THREAD_COUNT = getIntegerConfig("THREAD_COUNT", Runtime.getRuntime().availableProcessors());
-    private static final boolean USE_VTHREADS = getBooleanConfig("USE_VTHREADS", false);
-    private static final int VTHREAD_COUNT = getIntegerConfig("VTHREAD_COUNT", 1024);
-    private static final int REGION_COUNT = getIntegerConfig("REGION_COUNT", -1);
-    private static final boolean USE_SHARED_ARENA = getBooleanConfig("USE_SHARED_ARENA", true);
-    private static final boolean USE_SHARED_REGION = getBooleanConfig("USE_SHARED_REGION", true);
-    private static final int MAP_CAPACITY = getIntegerConfig("MAP_CAPACITY", 1 << 17);
-    private static final boolean CLOSE_STDOUT_ON_RESULT = getBooleanConfig("CLOSE_STDOUT_ON_RESULT", true);
+    private static final boolean VERBOSE = false; // getBooleanConfig("VERBOSE", false);
+    private static final int THREAD_COUNT = Runtime.getRuntime().availableProcessors(); // getIntegerConfig("THREAD_COUNT", Runtime.getRuntime().availableProcessors());
+    private static final boolean USE_VTHREADS = false; // getBooleanConfig("USE_VTHREADS", false);
+    private static final int VTHREAD_COUNT = 1024; // getIntegerConfig("VTHREAD_COUNT", 1024);
+    private static final int REGION_COUNT = 256; // getIntegerConfig("REGION_COUNT", -1);
+    private static final boolean USE_SHARED_ARENA = true; // getBooleanConfig("USE_SHARED_ARENA", true);
+    private static final boolean USE_SHARED_REGION = true; // getBooleanConfig("USE_SHARED_REGION", true);
+    private static final int MAP_CAPACITY = 1 << 17; // getIntegerConfig("MAP_CAPACITY", 1 << 17);
+    private static final boolean CLOSE_STDOUT_ON_RESULT = true; // getBooleanConfig("CLOSE_STDOUT_ON_RESULT", true);
     ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
     // My dear old friend Unsafe
@@ -346,10 +346,16 @@ private long findClosestLineEnd(long endPos, long minPos) {
         // Credits: merykitty
         private long extractValue(long regionPtr, long word, OpenMap map, int entryOffset) {
             // Parse and extract value
+
+            // Level 1 instructions (no dependencies between them, so they can be executed in parallel)
+            long signed = (~word << 59) >> 63;
             int decimalSepPos = Long.numberOfTrailingZeros(~word & 0x10101000);
+
+            // Level 2 instructions (no dependencies between them, so they can be executed in parallel)
+            long nextPtr = regionPtr + (decimalSepPos >>> 3) + 3;
             int shift = 28 - decimalSepPos;
-            long signed = (~word << 59) >> 63;
             long designMask = ~(signed & 0xFF);
+
             long digits = ((word & designMask) << shift) & 0x0F000F0F00L;
             long absValue = ((digits * 0x640a0001) >>> 32) & 0x3FF;
             int value = (int) ((absValue ^ signed) - signed);
@@ -358,12 +364,10 @@ private long extractValue(long regionPtr, long word, OpenMap map, int entryOffse
             map.putValue(entryOffset, value);
 
             // Return new position
-            return regionPtr + (decimalSepPos >>> 3) + 3;
+            return nextPtr;
         }
 
         private void doProcessRegion(long regionStart, long regionEnd) {
-            final int vectorSize = BYTE_SPECIES.vectorByteSize();
-
             final long size = regionEnd - regionStart;
             final long segmentSize = size / 2;
 
@@ -392,26 +396,26 @@ private void doProcessRegion(long regionStart, long regionEnd) {
                 int keyLength1 = keyVector1.compare(VectorOperators.EQ, KEY_VALUE_SEPARATOR).firstTrue();
                 int keyLength2 = keyVector2.compare(VectorOperators.EQ, KEY_VALUE_SEPARATOR).firstTrue();
 
-                if (keyLength1 != vectorSize && keyLength2 != vectorSize) {
+                if (keyLength1 != BYTE_SPECIES_SIZE && keyLength2 != BYTE_SPECIES_SIZE) {
                     regionPtr1 += (keyLength1 + 1);
                     regionPtr2 += (keyLength2 + 1);
                 }
                 else {
-                    if (keyLength1 != vectorSize) {
+                    if (keyLength1 != BYTE_SPECIES_SIZE) {
                         regionPtr1 += (keyLength1 + 1);
                     }
                     else {
-                        regionPtr1 += vectorSize;
+                        regionPtr1 += BYTE_SPECIES_SIZE;
                         for (; U.getByte(regionPtr1) != KEY_VALUE_SEPARATOR; regionPtr1++)
                             ;
                         keyLength1 = (int) (regionPtr1 - keyStartPtr1);
                         regionPtr1++;
                     }
-                    if (keyLength2 != vectorSize) {
+                    if (keyLength2 != BYTE_SPECIES_SIZE) {
                         regionPtr2 += (keyLength2 + 1);
                     }
                     else {
-                        regionPtr2 += vectorSize;
+                        regionPtr2 += BYTE_SPECIES_SIZE;
                         for (; U.getByte(regionPtr2) != KEY_VALUE_SEPARATOR; regionPtr2++)
                             ;
                         keyLength2 = (int) (regionPtr2 - keyStartPtr2);
@@ -431,28 +435,28 @@ private void doProcessRegion(long regionStart, long regionEnd) {
                 // Calculate key hashes and find entry indexes
                 ////////////////////////////////////////////////////////////////////////////////////////////////////////
                 int x1, y1, x2, y2;
-                if (keyLength1 >= Integer.BYTES && keyLength2 >= Integer.BYTES) {
+                if (keyLength1 > 3 && keyLength2 > 3) {
                     x1 = U.getInt(keyStartPtr1);
-                    y1 = U.getInt(keyStartPtr1 + keyLength1 - Integer.BYTES);
+                    y1 = U.getInt(regionPtr1 - 5);
                     x2 = U.getInt(keyStartPtr2);
-                    y2 = U.getInt(keyStartPtr2 + keyLength2 - Integer.BYTES);
+                    y2 = U.getInt(regionPtr2 - 5);
                 }
                 else {
-                    if (keyLength1 >= Integer.BYTES) {
+                    if (keyLength1 > 3) {
                         x1 = U.getInt(keyStartPtr1);
-                        y1 = U.getInt(keyStartPtr1 + keyLength1 - Integer.BYTES);
+                        y1 = U.getInt(regionPtr1 - 5);
                     }
                     else {
                         x1 = U.getByte(keyStartPtr1);
-                        y1 = U.getByte(keyStartPtr1 + keyLength1 - Byte.BYTES);
+                        y1 = U.getByte(regionPtr1 - 2);
                     }
-                    if (keyLength2 >= Integer.BYTES) {
+                    if (keyLength2 > 3) {
                         x2 = U.getInt(keyStartPtr2);
-                        y2 = U.getInt(keyStartPtr2 + keyLength2 - Integer.BYTES);
+                        y2 = U.getInt(regionPtr2 - 5);
                     }
                     else {
                         x2 = U.getByte(keyStartPtr2);
-                        y2 = U.getByte(keyStartPtr2 + keyLength2 - Byte.BYTES);
+                        y2 = U.getByte(regionPtr2 - 2);
                     }
                 }
 
@@ -477,19 +481,19 @@ private void doProcessRegion(long regionStart, long regionEnd) {
             }
 
             // Read and process region - tail
-            doProcessTail(regionPtr1, regionEnd1, regionPtr2, regionEnd2, vectorSize);
+            doProcessTail(regionPtr1, regionEnd1, regionPtr2, regionEnd2);
         }
 
-        private void doProcessTail(long regionPtr1, long regionEnd1, long regionPtr2, long regionEnd2, int vectorSize) {
+        private void doProcessTail(long regionPtr1, long regionEnd1, long regionPtr2, long regionEnd2) {
             while (regionPtr1 < regionEnd1) {
                 long keyStartPtr1 = regionPtr1;
                 ByteVector keyVector1 = ByteVector.fromMemorySegment(BYTE_SPECIES, NULL, regionPtr1, NATIVE_BYTE_ORDER);
                 int keyLength1 = keyVector1.compare(VectorOperators.EQ, KEY_VALUE_SEPARATOR).firstTrue();
-                if (keyLength1 != vectorSize) {
+                if (keyLength1 != BYTE_SPECIES_SIZE) {
                     regionPtr1 += (keyLength1 + 1);
                 }
                 else {
-                    regionPtr1 += vectorSize;
+                    regionPtr1 += BYTE_SPECIES_SIZE;
                     for (; U.getByte(regionPtr1) != KEY_VALUE_SEPARATOR; regionPtr1++)
                         ;
                     keyLength1 = (int) (regionPtr1 - keyStartPtr1);
@@ -507,11 +511,11 @@ private void doProcessTail(long regionPtr1, long regionEnd1, long regionPtr2, lo
                 long keyStartPtr2 = regionPtr2;
                 ByteVector keyVector2 = ByteVector.fromMemorySegment(BYTE_SPECIES, NULL, regionPtr2, NATIVE_BYTE_ORDER);
                 int keyLength2 = keyVector2.compare(VectorOperators.EQ, KEY_VALUE_SEPARATOR).firstTrue();
-                if (keyLength2 != vectorSize) {
+                if (keyLength2 != BYTE_SPECIES_SIZE) {
                     regionPtr2 += (keyLength2 + 1);
                 }
                 else {
-                    regionPtr2 += vectorSize;
+                    regionPtr2 += BYTE_SPECIES_SIZE;
                     for (; U.getByte(regionPtr2) != KEY_VALUE_SEPARATOR; regionPtr2++)
                         ;
                     keyLength2 = (int) (regionPtr2 - keyStartPtr2);
@@ -804,16 +808,17 @@ else if (keyLength <= BYTE_SPECIES_SIZE) {
 
         private void putValue(int entryOffset, int value) {
             int countOffset = entryOffset + COUNT_OFFSET;
-            U.putInt(data, countOffset, U.getInt(data, countOffset) + 1);
             int minValueOffset = entryOffset + MIN_VALUE_OFFSET;
+            int maxValueOffset = entryOffset + MAX_VALUE_OFFSET;
+            int sumOffset = entryOffset + VALUE_SUM_OFFSET;
+
+            U.putInt(data, countOffset, U.getInt(data, countOffset) + 1);
             if (value < U.getShort(data, minValueOffset)) {
                 U.putShort(data, minValueOffset, (short) value);
             }
-            int maxValueOffset = entryOffset + MAX_VALUE_OFFSET;
             if (value > U.getShort(data, maxValueOffset)) {
                 U.putShort(data, maxValueOffset, (short) value);
             }
-            int sumOffset = entryOffset + VALUE_SUM_OFFSET;
             U.putLong(data, sumOffset, U.getLong(data, sumOffset) + value);
         }
 

From f0f6570975ef50ced4fb89e09743a483ce66e832 Mon Sep 17 00:00:00 2001
From: Gunnar Morling <gunnar.morling@googlemail.com>
Date: Wed, 31 Jan 2024 18:29:36 +0100
Subject: [PATCH 227/268] Leaderboard update

---
 README.md | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index 4c29c38de..61a791790 100644
--- a/README.md
+++ b/README.md
@@ -47,11 +47,11 @@ These are the results from running all entries into the challenge on eight cores
 
 | # | Result (m:s.ms) | Implementation     | JDK | Submitter     | Notes     |
 |---|-----------------|--------------------|-----|---------------|-----------|
-| 1 | 00:01.832 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_thomaswue.java)| 21.0.2-graal | [Thomas Wuerthinger](https://github.com/thomaswue), [Quan Anh Mai](https://github.com/merykitty), [Alfonso² Peterssen](https://github.com/mukel) | GraalVM native binary, uses Unsafe |
-| 2 | 00:01.926 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java)| 21.0.2-graal | [Artsiom Korzun](https://github.com/artsiomkorzun) | GraalVM native binary, uses Unsafe |
-| 3 | 00:01.948 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_serkan_ozal.java)| 21.0.1-open | [Serkan ÖZAL](https://github.com/serkan-ozal) | uses Unsafe |
+| 1 | 00:01.645 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jerrinot.java)| 21.0.2-graal | [Jaromir Hamala](https://github.com/jerrinot) | GraalVM native binary, uses Unsafe |
+| 2 | 00:01.832 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_thomaswue.java)| 21.0.2-graal | [Thomas Wuerthinger](https://github.com/thomaswue), [Quan Anh Mai](https://github.com/merykitty), [Alfonso² Peterssen](https://github.com/mukel) | GraalVM native binary, uses Unsafe |
+| 3 | 00:01.880 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_serkan_ozal.java)| 21.0.1-open | [Serkan ÖZAL](https://github.com/serkan-ozal) | uses Unsafe |
+|   | 00:01.926 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java)| 21.0.2-graal | [Artsiom Korzun](https://github.com/artsiomkorzun) | GraalVM native binary, uses Unsafe |
 |   | 00:01.970 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java)| 21.0.2-graal | [Van Phu DO](https://github.com/abeobk) | GraalVM native binary, uses Unsafe |
-|   | 00:02.081 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jerrinot.java)| 21.0.2-graal | [Jaromir Hamala](https://github.com/jerrinot) | GraalVM native binary, uses Unsafe |
 |   | 00:02.157 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_royvanrijn.java)| 21.0.2-graal | [Roy van Rijn](https://github.com/royvanrijn) | GraalVM native binary, uses Unsafe |
 |   | 00:02.205 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_tivrfoa.java)| 21.0.2-graal | [tivrfoa](https://github.com/tivrfoa) | GraalVM native binary, uses Unsafe |
 |   | 00:02.319 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_yavuztas.java)| 21.0.2-graal | [Yavuz Tas](https://github.com/yavuztas) | GraalVM native binary, uses Unsafe |
@@ -73,6 +73,8 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:04.066 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_JesseVanRooy.java)| 21.0.1-open | [JesseVanRooy](https://github.com/JesseVanRooy) | uses Unsafe |
 |   | 00:04.101 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_JaimePolidura.java)| 21.0.2-graal | [Jaime Polidura](https://github.com/JaimePolidura) | GraalVM native binary, uses Unsafe |
 |   | 00:04.209 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_giovannicuccu.java)| 21.0.1-open | [Giovanni Cuccu](https://github.com/giovannicuccu) |  |
+|   | 00:04.474 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_gamlerhart.java)| 21.0.1-open | [Roman Stoffel](https://github.com/gamlerhart) |  |
+|   | 00:04.676 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_plevart.java)| 21.0.2-tem | [Peter Levart](https://github.com/plevart) |  |
 |   | 00:04.684 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_gigiblender.java)| 21.0.1-open | [Florin Blanaru](https://github.com/gigiblender) | uses Unsafe |
 |   | 00:04.741 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_cliffclick.java)| 21.0.1-open | [Cliff Click](https://github.com/cliffclick) | uses Unsafe |
 |   | 00:04.800 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_parkertimmins.java)| 21.0.1-open | [Parker Timmins](https://github.com/parkertimmins) |  |
@@ -81,14 +83,12 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:04.920 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_vemana.java)| 21.0.1-graal | [Subrahmanyam](https://github.com/vemana) |  |
 |   | 00:05.077 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jonathanaotearoa.java)| 21.0.2-graal | [Jonathan Wright](https://github.com/jonathan-aotearoa) | GraalVM native binary, uses Unsafe |
 |   | 00:05.142 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_arjenw.java)| 21.0.1-open | [Arjen Wisse](https://github.com/arjenw) |  |
+|   | 00:05.167 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_melgenek.java)| 21.0.2-open | [Yevhenii Melnyk](https://github.com/melgenek) |  |
 |   | 00:05.235 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_unbounded.java)| 21.0.1-open | [unbounded](https://github.com/unbounded) |  |
-|   | 00:05.336 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_plevart.java)| 21.0.1-tem | [Peter Levart](https://github.com/plevart) |  |
 |   | 00:05.354 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_armandino.java)| 21.0.2-graal | [Arman Sharif](https://github.com/armandino) | GraalVM native binary, uses Unsafe |
 |   | 00:05.478 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_obourgain.java)| 21.0.1-open | [Olivier Bourgain](https://github.com/obourgain) | uses Unsafe |
-|   | 00:05.705 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_gamlerhart.java)| 21.0.1-open | [Roman Stoffel](https://github.com/gamlerhart) |  |
 |   | 00:05.887 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_charlibot.java)| 21.0.1-graal | [Charlie Evans](https://github.com/charlibot) | uses Unsafe |
 |   | 00:05.960 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_vaidhy.java)| 21.0.1-graal | [Vaidhy Mayilrangam](https://github.com/vaidhy) | uses Unsafe |
-|   | 00:05.971 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_melgenek.java)| 21.0.2-open | [Yevhenii Melnyk](https://github.com/melgenek) |  |
 |   | 00:05.979 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_spullara.java)| 21.0.1-graal | [Sam Pullara](https://github.com/spullara) |  |
 |   | 00:06.166 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_isolgpus.java)| 21.0.1-open | [Jamie Stansfield](https://github.com/isolgpus) |  |
 |   | 00:06.257 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_flippingbits.java)| 21.0.1-graal | [Stefan Sprenger](https://github.com/flippingbits) | uses Unsafe |

From 3c454d0222f787bb09d63d5d8133509c44d65008 Mon Sep 17 00:00:00 2001
From: Van Phu DO <abeobk@gmail.com>
Date: Thu, 1 Feb 2024 04:03:20 +0900
Subject: [PATCH 228/268] final version for abeobk (#654)

* final version

* Correct stupid mistake

* min/max trick does not help that much, setting initial value does.

* cut the tail
---
 prepare_abeobk.sh                             |   2 +-
 .../onebrc/CalculateAverage_abeobk.java       | 456 ++++++++++--------
 2 files changed, 248 insertions(+), 210 deletions(-)

diff --git a/prepare_abeobk.sh b/prepare_abeobk.sh
index 08a8afdcb..380e2093c 100755
--- a/prepare_abeobk.sh
+++ b/prepare_abeobk.sh
@@ -20,6 +20,6 @@ sdk use java 21.0.2-graal 1>&2
 
 # ./mvnw clean verify removes target/ and will re-trigger native image creation.
 if [ ! -f target/CalculateAverage_abeobk_image ]; then
-    NATIVE_IMAGE_OPTS="--gc=epsilon -O3 -dsa -march=native -H:InlineAllBonus=10 -H:-GenLoopSafepoints -H:-ParseRuntimeOptions --enable-preview --initialize-at-build-time=dev.morling.onebrc.CalculateAverage_abeobk"
+    NATIVE_IMAGE_OPTS="--gc=epsilon -O3 -march=native -H:InlineAllBonus=10 -H:-GenLoopSafepoints --enable-preview --initialize-at-build-time=dev.morling.onebrc.CalculateAverage_abeobk"
     native-image $NATIVE_IMAGE_OPTS -cp target/average-1.0.0-SNAPSHOT.jar -o target/CalculateAverage_abeobk_image dev.morling.onebrc.CalculateAverage_abeobk
 fi
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java b/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java
index 2340bca79..88de5d2a9 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java
@@ -34,7 +34,6 @@
 import sun.misc.Unsafe;
 
 public class CalculateAverage_abeobk {
-    private static final boolean SHOW_ANALYSIS = false;
     private static final int CPU_CNT = Runtime.getRuntime().availableProcessors();
 
     private static final String FILE = "./measurements.txt";
@@ -42,7 +41,7 @@ public class CalculateAverage_abeobk {
     private static final long BUCKET_MASK = BUCKET_SIZE - 1;
     private static final int MAX_STR_LEN = 100;
     private static final int MAX_STATIONS = 10000;
-    private static final long CHUNK_SZ = 1 << 22; // 4MB chunk
+    private static final long CHUNK_SZ = 1 << 22;
     private static final Unsafe UNSAFE = initUnsafe();
     private static final long[] HASH_MASKS = new long[]{
             0x0L,
@@ -60,10 +59,6 @@ public class CalculateAverage_abeobk {
     private static int chunk_cnt;
     private static long start_addr, end_addr;
 
-    private static final void debug(String s, Object... args) {
-        System.out.println(String.format(s, args));
-    }
-
     private static Unsafe initUnsafe() {
         try {
             Field theUnsafe = Unsafe.class.getDeclaredField("theUnsafe");
@@ -75,12 +70,117 @@ private static Unsafe initUnsafe() {
         }
     }
 
-    // use native type, less conversion
-    static class Node {
+    /*
+     * MAIN FUNCTION
+     */
+    public static void main(String[] args) throws InterruptedException, IOException {
+        // thomaswue trick
+        if (args.length == 0 || !("--worker".equals(args[0]))) {
+            spawnWorker();
+            return;
+        }
+
+        var file = FileChannel.open(Path.of(FILE), StandardOpenOption.READ);
+        long file_size = file.size();
+        start_addr = file.map(MapMode.READ_ONLY, 0, file.size(), Arena.global()).address();
+        end_addr = start_addr + file_size;
+
+        // only use all cpus on large file
+        int cpu_cnt = file_size < 1e6 ? 1 : CPU_CNT;
+        chunk_cnt = (int) Math.ceilDiv(file_size, CHUNK_SZ);
+
+        // spawn workers
+        for (var w : IntStream.range(0, cpu_cnt).mapToObj(i -> new Worker(i)).toList()) {
+            w.join();
+        }
+
+        // collect results
+        TreeMap<String, Node> ms = new TreeMap<>();
+        for (var crr : mapref.get()) {
+            if (crr == null)
+                continue;
+            var prev = ms.putIfAbsent(crr.key(), crr);
+            if (prev != null)
+                prev.merge(crr);
+        }
+        // print result
+        System.out.println(ms);
+        System.out.close();
+    }
+
+    /*
+     * HELPER FUNCTIONS
+     */
+
+    // Get semicolon pos code
+    static final long getSemiCode(final long w) {
+        long x = w ^ 0x3b3b3b3b3b3b3b3bL; // xor with ;;;;;;;;
+        return (x - 0x0101010101010101L) & (~x & 0x8080808080808080L);
+    }
+
+    // Get new line pos code
+    static final long getLFCode(final long w) {
+        long x = w ^ 0x0A0A0A0A0A0A0A0AL; // xor with \n\n\n\n\n\n\n\n
+        return (x - 0x0101010101010101L) & (~x & 0x8080808080808080L);
+    }
+
+    // Get decimal point pos code
+    static final int getDotCode(final long w) {
+        return Long.numberOfTrailingZeros(~w & 0x10101000);
+    }
+
+    // Convert semicolon pos code to position
+    static final int getSemiPos(final long spc) {
+        return Long.numberOfTrailingZeros(spc) >>> 3;
+    }
+
+    // Find next line address
+    static final long nextLF(long addr) {
+        long word = UNSAFE.getLong(addr);
+        long lfpos_code = getLFCode(word);
+        while (lfpos_code == 0) {
+            addr += 8;
+            word = UNSAFE.getLong(addr);
+            lfpos_code = getLFCode(word);
+        }
+        return addr + (Long.numberOfTrailingZeros(lfpos_code) >>> 3) + 1;
+    }
+
+    // Parse number
+    // great idea from merykitty (Quan Anh Mai)
+    static final long num(long w, int d) {
+        int shift = 28 - d;
+        long signed = (~w << 59) >> 63;
+        long dsmask = ~(signed & 0xFF);
+        long digits = ((w & dsmask) << shift) & 0x0F000F0F00L;
+        long abs_val = ((digits * 0x640a0001) >>> 32) & 0x3FF;
+        return ((abs_val ^ signed) - signed);
+    }
+
+    // Hash mixer
+    static final long mix(long hash) {
+        long h = hash * 37;
+        return (h ^ (h >>> 29));
+    }
+
+    // Spawn worker (thomaswue trick)
+    private static void spawnWorker() throws IOException {
+        ProcessHandle.Info info = ProcessHandle.current().info();
+        ArrayList<String> workerCommand = new ArrayList<>();
+        info.command().ifPresent(workerCommand::add);
+        info.arguments().ifPresent(args -> workerCommand.addAll(Arrays.asList(args)));
+        workerCommand.add("--worker");
+        new ProcessBuilder()
+                .command(workerCommand)
+                .start()
+                .getInputStream()
+                .transferTo(System.out);
+    }
+
+    final static class Node {
         long addr;
         long hash;
         long word0;
-        long tail;
         long sum;
         long min, max;
         int keylen;
@@ -98,23 +198,36 @@ final String key() {
             return new String(sbuf, 0, (int) keylen, StandardCharsets.UTF_8);
         }
 
-        Node(long a, long t, int kl, long h) {
+        Node(long a, long h, int kl, long v) {
+            addr = a;
+            min = max = v;
+            keylen = kl;
+            hash = h;
+        }
+
+        Node(long a, long h, int kl) {
             addr = a;
-            tail = t;
+            hash = h;
             min = 999;
             max = -999;
             keylen = kl;
+        }
+
+        Node(long a, long w0, long h, int kl, long v) {
+            addr = a;
+            word0 = w0;
             hash = h;
+            min = max = v;
+            keylen = kl;
         }
 
-        Node(long a, long w0, long t, int kl, long h) {
+        Node(long a, long w0, long h, int kl) {
             addr = a;
             word0 = w0;
+            hash = h;
             min = 999;
             max = -999;
-            tail = t;
             keylen = kl;
-            hash = h;
         }
 
         final void add(long val) {
@@ -139,8 +252,8 @@ final void merge(Node other) {
             }
         }
 
-        final boolean contentEquals(long other_addr, long other_word0, long other_tail, long kl) {
-            if (word0 != other_word0 || tail != other_tail)
+        final boolean contentEquals(long other_addr, long other_word0, long other_hash, long kl) {
+            if (word0 != other_word0 || hash != other_hash)
                 return false;
             // this is faster than comparision if key is short
             long xsum = 0;
@@ -152,7 +265,7 @@ final boolean contentEquals(long other_addr, long other_word0, long other_tail,
         }
 
         final boolean contentEquals(Node other) {
-            if (tail != other.tail)
+            if (hash != other.hash)
                 return false;
             long n = keylen & 0xF8;
             for (long i = 0; i < n; i += 8) {
@@ -163,150 +276,13 @@ final boolean contentEquals(Node other) {
         }
     }
 
-    // idea from royvanrijn
-    static final long getSemiPosCode(final long word) {
-        long xor_semi = word ^ 0x3b3b3b3b3b3b3b3bL; // xor with ;;;;;;;;
-        return (xor_semi - 0x0101010101010101L) & (~xor_semi & 0x8080808080808080L);
-    }
-
-    static final long getLFCode(final long word) {
-        long xor_semi = word ^ 0x0A0A0A0A0A0A0A0AL; // xor with \n\n\n\n\n\n\n\n
-        return (xor_semi - 0x0101010101010101L) & (~xor_semi & 0x8080808080808080L);
-    }
-
-    static final long nextLine(long addr) {
-        long word = UNSAFE.getLong(addr);
-        long lfpos_code = getLFCode(word);
-        while (lfpos_code == 0) {
-            addr += 8;
-            word = UNSAFE.getLong(addr);
-            lfpos_code = getLFCode(word);
-        }
-        return addr + (Long.numberOfTrailingZeros(lfpos_code) >>> 3) + 1;
-    }
-
-    // speed/collision balance
-    static final long xxh32(long hash) {
-        long h = hash * 37;
-        return (h ^ (h >>> 29));
-    }
-
-    static final class ChunkParser {
-        long addr;
-        long end;
-        Node[] map;
-
-        ChunkParser(Node[] m, long a, long e) {
-            map = m;
-            addr = a;
-            end = e;
-        }
-
-        final boolean ok() {
-            return addr < end;
-        }
-
-        final long word() {
-            return UNSAFE.getLong(addr);
-        }
-
-        final long val() {
-            long num_word = UNSAFE.getLong(addr);
-            int dot_pos = Long.numberOfTrailingZeros(~num_word & 0x10101000);
-            addr += (dot_pos >>> 3) + 3;
-            // great idea from merykitty (Quan Anh Mai)
-            int shift = 28 - dot_pos;
-            long signed = (~num_word << 59) >> 63;
-            long dsmask = ~(signed & 0xFF);
-            long digits = ((num_word & dsmask) << shift) & 0x0F000F0F00L;
-            long abs_val = ((digits * 0x640a0001) >>> 32) & 0x3FF;
-            return ((abs_val ^ signed) - signed);
-        }
-
-        // optimize for contest
-        // save as much slow memory access as possible
-        // about 50% key < 8chars, 25% key bettween 8-10 chars
-        // keylength histogram (%) = [0, 0, 0, 0, 4, 10, 21, 15, 13, 11, 6, 6, 4, 2...
-        final Node key(long word0, long semipos_code) {
-            long row_addr = addr;
-            // about 50% chance key < 8 chars
-            if (semipos_code != 0) {
-                int semi_pos = Long.numberOfTrailingZeros(semipos_code) >>> 3;
-                addr += semi_pos + 1;
-                long tail = word0 & HASH_MASKS[semi_pos];
-                long hash = xxh32(tail);
-                int bucket = (int) (hash & BUCKET_MASK);
-                while (true) {
-                    Node node = map[bucket];
-                    if (node == null) {
-                        return (map[bucket] = new Node(row_addr, tail, semi_pos, hash));
-                    }
-                    if (node.tail == tail) {
-                        return node;
-                    }
-                    bucket++;
-                }
-            }
-
-            addr += 8;
-            long word = UNSAFE.getLong(addr);
-            semipos_code = getSemiPosCode(word);
-            // 43% chance
-            if (semipos_code != 0) {
-                int semi_pos = Long.numberOfTrailingZeros(semipos_code) >>> 3;
-                addr += semi_pos + 1;
-                long tail = (word & HASH_MASKS[semi_pos]);
-                long hash = xxh32(word0 ^ tail);
-                int bucket = (int) (hash & BUCKET_MASK);
-                while (true) {
-                    Node node = map[bucket];
-                    if (node == null) {
-                        return (map[bucket] = new Node(row_addr, word0, tail, semi_pos + 8, hash));
-                    }
-                    if (node.word0 == word0 && node.tail == tail) {
-                        return node;
-                    }
-                    bucket++;
-                }
-            }
-
-            // why not going for more? tested, slower
-            long hash = word0;
-            while (semipos_code == 0) {
-                hash ^= word;
-                addr += 8;
-                word = UNSAFE.getLong(addr);
-                semipos_code = getSemiPosCode(word);
-            }
-
-            int semi_pos = Long.numberOfTrailingZeros(semipos_code) >>> 3;
-            addr += semi_pos;
-            long keylen = addr - row_addr;
-            addr++;
-            long tail = (word & HASH_MASKS[semi_pos]);
-            hash = xxh32(hash ^ tail);
-            int bucket = (int) (hash & BUCKET_MASK);
-
-            while (true) {
-                Node node = map[bucket];
-                if (node == null) {
-                    return (map[bucket] = new Node(row_addr, word0, tail, (int) keylen, hash));
-                }
-                if (node.contentEquals(row_addr, word0, tail, keylen)) {
-                    return node;
-                }
-                bucket++;
-            }
-        }
-    }
-
     // Thread pool worker
     static final class Worker extends Thread {
         final int thread_id; // for debug use only
-        int cls = 0;
 
         Worker(int i) {
             thread_id = i;
+            this.setPriority(Thread.MAX_PRIORITY);
             this.start();
         }
 
@@ -322,15 +298,15 @@ public void run() {
 
                 // find start of line
                 if (id > 0) {
-                    addr = nextLine(addr);
+                    addr = nextLF(addr);
                 }
 
                 final int num_segs = 3;
                 long seglen = (end - addr) / num_segs;
 
                 long a0 = addr;
-                long a1 = nextLine(addr + 1 * seglen);
-                long a2 = nextLine(addr + 2 * seglen);
+                long a1 = nextLF(addr + 1 * seglen);
+                long a2 = nextLF(addr + 2 * seglen);
                 ChunkParser p0 = new ChunkParser(map, a0, a1);
                 ChunkParser p1 = new ChunkParser(map, a1, a2);
                 ChunkParser p2 = new ChunkParser(map, a2, end);
@@ -339,9 +315,9 @@ public void run() {
                     long w0 = p0.word();
                     long w1 = p1.word();
                     long w2 = p2.word();
-                    long sc0 = getSemiPosCode(w0);
-                    long sc1 = getSemiPosCode(w1);
-                    long sc2 = getSemiPosCode(w2);
+                    long sc0 = getSemiCode(w0);
+                    long sc1 = getSemiCode(w1);
+                    long sc2 = getSemiCode(w2);
                     Node n0 = p0.key(w0, sc0);
                     Node n1 = p1.key(w1, sc1);
                     Node n2 = p2.key(w2, sc2);
@@ -355,21 +331,21 @@ public void run() {
 
                 while (p0.ok()) {
                     long w = p0.word();
-                    long sc = getSemiPosCode(w);
+                    long sc = getSemiCode(w);
                     Node n = p0.key(w, sc);
                     long v = p0.val();
                     n.add(v);
                 }
                 while (p1.ok()) {
                     long w = p1.word();
-                    long sc = getSemiPosCode(w);
+                    long sc = getSemiCode(w);
                     Node n = p1.key(w, sc);
                     long v = p1.val();
                     n.add(v);
                 }
                 while (p2.ok()) {
                     long w = p2.word();
-                    long sc = getSemiPosCode(w);
+                    long sc = getSemiCode(w);
                     Node n = p2.key(w, sc);
                     long v = p2.val();
                     n.add(v);
@@ -396,65 +372,127 @@ public void run() {
                                 break;
                             }
                             bucket++;
-                            if (SHOW_ANALYSIS)
-                                cls++;
                         }
                     }
                 }
             }
-
-            if (SHOW_ANALYSIS) {
-                debug("Thread %d collision = %d", thread_id, cls);
-            }
         }
     }
 
-    // thomaswue trick
-    private static void spawnWorker() throws IOException {
-        ProcessHandle.Info info = ProcessHandle.current().info();
-        ArrayList<String> workerCommand = new ArrayList<>();
-        info.command().ifPresent(workerCommand::add);
-        info.arguments().ifPresent(args -> workerCommand.addAll(Arrays.asList(args)));
-        workerCommand.add("--worker");
-        new ProcessBuilder()
-                .command(workerCommand)
-                .start()
-                .getInputStream()
-                .transferTo(System.out);
-    }
+    static final class ChunkParser {
+        long addr;
+        long end;
+        Node[] map;
 
-    public static void main(String[] args) throws InterruptedException, IOException {
-        // thomaswue trick
-        if (args.length == 0 || !("--worker".equals(args[0]))) {
-            spawnWorker();
-            return;
+        ChunkParser(Node[] m, long a, long e) {
+            map = m;
+            addr = a;
+            end = e;
         }
 
-        var file = FileChannel.open(Path.of(FILE), StandardOpenOption.READ);
-        long file_size = file.size();
-        start_addr = file.map(MapMode.READ_ONLY, 0, file.size(), Arena.global()).address();
-        end_addr = start_addr + file_size;
+        final boolean ok() {
+            return addr < end;
+        }
 
-        // only use all cpus on large file
-        int cpu_cnt = file_size < 1e6 ? 1 : CPU_CNT;
-        chunk_cnt = (int) Math.ceilDiv(file_size, CHUNK_SZ);
+        final long word() {
+            return UNSAFE.getLong(addr);
+        }
 
-        // spawn workers
-        for (var w : IntStream.range(0, cpu_cnt).mapToObj(i -> new Worker(i)).toList()) {
-            w.join();
+        final void skip(int n) {
+            addr += n;
         }
 
-        // collect results
-        TreeMap<String, Node> ms = new TreeMap<>();
-        for (var crr : mapref.get()) {
-            if (crr == null)
-                continue;
-            var prev = ms.putIfAbsent(crr.key(), crr);
-            if (prev != null)
-                prev.merge(crr);
+        final void skip(long n) {
+            addr += n;
+        }
+
+        final long val0() {
+            long w = word();
+            int d = getDotCode(w);
+            return num(w, d);
+        }
+
+        final long val() {
+            long w = word();
+            int d = getDotCode(w);
+            skip((d >>> 3) + 3);
+            return num(w, d);
+        }
+
+        // optimize for contest
+        // save as much slow memory access as possible
+        // about 50% of keys are < 8 chars, 25% of keys are between 8-10 chars
+        // keylength histogram (%) = [0, 0, 0, 0, 4, 10, 21, 15, 13, 11, 6, 6, 4, 2...
+        final Node key(long word0, long semipos_code) {
+            long row_addr = addr;
+            // about 50% chance key < 8 chars
+            if (semipos_code != 0) {
+                int semi_pos = Long.numberOfTrailingZeros(semipos_code) >>> 3;
+                skip(semi_pos + 1);
+                long tail = word0 & HASH_MASKS[semi_pos];
+                long hash = mix(tail);
+                int bucket = (int) (hash & BUCKET_MASK);
+                while (true) {
+                    Node node = map[bucket];
+                    if (node == null) {
+                        return (map[bucket] = new Node(row_addr, hash, semi_pos));
+                    }
+                    if (node.hash == hash) {
+                        return node;
+                    }
+                    bucket++;
+                }
+            }
+
+            skip(8);
+            long word = UNSAFE.getLong(addr);
+            semipos_code = getSemiCode(word);
+            // 43% chance
+            if (semipos_code != 0) {
+                int semi_pos = Long.numberOfTrailingZeros(semipos_code) >>> 3;
+                skip(semi_pos + 1);
+                long tail = word0 ^ (word & HASH_MASKS[semi_pos]);
+                long hash = mix(tail);
+                int bucket = (int) (hash & BUCKET_MASK);
+                while (true) {
+                    Node node = map[bucket];
+                    if (node == null) {
+                        return (map[bucket] = new Node(row_addr, word0, hash, semi_pos + 8));
+                    }
+                    if (node.word0 == word0 && node.hash == hash) {
+                        return node;
+                    }
+                    bucket++;
+                }
+            }
+
+            // why not special-case more key lengths? tested, it was slower
+            long hash = word0;
+            while (semipos_code == 0) {
+                hash ^= word;
+                skip(8);
+                word = UNSAFE.getLong(addr);
+                semipos_code = getSemiCode(word);
+            }
+
+            int semi_pos = Long.numberOfTrailingZeros(semipos_code) >>> 3;
+            skip(semi_pos);
+            long keylen = addr - row_addr;
+            skip(1);
+            long tail = hash ^ (word & HASH_MASKS[semi_pos]);
+            hash = mix(tail);
+            int bucket = (int) (hash & BUCKET_MASK);
+
+            while (true) {
+                Node node = map[bucket];
+                if (node == null) {
+                    return (map[bucket] = new Node(row_addr, word0, hash, (int) keylen));
+                }
+                if (node.contentEquals(row_addr, word0, hash, keylen)) {
+                    return node;
+                }
+                bucket++;
+            }
         }
-        // print result
-        System.out.println(ms);
-        System.out.close();
     }
 }
\ No newline at end of file

From f1fd7b7fe502663a24c277b0e8332f703e1460c9 Mon Sep 17 00:00:00 2001
From: Dr Ian Preston <157221403+ianopolousfast@users.noreply.github.com>
Date: Wed, 31 Jan 2024 19:05:47 +0000
Subject: [PATCH 229/268] actually use jvm args! (#688)

Co-authored-by: Ian Preston <ianopolous@protonmail.com>
---
 calculate_average_ianopolousfast.sh | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/calculate_average_ianopolousfast.sh b/calculate_average_ianopolousfast.sh
index 4ed77c70a..56d5a856e 100755
--- a/calculate_average_ianopolousfast.sh
+++ b/calculate_average_ianopolousfast.sh
@@ -15,7 +15,6 @@
 #  limitations under the License.
 #
 
-JAVA_OPTS="--enable-preview --add-modules=jdk.incubator.vector"
-#-Djdk.incubator.vector.VECTOR_ACCESS_OOB_CHECK=0 -XX:-UseTransparentHugePages"
+JAVA_OPTS="--enable-preview --add-modules=jdk.incubator.vector -Djdk.incubator.vector.VECTOR_ACCESS_OOB_CHECK=0 -XX:-UseTransparentHugePages"
 
 java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_ianopolousfast

From d496adb04953d8cbbf12eb5a535ccb8069d20ae1 Mon Sep 17 00:00:00 2001
From: Cedric Boes <16071970+cb0s@users.noreply.github.com>
Date: Wed, 31 Jan 2024 20:13:40 +0100
Subject: [PATCH 230/268] Adding solution for cb0s (#575)

* feat: add solution for cb0s

* Update prepare_cb0s.sh

* Update calculate_average_cb0s.sh

* Update prepare_cb0s.sh

---------

Co-authored-by: Gunnar Morling <gunnar.morling@googlemail.com>
---
 calculate_average_cb0s.sh                     |  20 ++
 prepare_cb0s.sh                               |  20 ++
 .../morling/onebrc/CalculateAverage_cb0s.java | 338 ++++++++++++++++++
 3 files changed, 378 insertions(+)
 create mode 100755 calculate_average_cb0s.sh
 create mode 100755 prepare_cb0s.sh
 create mode 100644 src/main/java/dev/morling/onebrc/CalculateAverage_cb0s.java

diff --git a/calculate_average_cb0s.sh b/calculate_average_cb0s.sh
new file mode 100755
index 000000000..af5a93ab4
--- /dev/null
+++ b/calculate_average_cb0s.sh
@@ -0,0 +1,20 @@
+#!/bin/sh
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+# Arguments
+JAVA_OPTS="--enable-preview -XX:MaxGCPauseMillis=1 -XX:-AlwaysPreTouch -XX:+UseParallelGC -XX:+TieredCompilation"
+java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_cb0s
diff --git a/prepare_cb0s.sh b/prepare_cb0s.sh
new file mode 100755
index 000000000..4cda7b411
--- /dev/null
+++ b/prepare_cb0s.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+# Uncomment below to use sdk
+# source "$HOME/.sdkman/bin/sdkman-init.sh"
+# sdk use java 21.0.1-graal 1>&2
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_cb0s.java b/src/main/java/dev/morling/onebrc/CalculateAverage_cb0s.java
new file mode 100644
index 000000000..1e9c7058f
--- /dev/null
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_cb0s.java
@@ -0,0 +1,338 @@
+/*
+ *  Copyright 2023 The original authors
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package dev.morling.onebrc;
+
+import java.io.BufferedInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.util.*;
+import java.util.stream.Collectors;
+import java.util.stream.IntStream;
+
+public class CalculateAverage_cb0s {
+
+    private static final String FILE = "./measurements.txt";
+    private static final int INPUT_BUFFER_SIZE = 1 << 16; // yields the best performance on my system...
+
+    public static void main(String[] args) throws IOException, InterruptedException {
+        run();
+        // benchmark(); // swap with run() above to time three consecutive runs
+    }
+
+    private static void benchmark() throws IOException {
+        var startTime = System.currentTimeMillis();
+        for (int count = 0; count < 3; ++count) { // three back-to-back runs; the wall-clock total is divided by 3 below
+            run();
+        }
+        var stopTime = System.currentTimeMillis();
+
+        System.out.println(STR."Running 3 times took: \{stopTime - startTime}ms (1 run: \{(stopTime - startTime) / 3}ms)");
+    }
+
+    private static void run() throws IOException { // splits the file into chunks, merges the partial results, prints them
+        var fileSize = getFileSize();
+
+        // cap the worker count for small files; the division is done in long so files larger than 2 GiB cannot overflow
+        var processors = Runtime.getRuntime().availableProcessors();
+        processors = (int) Math.max(1, Math.min(processors, fileSize / 106));
+        while (fileSize / processors < INPUT_BUFFER_SIZE && processors > 1)
+            --processors;
+
+        var chunkSize = fileSize / processors;
+
+        System.out.write('{');
+
+        // Optional.get() without an isPresent() check: the range is never empty because processors >= 1
+        var mergedResults = IntStream.range(0, processors)
+                .parallel()
+                .mapToObj(i -> processChunk(i, chunkSize))
+                .reduce(TempResultStorage::merge).get();
+
+        var endResult = mergedResults.aggregatedResultsPreOrdered.stream()
+                .map(Station::toString)
+                .collect(Collectors.joining(", "));
+
+        System.out.write(endResult.getBytes(StandardCharsets.UTF_8));
+
+        System.out.write(new byte[]{ '}', '\n' });
+    }
+
+    private static class MeasurementAggregator { // running min/max/count/sum of one station, seeded with its first value
+        public MeasurementAggregator(int initialValue) {
+            min = initialValue;
+            max = initialValue;
+            count = 1;
+            sum = initialValue;
+        }
+
+        public int min, max, count;
+        // sum must be a long because its absolute value can exceed 2^31
+        public long sum;
+    }
+
+    private record Station(
+            MeasurementAggregator results,
+            RawName rawName
+    ) implements Comparable<Station> {
+    // Equality, hashing and ordering are based on rawName only; results is the mutable running aggregate.
+    @Override
+    public boolean equals(Object otherObject) {
+        if (otherObject instanceof Station otherStation) {
+            return otherStation.rawName.equals(rawName);
+        }
+        return false;
+    }
+
+    @Override
+    public int compareTo(Station otherStation) {
+        return rawName.compareTo(otherStation.rawName);
+    }
+
+    @Override
+    public String toString() { // mean uses double division: float would lose precision once sum exceeds 2^24
+        return STR."\{rawName}=\{results.min/10.0}/\{Math.round(results.sum / (double) results.count) / 10.0}/\{results.max/10.0}";
+    }
+
+    @Override
+    public int hashCode() {
+        return rawName.hashCode();
+    }
+
+    }
+
+    private record RawName(
+            byte[] rawName
+    ) implements Comparable<RawName> {
+
+    @Override
+    public boolean equals(Object otherObject) {
+        RawName otherRawName = (RawName) otherObject; // unchecked cast: throws for null or non-RawName arguments
+        return Arrays.equals(otherRawName.rawName, this.rawName);
+
+        /*
+         * The instanceof-checked variant below would be safer, but the extra check is a small bottleneck:
+         * if (otherObject instanceof RawName otherRawName) {
+         * return Arrays.equals(otherRawName.rawName, this.rawName);
+         * }
+         * return false;
+         */
+    }
+
+    @Override
+    public int hashCode() {
+        return Arrays.hashCode(rawName);
+    }
+
+    @Override
+    public String toString() {
+        return new String(rawName, 0, rawName.length, StandardCharsets.UTF_8);
+    }
+
+    @Override
+    public int compareTo(RawName otherRawName) {
+        int result = 0;
+        // Math.min is SLIGHTLY less efficient, but we don't care at this point
+        var lowerIndex = Math.min(rawName.length, otherRawName.rawName.length);
+        for (int i = 0; i < lowerIndex && result == 0; ++i) { // unsigned byte-wise comparison, stops at the first difference
+            result = Byte.compareUnsigned(rawName[i], otherRawName.rawName[i]);
+        }
+
+        return result == 0 ? rawName.length - otherRawName.rawName.length : result; // equal prefix -> shorter name first
+    }
+}
+
+private static class TempResultStorage {
+    public void insertMeasurement(byte[] dataRow, int from, int to) { // dataRow[from, to) is one "name;value" row
+        // 1st: find the ';' separator and parse the measurement that follows it
+        var sepIndex = from + 1;
+        while (dataRow[sepIndex] != ';')
+            ++sepIndex;
+
+        var parsedMeasurement = parseMeasurement(dataRow, sepIndex + 1, to);
+
+        // 2nd: if the station occurs for the first time, register it in all three structures
+        var rawName = new RawName(Arrays.copyOfRange(dataRow, from, sepIndex));
+        var tempIndex = indexCache.get(rawName);
+        if (tempIndex == null) {
+            var aggregator = new MeasurementAggregator(parsedMeasurement);
+            var tempStation = new Station(aggregator, rawName);
+            aggregatedResults.add(tempStation);
+            indexCache.put(rawName, aggregatedResults.size() - 1);
+            aggregatedResultsPreOrdered.add(tempStation);
+            return;
+        }
+
+        // otherwise: update the aggregate of the already known station
+        var tempResults = aggregatedResults.get(tempIndex).results;
+        // TODO: compare to: add simd vector storage and process once every 8 iterations
+
+        tempResults.sum += parsedMeasurement;
+        tempResults.count++;
+
+        if (tempResults.max < parsedMeasurement) {
+            tempResults.max = parsedMeasurement;
+        }
+        else if (tempResults.min > parsedMeasurement) { // else-if suffices: min <= max, so a value cannot be both
+            tempResults.min = parsedMeasurement;
+        }
+    }
+
+    public TempResultStorage() {
+        aggregatedResults = new ArrayList<>(INITIAL_RESULT_SIZE);
+        indexCache = new HashMap<>(INITIAL_RESULT_SIZE);
+        aggregatedResultsPreOrdered = new TreeSet<>(); // natural ordering, i.e. Station.compareTo
+    }
+
+    public static TempResultStorage merge(TempResultStorage storage0, TempResultStorage storage1) {
+        // either side may be null (nothing to merge): return the other one unchanged
+        if (storage0 == null || storage1 == null) {
+            return storage0 == null ? storage1 : storage0;
+        }
+
+        // TODO: Implementation with SIMD commands
+        for (var station1 : storage1.aggregatedResults) {
+            // fold station1 into storage0 in place; storage0 is also the object that gets returned
+            var key = storage0.indexCache.get(station1.rawName);
+            if (key == null) {
+                storage0.aggregatedResults.add(station1);
+                storage0.indexCache.put(station1.rawName, storage0.aggregatedResults.size() - 1);
+                storage0.aggregatedResultsPreOrdered.add(station1);
+                continue;
+            }
+
+            var station0 = storage0.aggregatedResults.get(key);
+            station0.results.count += station1.results.count;
+            station0.results.sum += station1.results.sum;
+
+            if (station0.results.min > station1.results.min) {
+                station0.results.min = station1.results.min;
+            }
+
+            if (station1.results.max > station0.results.max) {
+                station0.results.max = station1.results.max;
+            }
+        }
+
+        return storage0;
+    }
+
+    // the closer it is to the actual value the better -> for 10_000 stations 10_000 is obviously better
+    private static final int INITIAL_RESULT_SIZE = 420;
+
+    // we use a custom name mapping for faster access to aggregatedResults and easier sorting
+    private final List<Station> aggregatedResults; // stations in first-seen order; positions are stored in indexCache
+    private final TreeSet<Station> aggregatedResultsPreOrdered; // the same stations, kept sorted by raw name for output
+    private final HashMap<RawName, Integer> indexCache; // raw name -> index into aggregatedResults
+
+    /**
+     * Parses the number contained in a byte[] (ASCII digits) into a fixed point format, i.e. tenths.
+     * The number can be between [-99.9, 99.9] (i.e. has one or two integer digits, one fractional digit
+     * and might contain a sign) and represents a temperature measurement.
+     * Note that no checking takes place. Incorrect formats yield unexpected results.
+     *
+     * @param dataRow byte array actually containing the number
+     * @param from    the start index of the number inside the array (included)
+     * @param to      the end index of the number (not included, i.e. the byte after the number or the length)
+     * @return fixed point (int) representation of the contained measurement
+     */
+    private int parseMeasurement(byte[] dataRow, int from, int to) {
+        // '-' (0x2D) has bit 4 cleared, the digits '0'..'9' (0x30..0x39) have it set
+        int sign = -1 + 2 * ((dataRow[from] >> 4) & 1);
+        int length = to - from;
+
+        int floatingPoint = dataRow[to - 1] - 48;
+        int lastIntDigit = dataRow[to - 3] - 48;
+
+        // a tens digit only exists for "dd.d" (4 bytes, unsigned) and "-dd.d" (5 bytes);
+        // in "-d.d" (4 bytes) the byte at to - 4 is the sign itself
+        int firstIntDigit = 0;
+        if (length > 4 || (length == 4 && sign != -1)) {
+            firstIntDigit = dataRow[to - 4] - 48;
+        }
+
+        // recombine the digits as tenths and apply the sign
+        return (firstIntDigit * 100 + lastIntDigit * 10 + floatingPoint) * sign;
+    }
+
+    }
+
+    private static TempResultStorage processChunk(int i, long chunkSize) { // parses chunk i of FILE into a fresh storage
+        var storage = new TempResultStorage();
+        var readBuffer = new byte[INPUT_BUFFER_SIZE];
+
+        try (var inputStream = new BufferedInputStream(new FileInputStream(FILE), INPUT_BUFFER_SIZE)) {
+            var readBytes = 0L; // bytes counted for this chunk so far; starts at -1 for i != 0 (see below)
+            var readBytesDelta = 0;
+
+            // preparation: skip to one byte before the nominal chunk start, then consume up to and including the next '\n'
+            if (i != 0) {
+                --readBytes;
+                inputStream.skipNBytes(i * chunkSize - 1); // unlike skip(), never silently skips fewer bytes
+                int c;
+                while ((c = inputStream.read()) != '\n' && c != -1)
+                    ++readBytes;
+            }
+
+            // actual parsing
+            // worst case: only \n is missing for a whole line
+            var carryOver = new byte[107];
+            var carryOverSize = 0;
+
+            while (readBytes < chunkSize && inputStream.available() > 0) {
+                readBytes += (readBytesDelta = inputStream.read(readBuffer, 0, readBuffer.length));
+                int from = 0, to = 0;
+
+                if (carryOverSize != 0) {
+                    while (readBuffer[to] != '\n')
+                        ++to;
+                    System.arraycopy(readBuffer, from, carryOver, carryOverSize, to - from + 1);
+
+                    storage.insertMeasurement(carryOver, 0, carryOverSize + to - from);
+                    from = ++to;
+                }
+
+                // Actually looking 5 ahead instead of 1 at each new line
+                // Minimal line consists of: [name-byte];[first_digit].[last_digit]\n
+                while (to <= readBytesDelta && (readBytes - readBytesDelta + to) < chunkSize) {
+                    to += 5;
+
+                    while (to < readBytesDelta && readBuffer[to] != '\n')
+                        ++to;
+
+                    if (to >= readBytesDelta) {
+                        System.arraycopy(readBuffer, from, carryOver, 0, readBytesDelta - from);
+                        carryOverSize = readBytesDelta - from;
+                        break;
+                    }
+
+                    storage.insertMeasurement(readBuffer, from, to);
+                    from = ++to;
+                }
+            }
+        }
+        catch (IOException e) {
+            throw new java.io.UncheckedIOException(e); // surface the failure instead of dropping this chunk
+        }
+
+        return storage;
+    }
+
+    private static long getFileSize() {
+        return new File(CalculateAverage_cb0s.FILE).length(); // File.length() is 0 when the file does not exist
+    }
+}

From e81326b83d363eaa832cc6fd63b7648bda7f25b1 Mon Sep 17 00:00:00 2001
From: Artsiom Korzun <72259616+artsiomkorzun@users.noreply.github.com>
Date: Wed, 31 Jan 2024 20:18:13 +0100
Subject: [PATCH 231/268] trying TuneInlinerExploration=1 (#662)

---
 prepare_artsiomkorzun.sh                      |   2 +-
 .../CalculateAverage_artsiomkorzun.java       | 109 ++++++++++--------
 2 files changed, 61 insertions(+), 50 deletions(-)

diff --git a/prepare_artsiomkorzun.sh b/prepare_artsiomkorzun.sh
index d1263addb..7cbcdfc8a 100755
--- a/prepare_artsiomkorzun.sh
+++ b/prepare_artsiomkorzun.sh
@@ -19,6 +19,6 @@ source "$HOME/.sdkman/bin/sdkman-init.sh"
 sdk use java 21.0.2-graal 1>&2
 
 if [ ! -f target/CalculateAverage_artsiomkorzun_image ]; then
-    NATIVE_IMAGE_OPTS="--gc=epsilon -O3 -march=native -R:MaxHeapSize=64m -H:-GenLoopSafepoints --enable-preview --initialize-at-build-time=dev.morling.onebrc.CalculateAverage_artsiomkorzun"
+    NATIVE_IMAGE_OPTS="--gc=epsilon -O3 -march=native -H:TuneInlinerExploration=1 -R:MaxHeapSize=64m -H:-GenLoopSafepoints --enable-preview --initialize-at-build-time=dev.morling.onebrc.CalculateAverage_artsiomkorzun"
     native-image $NATIVE_IMAGE_OPTS -cp target/average-1.0.0-SNAPSHOT.jar -o target/CalculateAverage_artsiomkorzun_image dev.morling.onebrc.CalculateAverage_artsiomkorzun
 fi
\ No newline at end of file
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java b/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java
index c0cc8f99e..d899c3d72 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java
@@ -26,6 +26,7 @@
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Map;
+import java.util.Optional;
 import java.util.TreeMap;
 import java.util.concurrent.atomic.AtomicInteger;
 import java.util.concurrent.atomic.AtomicReference;
@@ -81,8 +82,17 @@ private static boolean isSpawn(String[] args) {
     private static void spawn() throws Exception {
         ProcessHandle.Info info = ProcessHandle.current().info();
         ArrayList<String> commands = new ArrayList<>();
-        info.command().ifPresent(commands::add);
-        info.arguments().ifPresent(args -> commands.addAll(Arrays.asList(args)));
+        Optional<String> command = info.command();
+        Optional<String[]> arguments = info.arguments();
+
+        if (command.isPresent()) {
+            commands.add(command.get());
+        }
+
+        if (arguments.isPresent()) {
+            commands.addAll(Arrays.asList(arguments.get()));
+        }
+
         commands.add("--worker");
 
         new ProcessBuilder()
@@ -113,7 +123,7 @@ private static void execute() throws Exception {
             aggregators[i].join();
         }
 
-        Map<String, Aggregate> aggregates = result.get().aggregate();
+        Map<String, Aggregate> aggregates = result.get().build();
         System.out.println(text(aggregates));
         System.out.close();
     }
@@ -163,14 +173,14 @@ private static double round(double v) {
         return Math.round(v) / 10.0;
     }
 
-    private record Aggregate(long min, long max, long sum, long cnt) {
+    private record Aggregate(int min, int max, long sum, int cnt) {
     }
 
     private static class Aggregates {
 
         private static final long ENTRIES = 64 * 1024;
-        private static final long SIZE = 256 * ENTRIES;
-        private static final long MASK = (ENTRIES - 1) << 8;
+        private static final long SIZE = 128 * ENTRIES;
+        private static final long MASK = (ENTRIES - 1) << 7;
 
         private final long pointer;
 
@@ -182,25 +192,25 @@ public Aggregates() {
 
         public long find(long word, long hash) {
             long address = pointer + offset(hash);
-            long w = word(address + 48);
+            long w = word(address + 24);
             return (w == word) ? address : 0;
         }
 
         public long find(long word1, long word2, long hash) {
             long address = pointer + offset(hash);
-            long w1 = word(address + 48);
-            long w2 = word(address + 56);
+            long w1 = word(address + 24);
+            long w2 = word(address + 32);
             return (word1 == w1) && (word2 == w2) ? address : 0;
         }
 
         public long put(long reference, long word, long length, long hash) {
             for (long offset = offset(hash);; offset = next(offset)) {
                 long address = pointer + offset;
-                if (equal(reference, word, address + 48, length)) {
+                if (equal(reference, word, address + 24, length)) {
                     return address;
                 }
 
-                long len = UNSAFE.getLong(address);
+                int len = UNSAFE.getInt(address);
                 if (len == 0) {
                     alloc(reference, length, hash, address);
                     return address;
@@ -209,76 +219,76 @@ public long put(long reference, long word, long length, long hash) {
         }
 
         public static void update(long address, long value) {
-            long sum = UNSAFE.getLong(address + 16) + value;
-            long cnt = UNSAFE.getLong(address + 24) + 1;
-            long min = UNSAFE.getLong(address + 32);
-            long max = UNSAFE.getLong(address + 40);
+            long sum = UNSAFE.getLong(address + 8) + value;
+            int cnt = UNSAFE.getInt(address + 16) + 1;
+            short min = UNSAFE.getShort(address + 20);
+            short max = UNSAFE.getShort(address + 22);
 
-            UNSAFE.putLong(address + 16, sum);
-            UNSAFE.putLong(address + 24, cnt);
+            UNSAFE.putLong(address + 8, sum);
+            UNSAFE.putInt(address + 16, cnt);
 
             if (value < min) {
-                UNSAFE.putLong(address + 32, value);
+                UNSAFE.putShort(address + 20, (short) value);
             }
 
             if (value > max) {
-                UNSAFE.putLong(address + 40, value);
+                UNSAFE.putShort(address + 22, (short) value);
             }
         }
 
         public void merge(Aggregates rights) {
-            for (int rightOffset = 0; rightOffset < SIZE; rightOffset += 256) {
+            for (long rightOffset = 0; rightOffset < SIZE; rightOffset += 128) {
                 long rightAddress = rights.pointer + rightOffset;
-                long length = UNSAFE.getLong(rightAddress);
+                int length = UNSAFE.getInt(rightAddress);
 
                 if (length == 0) {
                     continue;
                 }
 
-                long hash = UNSAFE.getLong(rightAddress + 8);
+                int hash = UNSAFE.getInt(rightAddress + 4);
 
                 for (long offset = offset(hash);; offset = next(offset)) {
                     long address = pointer + offset;
 
-                    if (equal(address + 48, rightAddress + 48, length)) {
-                        long sum = UNSAFE.getLong(address + 16) + UNSAFE.getLong(rightAddress + 16);
-                        long cnt = UNSAFE.getLong(address + 24) + UNSAFE.getLong(rightAddress + 24);
-                        long min = Math.min(UNSAFE.getLong(address + 32), UNSAFE.getLong(rightAddress + 32));
-                        long max = Math.max(UNSAFE.getLong(address + 40), UNSAFE.getLong(rightAddress + 40));
+                    if (equal(address + 24, rightAddress + 24, length)) {
+                        long sum = UNSAFE.getLong(address + 8) + UNSAFE.getLong(rightAddress + 8);
+                        int cnt = UNSAFE.getInt(address + 16) + UNSAFE.getInt(rightAddress + 16);
+                        short min = (short) Math.min(UNSAFE.getShort(address + 20), UNSAFE.getShort(rightAddress + 20));
+                        short max = (short) Math.max(UNSAFE.getShort(address + 22), UNSAFE.getShort(rightAddress + 22));
 
-                        UNSAFE.putLong(address + 16, sum);
-                        UNSAFE.putLong(address + 24, cnt);
-                        UNSAFE.putLong(address + 32, min);
-                        UNSAFE.putLong(address + 40, max);
+                        UNSAFE.putLong(address + 8, sum);
+                        UNSAFE.putInt(address + 16, cnt);
+                        UNSAFE.putShort(address + 20, min);
+                        UNSAFE.putShort(address + 22, max);
                         break;
                     }
 
-                    long len = UNSAFE.getLong(address);
+                    int len = UNSAFE.getInt(address);
 
                     if (len == 0) {
-                        UNSAFE.copyMemory(rightAddress, address, length + 48);
+                        UNSAFE.copyMemory(rightAddress, address, length + 24);
                         break;
                     }
                 }
             }
         }
 
-        public Map<String, Aggregate> aggregate() {
+        public Map<String, Aggregate> build() {
             TreeMap<String, Aggregate> set = new TreeMap<>();
 
-            for (long offset = 0; offset < SIZE; offset += 256) {
+            for (long offset = 0; offset < SIZE; offset += 128) {
                 long address = pointer + offset;
-                long length = UNSAFE.getLong(address);
+                int length = UNSAFE.getInt(address);
 
                 if (length != 0) {
-                    byte[] array = new byte[(int) length - 1];
-                    UNSAFE.copyMemory(null, address + 48, array, Unsafe.ARRAY_BYTE_BASE_OFFSET, array.length);
+                    byte[] array = new byte[length - 1];
+                    UNSAFE.copyMemory(null, address + 24, array, Unsafe.ARRAY_BYTE_BASE_OFFSET, array.length);
                     String key = new String(array);
 
-                    long sum = UNSAFE.getLong(address + 16);
-                    long cnt = UNSAFE.getLong(address + 24);
-                    long min = UNSAFE.getLong(address + 32);
-                    long max = UNSAFE.getLong(address + 40);
+                    long sum = UNSAFE.getLong(address + 8);
+                    int cnt = UNSAFE.getInt(address + 16);
+                    short min = UNSAFE.getShort(address + 20);
+                    short max = UNSAFE.getShort(address + 22);
 
                     Aggregate aggregate = new Aggregate(min, max, sum, cnt);
                     set.put(key, aggregate);
@@ -289,11 +299,11 @@ public Map<String, Aggregate> aggregate() {
         }
 
         private static void alloc(long reference, long length, long hash, long address) {
-            UNSAFE.putLong(address, length);
-            UNSAFE.putLong(address + 8, hash);
-            UNSAFE.putLong(address + 32, Long.MAX_VALUE);
-            UNSAFE.putLong(address + 40, Long.MIN_VALUE);
-            UNSAFE.copyMemory(reference, address + 48, length);
+            UNSAFE.putInt(address, (int) length);
+            UNSAFE.putInt(address + 4, (int) hash);
+            UNSAFE.putShort(address + 20, Short.MAX_VALUE);
+            UNSAFE.putShort(address + 22, Short.MIN_VALUE);
+            UNSAFE.copyMemory(reference, address + 24, length);
         }
 
         private static long offset(long hash) {
@@ -301,7 +311,7 @@ private static long offset(long hash) {
         }
 
         private static long next(long prev) {
-            return (prev + 256) & (SIZE - 1);
+            return (prev + 128) & (SIZE - 1);
         }
 
         private static boolean equal(long leftAddress, long leftWord, long rightAddress, long length) {
@@ -510,8 +520,9 @@ private static long find(Aggregates aggregates, Chunk chunk, long word, long sep
         private static long value(Chunk chunk) {
             long num = word(chunk.position);
             long dot = dot(num);
+            long value = value(num, dot);
             chunk.position += (dot >> 3) + 3;
-            return value(num, dot);
+            return value;
         }
 
         private static long separator(long word) {

From 47046f327d482356338570228003af7eef57e548 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Aleksey=20Shipil=C3=ABv?= <aleksey@shipilev.net>
Date: Wed, 31 Jan 2024 20:30:41 +0100
Subject: [PATCH 232/268] Shipilev: improve comments (#692)

---
 .../onebrc/CalculateAverage_shipilev.java     | 146 ++++++++++++------
 1 file changed, 95 insertions(+), 51 deletions(-)

diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_shipilev.java b/src/main/java/dev/morling/onebrc/CalculateAverage_shipilev.java
index 1150f4296..f8b78a050 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_shipilev.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_shipilev.java
@@ -31,19 +31,32 @@
 
 public class CalculateAverage_shipilev {
 
-    // This might not be the fastest implementation one can do.
-    // When working on this implementation, I set the bar as follows.
+    // Detour: This implementation tries to balance the speed and readability.
     //
-    // This implementation uses vanilla and standard Java as much as possible,
-    // without relying on Unsafe tricks and preview features. If and when
-    // those are used, they should be guarded by a feature flag. This would
-    // allow running vanilla implementation if anything goes off the rails.
+    // While the original contest suggests we pull off every trick in the
+    // book to get the peak performance, here we set a more pragmatic goal:
+    // how fast we can get without going too far into hacks. Or, putting it
+    // in another way, what would be the reasonably fast implementation that
+    // would *also* pass a code review in a reasonable project, would be usable
+    // in production without waking people up in the middle of the night, and
+    // would work through JDK updates, upgrades, and migrations.
+    //
+    // To that end, this implementation uses vanilla and standard Java as much
+    // as possible, without relying on Unsafe tricks and preview features.
+    // When any non-standard things are used, they are guarded by a feature flag,
+    // which allows cleanly turning them off when anything goes off the rails.
+    //
+    // For performance reasons, the implementation takes more care to be reliably
+    // parallel to survive I/O stalls and scheduling oddities. This would not
+    // show up in laboratory conditions, but it is a necessary thing for reliable
+    // code in production. It also tries not to miss simple optimizations without
+    // going too far into the woods.
+    //
+    // Note that some of the magic to run this workload fast in evaluation
+    // conditions is done separately in the invocation script. Most of that
+    // is only needed for the short-running scenarios. In real life, this code
+    // would likely run well without any of that.
     //
-    // This implementation also covers the realistic scenario: the I/O is
-    // actually slow and jittery. To that end, making sure we can feed
-    // the parsing code under slow I/O is as important as getting the
-    // parsing fast. Current evaluation env keeps the input data on RAM disk,
-    // which hides this important part.
 
     // ========================= Tunables =========================
 
@@ -57,17 +70,19 @@ public class CalculateAverage_shipilev {
 
     // Fixed size of the measurements map. Must be the power of two. Should
     // be large enough to accomodate all the station names. Rules say there are
-    // 10K station names max, so anything >> 16K works well.
+    // 10K station names max, so anything more than 16K works well.
     private static final int MAP_SIZE = 1 << 15;
 
     // The largest mmap-ed chunk. This can be be Integer.MAX_VALUE, but
     // it is normally tuned down to seed the workers with smaller mmap regions
-    // more efficiently.
+    // more efficiently. This also allows us to incrementally unmap chunks as we
+    // finish working on them.
     private static final int MMAP_CHUNK_SIZE = Integer.MAX_VALUE / 32;
 
     // The largest slice as unit of work, processed serially by a worker.
     // Set it too low and there would be more tasks and less batching, but
     // more parallelism. Set it too high, and the reverse would be true.
+    // Something around a large page would likely hit the right balance.
     private static final int UNIT_SLICE_SIZE = 4 * 1024 * 1024;
 
     // Employ direct unmapping techniques to alleviate the cost of system
@@ -80,6 +95,7 @@ public class CalculateAverage_shipilev {
     // ========================= Storage =========================
 
     // Thread-local measurement maps, each thread gets one.
+    // This allows workers to work nearly unimpeded without synchronization.
     // Even though crude, avoid lambdas here to alleviate startup costs.
     private static final ThreadLocal<MeasurementsMap> MAPS = ThreadLocal.withInitial(new Supplier<>() {
         @Override
@@ -90,20 +106,21 @@ public MeasurementsMap get() {
         }
     });
 
-    // After worker threads finish, the data is available here. One just needs
-    // to merge it a little.
+    // After worker threads finish, the data is available here. The reporting
+    // code would pull the maps from here, once all workers finish.
     private static final ConcurrentLinkedQueue<MeasurementsMap> ALL_MAPS = new ConcurrentLinkedQueue<>();
 
     // Releasable mmaped buffers that workers are done with. These can be un-mapped
-    // in background. Part of the protocol to shutdown the background activity is to
-    // issue the poison pill.
+    // in background. Main thread would wait on this queue, until it gets the poison
+    // pill from the root task.
     private static final LinkedBlockingQueue<ByteBuffer> RELEASABLE_BUFFERS = new LinkedBlockingQueue<>();
     private static final ByteBuffer RELEASABLE_BUFFER_POISON_PILL = ByteBuffer.allocate(1);
 
     // ========================= MEATY GRITTY PARTS: PARSE AND AGGREGATE =========================
 
     public static final class Bucket {
-        // Raw station name, its hash, and prefixes.
+        // Raw station name, encoded as two prefixes and the name tail,
+        // its total length, and hash.
         public final byte[] nameTail;
         public final int len;
         public final int hash;
@@ -118,7 +135,8 @@ public static final class Bucket {
         public Bucket(ByteBuffer slice, int begin, int end, int hash, int temp) {
             len = end - begin;
 
-            // Also pick up any prefixes to simplify future matches.
+            // Decode the station name. It is handy to have a few prefixes
+            // available to simplify matches later.
             int tailStart = 0;
             if (len >= 8) {
                 prefix1 = slice.getInt(begin + 0);
@@ -135,12 +153,15 @@ else if (len >= 4) {
                 prefix2 = 0;
             }
 
-            // The rest goes to tail byte array. We are checking it names on hot-path.
+            // The rest goes to tail byte array. We are reading it on hot-path.
             // Therefore, it is convenient to keep allocation for names near the buckets.
+            // One can avoid this by carefully recording the tail in a separate field,
+            // like the prefixes above, but this is simple enough to gain enough perf.
             int tailLen = len - tailStart;
             nameTail = new byte[tailLen];
             slice.get(begin + tailStart, nameTail, 0, tailLen);
 
+            // Seed the bucket with initial value.
             this.hash = hash;
             this.sum = temp;
             this.count = 1;
@@ -148,7 +169,7 @@ else if (len >= 4) {
             this.max = temp;
         }
 
-        // Little helper method to compare the array with given bytebuffer range.
+        // Little helper method to compare the array with given ByteBuffer range.
         public boolean matches(ByteBuffer cand, int begin, int end) {
             int origLen = len;
             int candLen = end - begin;
@@ -156,7 +177,7 @@ public boolean matches(ByteBuffer cand, int begin, int end) {
                 return false;
             }
 
-            // Check the prefixes first, to simplify the matches.
+            // Check the prefixes first, if we can.
             int tailStart = 0;
             if (origLen >= 8) {
                 if (prefix1 != cand.getInt(begin)) {
@@ -183,6 +204,7 @@ else if (origLen >= 4) {
             return true;
         }
 
+        // Check if current Bucket matches another.
         public boolean matches(Bucket other) {
             return len == other.len &&
                     prefix1 == other.prefix1 &&
@@ -190,9 +212,14 @@ public boolean matches(Bucket other) {
                     Arrays.equals(nameTail, other.nameTail);
         }
 
+        // Merge the temp value. Hot-path, should be fairly efficient.
         public void merge(int value) {
             sum += value;
             count++;
+
+            // We rarely do the updates, so these branches are almost
+            // never taken. Writing them as explicit branches instead of
+            // Math.{min,max} improves performance a bit.
             if (value < min) {
                 min = value;
             }
@@ -201,6 +228,7 @@ public void merge(int value) {
             }
         }
 
+        // Merge the buckets. Called during reporting, not a hot path.
         public void merge(Bucket s) {
             sum += s.sum;
             count += s.count;
@@ -209,7 +237,8 @@ public void merge(Bucket s) {
         }
 
         public Row toRow() {
-            // Reconstruct the name
+            // Reconstruct the name first. The prefixes and the tail were copied
+            // from the little-endian slice, so we need to match the endianness here.
             ByteBuffer bb = ByteBuffer.allocate(len);
             bb.order(ByteOrder.LITTLE_ENDIAN);
             if (len >= 4) {
@@ -231,7 +260,7 @@ public Row toRow() {
     // Quick and dirty linear-probing hash map. YOLO.
     public static final class MeasurementsMap {
         // Individual map buckets. Inlining these straight into map complicates
-        // the implementation without the sensible performance improvement.
+        // the implementation without much of a performance improvement.
         // The map is likely sparse, so whatever footprint loss we have due to
         // Bucket headers we gain by allocating the buckets lazily. The memory
         // dereference costs are still high in both cases. The additional benefit
@@ -240,14 +269,14 @@ public static final class MeasurementsMap {
         private final Bucket[] buckets = new Bucket[MAP_SIZE];
 
         // Fast path is inlined in seqCompute. This is a slow-path that is taken
-        // when something is off. We normally do not enter here.
+        // rarely, usually when there is a hash collision. We normally do not enter here.
         private void updateSlow(ByteBuffer name, int begin, int end, int hash, int temp) {
             int idx = hash & (MAP_SIZE - 1);
 
             while (true) {
                 Bucket cur = buckets[idx];
                 if (cur == null) {
-                    // No bucket yet, lucky us. Create the bucket with it.
+                    // No bucket yet, lucky us. Create the bucket and be done.
                     buckets[idx] = new Bucket(name, begin, end, hash, temp);
                     return;
                 }
@@ -287,9 +316,9 @@ else if ((cur.hash == other.hash) && cur.matches(other)) {
             }
         }
 
-        // Convert from internal representation to the rows.
-        // This does several major things: filters away null-s, instantates full Strings,
-        // and computes stats.
+        // Convert from internal representation to the rows. This does several
+        // major things: filters away null-s, instantiates full Strings, and
+        // computes the final rows.
         public int fill(Row[] rows) {
             int idx = 0;
             for (Bucket bucket : buckets) {
@@ -308,12 +337,15 @@ public static final class ParsingTask extends CountedCompleter<Void> {
         private final MappedByteBuffer mappedBuf;
         private final ByteBuffer buf;
 
+        // Entered from the root task, records the original mmap-ed slice
+        // for later cleanup.
         public ParsingTask(CountedCompleter<Void> p, MappedByteBuffer mappedBuf) {
             super(p);
             this.mappedBuf = mappedBuf;
             this.buf = mappedBuf;
         }
 
+        // Entered from the other parsing tasks.
         public ParsingTask(CountedCompleter<Void> p, ByteBuffer buf) {
             super(p);
             this.mappedBuf = null;
@@ -334,6 +366,10 @@ public void compute() {
 
         @Override
         public void onCompletion(CountedCompleter<?> caller) {
+            // FJP API: Would be called when this task completes. At that point,
+            // we know the mmap-ed slice is not needed anymore, and can give it
+            // out for unmmaps. We do not do unmmap here, let the main thread
+            // handle it for us, as we go on doing other hot work.
             if (DIRECT_UNMMAPS && (mappedBuf != null)) {
                 RELEASABLE_BUFFERS.offer(mappedBuf);
             }
@@ -342,7 +378,7 @@ public void onCompletion(CountedCompleter<?> caller) {
         private void internalCompute() throws Exception {
             int len = buf.limit();
             if (len > UNIT_SLICE_SIZE) {
-                // Split in half.
+                // Still a large chunk, let's split it in half.
                 int mid = len / 2;
 
                 // Figure out the boundary that does not split the line.
@@ -363,13 +399,17 @@ private void internalCompute() throws Exception {
                 new ParsingTask(this, buf.slice(mid, len - mid)).compute();
             }
             else {
+                // Small enough chunk, time to process it.
                 // The call to seqCompute would normally be non-inlined.
                 // Do setup stuff here to save inlining budget.
                 MeasurementsMap map = MAPS.get();
 
                 // Force the order we need for bit extraction to work. This fits
                 // most of the hardware very well without introducing platform
-                // dependencies.
+                // dependencies. Note that it would be wrong to use nativeOrder()
+                // here, because we _need_ a particular byte ordering for our
+                // computations to work. It just so happens that most hardware
+                // we have is LE.
                 buf.order(ByteOrder.LITTLE_ENDIAN);
 
                 // Go!
@@ -387,10 +427,12 @@ private void seqCompute(MeasurementsMap map, ByteBuffer origSlice, int length) t
             // object, which allows compiler to trust its fields more thoroughly.
             ByteBuffer slice = origSlice.slice();
 
-            // Do the same endianness as the original slice.
+            // New slice lost the endianness setting, set it up as the original slice.
             slice.order(ByteOrder.LITTLE_ENDIAN);
 
-            // Touch the buffer once to let the common checks to fire once for this slice.
+            // Touch the buffer once to let the compiler eject the common checks
+            // for this slice from the loop here. This is an odd, flaky, and sometimes
+            // desperate, but a safe thing to do.
             slice.get(0);
 
             int idx = 0;
@@ -418,47 +460,46 @@ private void seqCompute(MeasurementsMap map, ByteBuffer origSlice, int length) t
                 int nameEnd = idx - 1;
 
                 // Parse out the temperature. The rules specify temperatures
-                // are within -99.9..99.9. We implicitly look ahead for
-                // negative sign and carry the negative multiplier, if found.
-                // After that, we just need to reconstruct the temperature from
-                // two or three digits. The aggregation code expects temperatures
-                // at 10x scale.
-
+                // are within -99.9..99.9. This means even in the shortest case of
+                // "0.0<EOL>", we are not out of bounds for the int-sized read.
                 int intTemp = slice.getInt(idx);
 
                 int neg = 1;
                 if ((intTemp & 0xFF) == '-') {
                     // Unlucky, there is a sign. Record it, shift one byte and read
                     // the remaining digit again. Surprisingly, doing a second read
-                    // is not worse than reading into long and trying to do bit
-                    // shifts on it.
+                    // is not significantly worse than reading into long and trying
+                    // to do bit shifts on it. But it is significantly simpler.
                     neg = -1;
                     intTemp >>>= 8;
                     intTemp |= slice.get(idx + 4) << 24;
                     idx++;
                 }
 
-                // Since the sign is consumed, we are only left with two cases:
+                // Since the sign is consumed, we are only left with two cases,
+                // which means we can trivially extract the number from int.
                 int temp = 0;
                 if ((intTemp >>> 24) == '\n') {
-                    // EOL-digitL-point-digitH
+                    // Case 1: EOL-digitL-point-digitH
                     temp = (((intTemp & 0xFF)) - '0') * 10 +
                             ((intTemp >> 16) & 0xFF) - '0';
                     idx += 4;
                 }
                 else {
-                    // digitL-point-digitH-digitHH
+                    // Case 2: digitL-point-digitH-digitHH
                     temp = (((intTemp & 0xFF)) - '0') * 100 +
                             (((intTemp >> 8) & 0xFF) - '0') * 10 +
                             (((intTemp >>> 24)) - '0');
                     idx += 5;
                 }
+
+                // All done, just flip the sign, if needed.
                 temp *= neg;
 
                 // Time to update!
                 Bucket bucket = buckets[nameHash & (MAP_SIZE - 1)];
                 if ((bucket != null) && (nameHash == bucket.hash) && bucket.matches(slice, nameBegin, nameEnd)) {
-                    // Lucky fast path, existing bucket hit. Most of the time we complete here.
+                    // Lucky fast path: matching bucket hit. Most of the time we complete here.
                     bucket.merge(temp);
                 }
                 else {
@@ -475,9 +516,8 @@ private void seqCompute(MeasurementsMap map, ByteBuffer origSlice, int length) t
     // task and let it split, but unfortunately buffer API does not allow us
     // "long" start-s and length-s. So we have to chunk at least by mmap-ed
     // size first. It is a CountedCompleter for the same reason ParsingTask is.
-    // This also gives us a very nice opportunity to complete the work on
-    // a given mmap slice, while there is still other work to do. This allows
-    // us to unmap slices on the go.
+    // This also gives us a very nice opportunity to process mmap-ed chunks
+    // one by one, thus allowing incremental unmmaps.
     public static final class RootTask extends CountedCompleter<Void> {
         public RootTask() {
             super(null);
@@ -516,7 +556,7 @@ private void internalCompute() throws Exception {
                 }
                 end = minEnd + w;
 
-                // Fork out the large slice
+                // Fork out the large slice.
                 long len = end - start;
                 MappedByteBuffer slice = fc.map(FileChannel.MapMode.READ_ONLY, start, len);
                 start += len;
@@ -524,7 +564,7 @@ private void internalCompute() throws Exception {
                 // FJP API: Announce we have a pending task before forking.
                 addToPendingCount(1);
 
-                // ...and fork it
+                // ...and fork it!
                 new ParsingTask(this, slice).fork();
             }
 
@@ -537,6 +577,9 @@ private void internalCompute() throws Exception {
 
         @Override
         public void onCompletion(CountedCompleter<?> caller) {
+            // FJP API: This would be called when root task completes along with
+            // all subtasks. This means the processing is done, we can go and
+            // tell main thread about that.
             try {
                 RELEASABLE_BUFFERS.put(RELEASABLE_BUFFER_POISON_PILL);
             }
@@ -558,7 +601,8 @@ public static void main(String[] args) throws Exception {
 
         // While the root task is working, prepare what we need for the
         // end of the run. Go and try to report something to prepare the
-        // reporting code for execution.
+        // reporting code for execution. This prepares classes, storage,
+        // and some profiles for eventual execution.
         MeasurementsMap map = new MeasurementsMap();
         Row[] rows = new Row[MAP_SIZE];
         StringBuilder sb = new StringBuilder(16384);

From a8823a1f93e63999dfeec159cdfa21eac43a9cab Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sebastian=20L=C3=B6vdahl?= <slovdahl@hibox.fi>
Date: Wed, 31 Jan 2024 22:37:50 +0200
Subject: [PATCH 233/268] slovdahl's submission (#691)

* slovdahl: First submission

* More JAVA_OPTS flags, 0.1s better locally
---
 calculate_average_slovdahl.sh                 |  22 ++
 prepare_slovdahl.sh                           |  22 ++
 .../onebrc/CalculateAverage_slovdahl.java     | 278 ++++++++++++++++++
 3 files changed, 322 insertions(+)
 create mode 100755 calculate_average_slovdahl.sh
 create mode 100755 prepare_slovdahl.sh
 create mode 100644 src/main/java/dev/morling/onebrc/CalculateAverage_slovdahl.java

diff --git a/calculate_average_slovdahl.sh b/calculate_average_slovdahl.sh
new file mode 100755
index 000000000..3f99dc03e
--- /dev/null
+++ b/calculate_average_slovdahl.sh
@@ -0,0 +1,22 @@
+#!/bin/sh
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+JAVA_OPTS="${JAVA_OPTS} --enable-preview -XX:+UnlockExperimentalVMOptions -XX:+UnlockDiagnosticVMOptions"
+JAVA_OPTS="${JAVA_OPTS} -Xmx8g -Xms8g"
+JAVA_OPTS="${JAVA_OPTS} -XX:+TrustFinalNonStaticFields -XX:-UseCompressedOops"
+
+java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_slovdahl
diff --git a/prepare_slovdahl.sh b/prepare_slovdahl.sh
new file mode 100755
index 000000000..52791308f
--- /dev/null
+++ b/prepare_slovdahl.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+# Load SDKMAN so that the "sdk" command used below is available.
+source "$HOME/.sdkman/bin/sdkman-init.sh"
+
+sdk use java 21.0.2-tem 1>&2 > /dev/null
+./mvnw verify
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_slovdahl.java b/src/main/java/dev/morling/onebrc/CalculateAverage_slovdahl.java
new file mode 100644
index 000000000..d22409177
--- /dev/null
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_slovdahl.java
@@ -0,0 +1,278 @@
+/*
+ *  Copyright 2023 The original authors
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package dev.morling.onebrc;
+
+import java.io.IOException;
+import java.lang.foreign.Arena;
+import java.lang.foreign.MemorySegment;
+import java.lang.foreign.ValueLayout;
+import java.nio.channels.FileChannel;
+import java.nio.file.Paths;
+import java.nio.file.StandardOpenOption;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.StringJoiner;
+import java.util.TreeMap;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+
+import static java.util.stream.Collectors.collectingAndThen;
+import static java.util.stream.Collectors.groupingBy;
+import static java.util.stream.Collectors.reducing;
+
+public class CalculateAverage_slovdahl {
+
+    private static final String FILE = "./measurements.txt";
+
+    private static final int SLICE_SIZE = 1_048_576;
+
+    public static void main(String[] args) throws IOException, ExecutionException, InterruptedException {
+        // Leave one core free, but use at least one worker so "size / segments" cannot divide by zero.
+        int segments = Math.max(1, Runtime.getRuntime().availableProcessors() - 1);
+        try (Arena arena = Arena.ofShared();
+                FileChannel channel = FileChannel.open(Paths.get(FILE), StandardOpenOption.READ);
+                ExecutorService executor = Executors.newThreadPerTaskExecutor(Executors.defaultThreadFactory())) {
+
+            long size = channel.size();
+            if (size < SLICE_SIZE) {
+                segments = 1;
+            }
+
+            long idealSegmentSize = size / segments;
+
+            MemorySegment mappedFile = channel.map(FileChannel.MapMode.READ_ONLY, 0, size, arena);
+            var futures = new ArrayList<Future<Map<Station, MeasurementAggregator>>>(segments);
+            // Cut the mapped file into segments, moving each boundary forward to the next newline.
+            long segmentStart = 0;
+            for (int i = 1; i <= segments; i++) {
+                long actualSegmentOffset = idealSegmentSize * i;
+
+                while (actualSegmentOffset < size && mappedFile.get(ValueLayout.JAVA_BYTE, actualSegmentOffset) != (byte) '\n') {
+                    actualSegmentOffset++;
+                }
+
+                long end = actualSegmentOffset - segmentStart;
+                if (segmentStart + actualSegmentOffset - segmentStart + 1 < size) {
+                    end += 1;
+                }
+
+                MemorySegment segment = mappedFile.asSlice(segmentStart, end);
+                segmentStart = actualSegmentOffset + 1;
+                // Each segment is parsed by its own task into a task-private map; maps are merged later.
+                futures.add(executor.submit(() -> {
+                    byte[] array = new byte[SLICE_SIZE];
+                    MemorySegment bufferSegment = MemorySegment.ofArray(array);
+
+                    long position = 0;
+                    long segmentSize = segment.byteSize();
+                    Map<Station, MeasurementAggregator> map = HashMap.newHashMap(10_000);
+
+                    while (position < segmentSize) {
+                        long thisSliceSize = Math.min(SLICE_SIZE, segmentSize - position);
+
+                        MemorySegment.copy(
+                                segment,
+                                ValueLayout.JAVA_BYTE,
+                                position,
+                                bufferSegment,
+                                ValueLayout.JAVA_BYTE,
+                                0,
+                                thisSliceSize);
+
+                        if (thisSliceSize % 8 != 0) {
+                            bufferSegment
+                                    .asSlice(thisSliceSize)
+                                    .fill((byte) 0);
+                        }
+
+                        int newlinePosition = 0;
+                        int startOffset = 0;
+                        while (true) {
+                            int semicolonPosition = nextOccurrence(array, (byte) ';', startOffset);
+                            if (semicolonPosition < 0) {
+                                break;
+                            }
+
+                            int eolPosition = nextOccurrence(array, (byte) '\n', startOffset);
+                            if (eolPosition < 0) {
+                                if (semicolonPosition < segmentSize - 4) {
+                                    break;
+                                }
+                                else {
+                                    newlinePosition = (int) segmentSize;
+                                }
+                            }
+                            else {
+                                newlinePosition = eolPosition;
+                            }
+
+                            byte[] nameArray = new byte[semicolonPosition - startOffset];
+                            System.arraycopy(array, startOffset, nameArray, 0, semicolonPosition - startOffset);
+                            Station station = new Station(nameArray);
+
+                            int temperatureStart = semicolonPosition + 1;
+                            int temperatureLength = newlinePosition - semicolonPosition - 1;
+
+                            int temperatureIntValue;
+                            if (array[temperatureStart] == '-') {
+                                if (temperatureLength == 4) {
+                                    temperatureIntValue = -1 * ((array[temperatureStart + 1] - 48) * 10 +
+                                            (array[temperatureStart + 3] - 48));
+                                }
+                                else {
+                                    temperatureIntValue = -1 * ((array[temperatureStart + 1] - 48) * 100 +
+                                            (array[temperatureStart + 2] - 48) * 10 +
+                                            (array[temperatureStart + 4] - 48));
+                                }
+                            }
+                            else {
+                                if (temperatureLength == 3) {
+                                    temperatureIntValue = (array[temperatureStart] - 48) * 10 +
+                                            (array[temperatureStart + 2] - 48);
+                                }
+                                else {
+                                    temperatureIntValue = (array[temperatureStart] - 48) * 100 +
+                                            (array[temperatureStart + 1] - 48) * 10 +
+                                            (array[temperatureStart + 3] - 48);
+                                }
+                            }
+
+                            MeasurementAggregator agg = map.get(station);
+                            if (agg == null) {
+                                agg = new MeasurementAggregator();
+                                map.put(station, agg);
+                            }
+
+                            agg.min = Math.min(agg.min, temperatureIntValue);
+                            agg.max = Math.max(agg.max, temperatureIntValue);
+                            agg.sum += temperatureIntValue;
+                            agg.count++;
+
+                            // Make sure the next iteration won't find the same delimiters.
+                            array[semicolonPosition] = (byte) 0;
+                            array[newlinePosition] = (byte) 0;
+
+                            startOffset = newlinePosition + 1;
+                        }
+
+                        position += newlinePosition + 1;
+                    }
+
+                    return map;
+                }));
+            }
+            // Merge the per-task maps by station name; the TreeMap keeps the output sorted by name.
+            TreeMap<String, ResultRow> result = futures.stream()
+                    .map(f -> {
+                        try {
+                            return f.get();
+                        }
+                        catch (InterruptedException | ExecutionException e) {
+                            throw new RuntimeException(e);
+                        }
+                    })
+                    .flatMap(m -> m.entrySet().stream())
+                    .collect(groupingBy(
+                            e -> new String(e.getKey().name()),
+                            TreeMap::new,
+                            collectingAndThen(
+                                    reducing(
+                                            new MeasurementAggregator(),
+                                            Map.Entry::getValue,
+                                            (agg1, agg2) -> {
+                                                MeasurementAggregator res = new MeasurementAggregator();
+                                                res.min = Math.min(agg1.min, agg2.min);
+                                                res.max = Math.max(agg1.max, agg2.max);
+                                                res.sum = agg1.sum + agg2.sum;
+                                                res.count = agg1.count + agg2.count;
+
+                                                return res;
+                                            }),
+                                    agg -> new ResultRow(
+                                            agg.min / 10.0,
+                                            (Math.round((agg.sum / 10.0) * 10.0) / 10.0) / agg.count,
+                                            agg.max / 10.0))));
+
+            System.out.println(result);
+
+            executor.shutdownNow();
+        }
+    }
+
+    private static int nextOccurrence(byte[] data, byte needle, int offset) {
+        while (offset < data.length) {
+            if (data[offset] == needle) {
+                return offset;
+            }
+            offset++;
+        }
+        return -1;
+    }
+
+    private record Station(byte[] name, int hash) {
+        private Station(byte[] name) {
+            this(name, Arrays.hashCode(name));
+        }
+
+        @Override
+        public boolean equals(Object o) {
+            if (this == o) {
+                return true;
+            }
+            if (o == null || getClass() != o.getClass()) {
+                return false;
+            }
+            Station station = (Station) o;
+            return Arrays.equals(name, station.name);
+        }
+
+        @Override
+        public int hashCode() {
+            return hash;
+        }
+
+        @Override
+        public String toString() {
+            return new StringJoiner(", ", Station.class.getSimpleName() + "[", "]")
+                    .add("name=" + new String(name))
+                    .add("hash=" + hash)
+                    .toString();
+        }
+    }
+
+    private static class MeasurementAggregator {
+        private int min = Integer.MAX_VALUE;
+        private int max = Integer.MIN_VALUE;
+        private long sum;
+        private long count;
+    }
+
+    private record ResultRow(double min, double mean, double max) {
+
+        @Override
+        public String toString() {
+            return round(min) + "/" + round(mean) + "/" + round(max);
+        }
+
+        private double round(double value) {
+            return Math.round(value * 10.0) / 10.0;
+        }
+    }
+}

From a533019ad4b5fbd695bf569322b669d103065042 Mon Sep 17 00:00:00 2001
From: Stephen Von Worley <137962273+stephenvonworley@users.noreply.github.com>
Date: Wed, 31 Jan 2024 12:49:19 -0800
Subject: [PATCH 234/268] CalculateAverage_stephenvonworley submission (#677)

* first release

* change constants to names

---------

Co-authored-by: Stephen Von Worley <von@von.io>
---
 calculate_average_stephenvonworley.sh         |  25 +
 prepare_stephenvonworley.sh                   |  25 +
 .../CalculateAverage_stephenvonworley.java    | 530 ++++++++++++++++++
 3 files changed, 580 insertions(+)
 create mode 100755 calculate_average_stephenvonworley.sh
 create mode 100755 prepare_stephenvonworley.sh
 create mode 100644 src/main/java/dev/morling/onebrc/CalculateAverage_stephenvonworley.java

diff --git a/calculate_average_stephenvonworley.sh b/calculate_average_stephenvonworley.sh
new file mode 100755
index 000000000..2fca19ffa
--- /dev/null
+++ b/calculate_average_stephenvonworley.sh
@@ -0,0 +1,25 @@
+#!/bin/sh
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+if [ -f target/CalculateAverage_stephenvonworley_image ]; then # prefer the native image when it has been built
+    target/CalculateAverage_stephenvonworley_image
+else
+    JAVA_OPTS="--enable-preview"
+    echo "Choosing to run the app in JVM mode as no native image was found, use prepare_stephenvonworley.sh to generate." 1>&2
+    java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_stephenvonworley
+fi
+
diff --git a/prepare_stephenvonworley.sh b/prepare_stephenvonworley.sh
new file mode 100755
index 000000000..4e8d22511
--- /dev/null
+++ b/prepare_stephenvonworley.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+source "$HOME/.sdkman/bin/sdkman-init.sh"
+sdk use java 21.0.2-graal 1>&2
+
+# Build the native image only when it is missing; ./mvnw clean verify removes target/ and will re-trigger native image creation.
+if [ ! -f target/CalculateAverage_stephenvonworley_image ]; then
+    NATIVE_IMAGE_OPTS="--gc=epsilon -O3 -H:TuneInlinerExploration=1 -march=native --enable-preview --initialize-at-build-time=dev.morling.onebrc.CalculateAverage_stephenvonworley"
+    native-image $NATIVE_IMAGE_OPTS -cp target/average-1.0.0-SNAPSHOT.jar -o target/CalculateAverage_stephenvonworley_image dev.morling.onebrc.CalculateAverage_stephenvonworley
+fi
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_stephenvonworley.java b/src/main/java/dev/morling/onebrc/CalculateAverage_stephenvonworley.java
new file mode 100644
index 000000000..a51b24d71
--- /dev/null
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_stephenvonworley.java
@@ -0,0 +1,530 @@
+/*
+ *  Copyright 2023 The original authors
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package dev.morling.onebrc;
+
+import java.io.*;
+import java.lang.foreign.*;
+import java.lang.reflect.Field;
+import java.nio.*;
+import java.nio.channels.*;
+import java.nio.file.*;
+import java.nio.charset.*;
+import java.util.*;
+import java.util.concurrent.*;
+import java.util.stream.*;
+import sun.misc.Unsafe;
+
+/*
+ * Stephen Von Worley's (von@von.io) entry to Gunnar Morling's "One Billion Row Challenge":
+ * https://www.morling.dev/blog/one-billion-row-challenge/
+ *
+ * To compute the desired result, this program:
+ * 1. Memory maps the input file.
+ * 2. Partitions the file into a queue of Chunks, which delimit sections of the file.
+ * 3. Spawns one thread per processor. Each thread:
+ *    a. Allocates a Table, which will accumulate names and tallies (min/max/total/count).
+ *    b. Gets a Chunk from the queue.
+ *    c. Processes the Chunk using a parser that reads the Chunk simultaneously at three
+ *       different, evenly-spaced locations, using heavily-optimized scalar code.
+ *    d. Repeats steps b and c until there are no more Chunks.
+ * 4. Aggregates the resulting Tables into a treemap of names to Tallies.
+ * 5. Outputs the names and Tallies in ascending name order.
+ *
+ * Runs fastest as a natively-compiled, standalone binary, as might be produced by Graal's
+ * `native-image` utility.  Tested with Oracle Graal 21.0.2.
+ * 
+ * Incorporates code authored by a number of submitters, including Thomas Wue, Quan Anh
+ * Mai, and others.
+ *
+ * Thanks y'all, and Happy Rowing!
+ * Steve
+ * von@von.io
+ * www.von.io
+ */
+
+public class CalculateAverage_stephenvonworley {
+
+    private static final int NAME_LIMIT = 10000; // length of Table.addresses, i.e. distinct names one Table can record
+
+    private static final long CHUNK_SIZE = 5000000; // nominal chunk length in bytes, before newline alignment
+    private static final long CHUNK_PAD = 200; // spare bytes that must follow (or are appended to) a chunk
+    private static final long CHUNK_PARSE3_LIMIT = 1000; // shorter chunks skip the three-way parser
+
+    private static final long GOLDEN_LONG = 0x9e3779b97f4a7c15L; // multiplier used to hash name words in locate()
+    private static final long TALLY_BITS = 7;
+    private static final long TALLY_SIZE = 1L << TALLY_BITS; // 128 bytes per slot
+    private static final long HASH_BITS = 16; // 65536 slots per table
+    private static final long HASH_MASK = ((1L << HASH_BITS) - 1) << TALLY_BITS; // wraps a byte offset back into the table
+    private static final long TABLE_SIZE = 1L << (HASH_BITS + TALLY_BITS); // 8 MiB per table
+
+    private static final long OFFSET_MIN = 0; // short: minimum
+    private static final long OFFSET_MAX = 2; // short: maximum
+    private static final long OFFSET_COUNT = 4; // int: number of readings
+    private static final long OFFSET_TOTAL = 8; // long: sum of readings
+    private static final long OFFSET_LEN = 16; // byte: name length
+    private static final long OFFSET_NAME = 17; // name bytes start here
+
+    private static final Unsafe unsafe;
+    static {
+        try {
+            Field f = Unsafe.class.getDeclaredField("theUnsafe"); // looked up reflectively and made accessible below
+            f.setAccessible(true);
+            unsafe = (Unsafe) f.get(null); // null receiver: the field is read as a static
+        }
+        catch (Exception e) {
+            throw new RuntimeException("Exception initializing unsafe", e);
+        }
+    }
+
+    public static void main(String[] args) throws IOException, InterruptedException {
+        // Without "--worker" this process only re-executes itself as a worker and relays its output.
+        boolean isWorker = Arrays.asList(args).contains("--worker");
+        if (!isWorker) {
+            spawnWorker();
+            return;
+        }
+        // Worker pipeline: map -> partition -> parse in parallel -> merge -> print.
+        Queue<Chunk> work = partition(map("./measurements.txt"));
+        List<Table> perThread = process(work, processorCount());
+        Map<String, Tally> merged = aggregate(perThread);
+        System.out.println(merged);
+        System.out.close();
+    }
+
+    // credit: "Spawn worker" code by Thomas Wue
+    private static void spawnWorker() throws IOException {
+        ProcessHandle.Info self = ProcessHandle.current().info();
+        List<String> cmd = new ArrayList<>();
+        self.command().ifPresent(cmd::add);
+        self.arguments().ifPresent(given -> cmd.addAll(Arrays.asList(given)));
+        cmd.add("--worker"); // marks the child as the process that does the actual work
+        Process worker = new ProcessBuilder(cmd).inheritIO().redirectOutput(ProcessBuilder.Redirect.PIPE).start();
+        worker.getInputStream().transferTo(System.out);
+    }
+
+    private static int processorCount() { // main passes this to process() as the number of parser threads
+        return Runtime.getRuntime().availableProcessors();
+    }
+
+    private static MemorySegment map(String path) throws IOException { // maps the whole file read-only
+        FileChannel file = FileChannel.open(Path.of(path), StandardOpenOption.READ); // NOTE(review): the channel is never closed
+        return file.map(FileChannel.MapMode.READ_ONLY, 0, file.size(), Arena.global()); // bound to the global arena
+    }
+
+    private static MemorySegment allocate(long len) { // len bytes from the global arena
+        return Arena.global().allocate(len, 4096); // second argument is the byte alignment
+    }
+
+    private static Queue<Chunk> partition(MemorySegment in) throws IOException {
+        // Cuts the mapping into newline-aligned chunks of roughly CHUNK_SIZE bytes and queues them.
+        final long base = in.address();
+        final long limit = base + in.byteSize();
+        Queue<Chunk> queue = new ConcurrentLinkedDeque<>();
+
+        long from = base;
+        while (from < limit) {
+            long to = from + CHUNK_SIZE;
+            // Stop at the end of the mapping, otherwise extend to just past the next newline.
+            to = (to >= limit) ? limit : afterNewline(to);
+            final long size = to - from;
+            if (to + CHUNK_PAD < limit) {
+                // More than CHUNK_PAD mapped bytes follow: the chunk is parsed in place.
+                queue.offer(new Chunk(from, to));
+            }
+            else {
+                // Near the end of the mapping: hand out a copy that has CHUNK_PAD spare bytes
+                // behind the data (presumably so word-sized reads may run past the last record).
+                MemorySegment copy = allocate(size + CHUNK_PAD);
+                MemorySegment.copy(in, from - base, copy, 0, size);
+                queue.offer(new Chunk(copy.address(), copy.address() + size));
+            }
+            from = to;
+        }
+        return queue;
+    }
+
+    private static List<Table> process(Queue<Chunk> chunks, int threadCount) throws InterruptedException {
+        List<Table> results = Collections.synchronizedList(new ArrayList<>(threadCount));
+        // Every worker registers its own Table and drains the shared queue until it is empty.
+        Runnable worker = () -> {
+            Table own = new Table();
+            results.add(own);
+            for (Chunk c = chunks.poll(); c != null; c = chunks.poll()) {
+                parse3(c.start(), c.end(), own);
+            }
+        };
+        Thread[] workers = new Thread[threadCount];
+        for (int i = 0; i < threadCount; i++) {
+            workers[i] = new Thread(worker);
+            workers[i].start();
+        }
+        for (Thread t : workers) {
+            t.join();
+        }
+        return results;
+    }
+
+    private static Map<String, Tally> aggregate(List<Table> tables) { // merges all per-thread tables into one map
+        Map<String, Tally> nameToTally = new TreeMap<>(); // TreeMap: iteration (and thus printing) is in ascending name order
+        tables.forEach(table -> aggregate(nameToTally, table));
+        return nameToTally;
+    }
+
+    private static void aggregate(Map<String, Tally> nameToTally, Table table) { // folds every entry of one table into the map
+        table.process((name, min, max, total, count) -> nameToTally.computeIfAbsent(name, _ -> new Tally()).add(min, max, total, count)); // a Tally is created on a name's first appearance
+    }
+
+    private static void parse3(long start, long end, Table table) {
+        // Parses [start, end) as three interleaved streams A, B and C; short chunks go straight to parse1.
+        if (end - start < CHUNK_PARSE3_LIMIT) {
+            parse1(start, end, table);
+            return;
+        }
+
+        final long tallies = table.tallies;
+
+        long part = (end - start) / 3;
+        long startA = start;
+        long startB = afterNewline(start + part); // B and C begin at the first line start after the 1/3 and 2/3 marks
+        long startC = afterNewline(start + 2 * part);
+        long endA = startB;
+        long endB = startC;
+        long endC = end;
+
+        while (true) { // each round re-estimates how many records all three streams can still supply
+            long N = min(
+                    remaining(startA, endA),
+                    remaining(startB, endB),
+                    remaining(startC, endC));
+
+            if (N <= 1) { // too little left in at least one stream: finish with parse1 below
+                break;
+            }
+
+            while (N > 0) { // N records per stream, with no per-record end check
+                long semicolonA = semicolon(startA);
+                long semicolonB = semicolon(startB);
+                long semicolonC = semicolon(startC);
+
+                long tallyA = locate(startA, semicolonA, tallies, table);
+                long tallyB = locate(startB, semicolonB, tallies, table);
+                long tallyC = locate(startC, semicolonC, tallies, table);
+
+                long numberA = number(semicolonA);
+                tally(tallyA, numberA);
+                long numberB = number(semicolonB);
+                tally(tallyB, numberB);
+                long numberC = number(semicolonC);
+                tally(tallyC, numberC);
+
+                startA = next(semicolonA);
+                startB = next(semicolonB);
+                startC = next(semicolonC);
+                N--;
+            }
+        }
+
+        parse1(startA, endA, table); // leftovers of each stream, one record at a time
+        parse1(startB, endB, table);
+        parse1(startC, endC, table);
+    }
+
+    private static void parse1(long start, long end, Table table) {
+        // Scalar path: handles one record per iteration until the position reaches end.
+        final long base = table.tallies;
+        long pos = start;
+        while (pos < end) {
+            final long sep = semicolon(pos);
+            final long slot = locate(pos, sep, base, table);
+            tally(slot, number(sep));
+            pos = next(sep);
+        }
+    }
+
+    private static long remaining(long start, long end) { // record count parse3 may process unchecked; presumably assumes no record exceeds 128 bytes
+        return (end - start) >> 7; // byte distance divided by 128
+    }
+
+    // credit: Adapted from code by Thomas Wue
+    private static long semicolon(long start) { // returns the address of the first ';' after start
+        start++; // the byte at start itself is never tested
+        long word = getLong(start);
+        long input = word ^ 0x3B3B3B3B3B3B3B3BL; // 0x3B is ';', so a matching byte becomes 0x00
+        long tmp = (input - 0x0101010101010101L) & ~input & 0x8080808080808080L; // zero-byte test: lowest set bit marks the first 0x00 byte
+        if (tmp != 0) {
+            return start + (Long.numberOfTrailingZeros(tmp) >>> 3); // bit index / 8 = byte index (assumes little-endian loads)
+        }
+        while (true) { // same test on each following 8-byte word
+            start += 8;
+            long word2 = getLong(start);
+            long input2 = word2 ^ 0x3B3B3B3B3B3B3B3BL;
+            long tmp2 = (input2 - 0x0101010101010101L) & ~input2 & 0x8080808080808080L;
+            if (tmp2 != 0) {
+                return start + (Long.numberOfTrailingZeros(tmp2) >>> 3);
+            }
+        }
+    }
+
+    private static long trim(long value, long remove) { // zeroes the `remove` most significant bytes of value
+        long shift = remove << 3; // bytes -> bits
+        return ((value << shift) >>> shift);
+    }
+
+    // https://softwareengineering.stackexchange.com/questions/402542/where-do-magic-hashing-constants-like-0x9e3779b9-and-0x9e3779b1-come-from
+    private static long locate(long start, long semicolon, long tallies, Table table) { // slot address for the name in [start, semicolon); claims a free slot on first sight
+        long len = semicolon - start;
+        long word = getLong(start);
+        if (len <= 8) { // short name: one masked word is both hash input and comparison key
+            word = trim(word, 8 - len);
+            long hash = word * GOLDEN_LONG;
+            long offset = (hash >>> (64 - HASH_BITS)) << TALLY_BITS; // top HASH_BITS bits pick the slot, scaled to a byte offset
+            while (true) {
+                long tally = tallies + offset;
+                long tlen = getByte(tally + OFFSET_LEN);
+                long tword = getLong(tally + OFFSET_NAME);
+                if (len == tlen && word == tword) {
+                    return tally;
+                }
+                if (tword == 0) { // a zero first name word is treated as "slot unused"
+                    init(tally, start, len, table);
+                    return tally;
+                }
+                offset = (offset + TALLY_SIZE) & HASH_MASK; // linear probing with wrap-around
+            }
+        }
+        else {
+            long word2 = getLong(semicolon - 8); // last eight name bytes; overlaps word when len < 16
+            long hash = (word + word2) * GOLDEN_LONG;
+            long offset = (hash >>> (64 - HASH_BITS)) << TALLY_BITS;
+            while (true) {
+                long tally = tallies + offset;
+                long tword = getLong(tally + OFFSET_NAME);
+                if (len <= 16) { // first and last word together cover the whole name
+                    long tlen = getByte(tally + OFFSET_LEN);
+                    long tword2 = getLong(tally + OFFSET_NAME + len - 8);
+                    if (len == tlen && word == tword && word2 == tword2) {
+                        return tally;
+                    }
+                }
+                else { // longer names need the full comparison
+                    if (match(tally, start, len)) {
+                        return tally;
+                    }
+                }
+                if (tword == 0) { // a zero first name word is treated as "slot unused"
+                    init(tally, start, len, table);
+                    return tally;
+                }
+                offset = (offset + TALLY_SIZE) & HASH_MASK; // linear probing with wrap-around
+            }
+        }
+    }
+
+    private static void init(long tally, long start, long len, Table t) { // claims the slot at tally for the len name bytes at start
+        setShort(tally + OFFSET_MIN, Short.MAX_VALUE); // min starts at the largest short
+        setShort(tally + OFFSET_MAX, Short.MIN_VALUE); // max starts at the smallest short
+        setByte(tally + OFFSET_LEN, (byte) len);
+        copyMemory(start, tally + OFFSET_NAME, len);
+        t.addresses[t.count++] = tally; // recorded so Table.process can enumerate the used slots
+    }
+
+    private static boolean match(long tally, long name, long len) { // true when the slot holds exactly the len bytes found at name
+        if (getByte(tally + OFFSET_LEN) != len) {
+            return false;
+        }
+        long a = name;
+        long b = tally + OFFSET_NAME;
+        while (len > 7) { // whole 8-byte words first
+            if (getLong(a) != getLong(b)) {
+                return false;
+            }
+            a += 8;
+            b += 8;
+            len -= 8;
+        }
+        if (len > 0) { // 1..7 trailing bytes
+            return (trim(getLong(a), 8 - len) == getLong(b)); // only the input word is masked; apparently relies on zero bytes after the stored name
+        }
+        return true;
+    }
+
+    // credit: Wonderfully-fast number parsing implementation by Quan Anh Mai
+    private static long number(long semicolon) { // parses the reading that follows the ';' into a signed integer without the decimal point
+        long numberWord = getLong(semicolon + 1);
+        int decimalSepPos = Long.numberOfTrailingZeros(~numberWord & 0x10101000); // lowest of bytes 1..3 whose bit 4 is clear
+        int shift = 28 - decimalSepPos;
+        // signed is -1 if negative, 0 otherwise
+        long signed = (~numberWord << 59) >> 63;
+        long designMask = ~(signed & 0xFF); // clears the first byte when signed is -1
+        // Align the number to a specific position and transform the ascii to digit value
+        long digits = ((numberWord & designMask) << shift) & 0x0F000F0F00L;
+        // Now digits is in the form 0xUU00TTHH00 (UU: units digit, TT: tens digit, HH: hundreds digit)
+        // 0xUU00TTHH00 * (100 * 0x1000000 + 10 * 0x10000 + 1) =
+        // 0x000000UU00TTHH00 + 0x00UU00TTHH000000 * 10 + 0xUU00TTHH00000000 * 100
+        long absValue = ((digits * 0x640a0001) >>> 32) & 0x3FF;
+        return (absValue ^ signed) - signed; // negates absValue when signed is -1
+    }
+
+    private static void tally(long tally, long number) {
+        // Folds one reading into the slot at address tally: min and max only move when the
+        // reading is a new extreme, count and total are always advanced.
+        final long minAt = tally + OFFSET_MIN;
+        final long maxAt = tally + OFFSET_MAX;
+        if (number < getShort(minAt)) {
+            setShort(minAt, (short) number);
+        }
+        if (number > getShort(maxAt)) {
+            setShort(maxAt, (short) number);
+        }
+        setInt(tally + OFFSET_COUNT, getInt(tally + OFFSET_COUNT) + 1);
+        setLong(tally + OFFSET_TOTAL, getLong(tally + OFFSET_TOTAL) + number);
+    }
+
+    private static long next(long semicolon) { // returns an address 5, 6 or 7 bytes past semicolon, chosen from two bits of the word read there
+        long word = getLong(semicolon);
+        semicolon += 7; // largest step; the two lines below shorten it
+        semicolon -= (~word >>> (24 + 4)) & 1; // minus 1 when bit 28 of word is clear
+        semicolon -= (~word >>> (16 + 4 - 1)) & 2; // minus 2 when bit 20 of word is clear
+        return semicolon;
+    }
+
+    private static long afterNewline(long start) { // address just past the first '\n' at or after start
+        while (getByte(start) != '\n') // no upper bound: a newline must exist ahead
+            start++;
+        return start + 1;
+    }
+
+    private static long min(long a, long b, long c) { // smallest of the three arguments
+        return Math.min(a, Math.min(b, c));
+    }
+
+    private static byte getByte(long addr) { // raw 1-byte load from the absolute address addr
+        return unsafe.getByte(addr);
+    }
+
+    private static short getShort(long addr) { // raw 2-byte load from the absolute address addr
+        return unsafe.getShort(addr);
+    }
+
+    private static int getInt(long addr) { // raw 4-byte load from the absolute address addr
+        return unsafe.getInt(addr);
+    }
+
+    private static long getLong(long addr) { // raw 8-byte load from the absolute address addr
+        return unsafe.getLong(addr);
+    }
+
+    private static void setByte(long addr, byte value) { // raw 1-byte store to the absolute address addr
+        unsafe.putByte(addr, value);
+    }
+
+    private static void setShort(long addr, short value) { // raw 2-byte store to the absolute address addr
+        unsafe.putShort(addr, value);
+    }
+
+    private static void setInt(long addr, int value) { // raw 4-byte store to the absolute address addr
+        unsafe.putInt(addr, value);
+    }
+
+    private static void setLong(long addr, long value) { // raw 8-byte store to the absolute address addr
+        unsafe.putLong(addr, value);
+    }
+
+    private static void copyMemory(long srcAddr, long dstAddr, long count) { // copies count bytes between two absolute addresses
+        unsafe.copyMemory(srcAddr, dstAddr, count);
+    }
+
+    private static record Chunk(long start, long end) { // absolute addresses; the parsers treat end as exclusive
+    }
+
+    private static class Table {
+        public final long tallies; // base address of the slot memory obtained from allocate()
+        public final long[] addresses; // slot addresses in the order init() registered them
+        public int count; // number of used entries in addresses
+
+        public Table() {
+            tallies = allocate(TABLE_SIZE).address();
+            addresses = new long[NAME_LIMIT];
+            count = 0;
+        }
+
+        public void process(Consumer consumer) {
+            // Decodes every registered slot and hands its name and statistics to the consumer.
+            for (int slot = 0; slot < count; slot++) {
+                final long base = addresses[slot];
+                final byte[] raw = new byte[getByte(base + OFFSET_LEN)];
+                for (int k = 0; k < raw.length; k++) {
+                    raw[k] = getByte(base + OFFSET_NAME + k);
+                }
+                consumer.consume(
+                        new String(raw, StandardCharsets.UTF_8),
+                        getShort(base + OFFSET_MIN),
+                        getShort(base + OFFSET_MAX),
+                        getLong(base + OFFSET_TOTAL),
+                        getInt(base + OFFSET_COUNT));
+            }
+        }
+    }
+
+    private static interface Consumer { // callback that Table.process invokes once per stored name
+        public void consume(String name, long min, long max, long total, long count);
+    }
+
+    private static class Tally {
+        // Merged statistics of one station; min, max and total are kept in tenths.
+        private long min;
+        private long max;
+        private long total;
+        private long count;
+
+        public Tally() {
+            this.min = Short.MAX_VALUE;
+            this.max = Short.MIN_VALUE;
+            this.total = 0;
+            this.count = 0;
+        }
+
+        public void add(long addMin, long addMax, long addTotal, long addCount) {
+            min = Math.min(min, addMin);
+            max = Math.max(max, addMax);
+            total += addTotal;
+            count += addCount;
+        }
+
+        public long getMin() {
+            return min;
+        }
+
+        public long getMax() {
+            return max;
+        }
+
+        public long getTotal() {
+            return total;
+        }
+
+        public long getCount() {
+            return count;
+        }
+
+        // Renders min/mean/max with one fractional digit. The mean is rounded in tenths with Math.round (ties go up);
+        // String.format("%.1f") was locale-dependent, rounded ties away from zero and could print "-0.0".
+        @Override
+        public String toString() {
+            return getMin() / 10.0 + "/" + Math.round((double) getTotal() / getCount()) / 10.0 + "/" + getMax() / 10.0;
+        }
+    }
+}

From 33143cdbb81581228f32947e4a50951358331b7c Mon Sep 17 00:00:00 2001
From: Breejesh Rathod <breejesh2212@gmail.com>
Date: Thu, 1 Feb 2024 02:22:40 +0530
Subject: [PATCH 235/268] breejesh Submission (#670)

* 1BRC breejesh

* Fix output

* Fix formatting

* Format and remove preview feature

* Optimize merge

* Revert "Optimize merge"

This reverts commit 28c9b4af29e1c90e992e8a1fd4f3258895782c2c.

---------

Co-authored-by: Breejesh Rathod <breejesh.rathod@m2pfintech.com>
---
 calculate_average_breejesh.sh                 |  19 ++
 prepare_breejesh.sh                           |  20 ++
 .../onebrc/CalculateAverage_breejesh.java     | 180 ++++++++++++++++++
 3 files changed, 219 insertions(+)
 create mode 100755 calculate_average_breejesh.sh
 create mode 100755 prepare_breejesh.sh
 create mode 100644 src/main/java/dev/morling/onebrc/CalculateAverage_breejesh.java

diff --git a/calculate_average_breejesh.sh b/calculate_average_breejesh.sh
new file mode 100755
index 000000000..0f0738b2b
--- /dev/null
+++ b/calculate_average_breejesh.sh
@@ -0,0 +1,19 @@
+#!/bin/sh
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+JAVA_OPTS="" # no extra JVM flags
+java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_breejesh
diff --git a/prepare_breejesh.sh b/prepare_breejesh.sh
new file mode 100755
index 000000000..4cda7b411
--- /dev/null
+++ b/prepare_breejesh.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+# Uncomment below to use sdk
+# source "$HOME/.sdkman/bin/sdkman-init.sh"
+# sdk use java 21.0.1-graal 1>&2
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_breejesh.java b/src/main/java/dev/morling/onebrc/CalculateAverage_breejesh.java
new file mode 100644
index 000000000..3ee87c943
--- /dev/null
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_breejesh.java
@@ -0,0 +1,180 @@
+/*
+ *  Copyright 2023 The original authors
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package dev.morling.onebrc;
+
+import java.io.File;
+import java.io.IOException;
+import java.nio.MappedByteBuffer;
+import java.nio.channels.FileChannel;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.StandardOpenOption;
+import java.util.*;
+import java.util.concurrent.CompletableFuture;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+
+public class CalculateAverage_breejesh {
+    private static final String FILE = "./measurements.txt";
+    private static final int TWO_BYTE_TO_INT = 480 + 48; // 48 is the ASCII code for '0'
+    private static final int THREE_BYTE_TO_INT = 4800 + 480 + 48;
+    // Upper bound for one line: 100 name bytes + ';' + "-99.9" + '\n' = 107 bytes, rounded up.
+    private static final int MAX_LINE_LENGTH = 128;
+
+    /** Running min/max/sum/count of one station; all temperatures are kept in tenths of a degree. */
+    private static final class Measurement {
+        private int min;
+        private int max;
+        private long total; // long: with a billion rows the sum of tenths can exceed the int range
+        private int count;
+
+        public Measurement(int value) {
+            this.min = value;
+            this.max = value;
+            this.total = value;
+            this.count = 1;
+        }
+
+        /** Renders {@code min/mean/max}; the mean is rounded to the nearest tenth. */
+        @Override
+        public String toString() {
+            double mean = Math.round(((double) total) / count) / 10.0;
+            return min / 10.0 + "/" + mean + "/" + max / 10.0;
+        }
+
+        private void append(int min, int max, long total, int count) {
+            this.min = Math.min(this.min, min);
+            this.max = Math.max(this.max, max);
+            this.total += total;
+            this.count += count;
+        }
+
+        /** Adds a single reading. */
+        public void append(int value) {
+            append(value, value, value, 1);
+        }
+
+        /** Folds the statistics of {@code other} into this instance. */
+        public void merge(Measurement other) {
+            append(other.min, other.max, other.total, other.count);
+        }
+    }
+
+    /** Aggregates the file given as first argument (default {@code FILE}) in parallel and prints the name-sorted result. */
+    public static void main(String[] args) throws Exception {
+        var file = new File(args.length > 0 ? args[0] : FILE);
+        long fileSize = file.length();
+        var numberOfCores = fileSize > 1_000_000 ? Runtime.getRuntime().availableProcessors() : 1;
+        // One section plus its overrun must fit an int-sized ByteBuffer; never 0, to avoid dividing by zero.
+        var splitSectionSize = (int) Math.max(1, Math.min(Integer.MAX_VALUE - MAX_LINE_LENGTH, fileSize / numberOfCores));
+        // Round up, otherwise the bytes behind the last full section would never be read.
+        var segmentCount = (int) ((fileSize + splitSectionSize - 1) / splitSectionSize);
+
+        // Divide file into segments
+        ExecutorService executor = Executors.newFixedThreadPool(Math.max(1, segmentCount));
+        List<CompletableFuture<Map<String, Measurement>>> futures = new ArrayList<>();
+        for (int i = 0; i < segmentCount; i++) {
+            long sectionStart = i * (long) splitSectionSize;
+            long sectionEnd = Math.min(fileSize, sectionStart + splitSectionSize + MAX_LINE_LENGTH);
+            futures.add(CompletableFuture.supplyAsync(() -> parseSection(file, sectionStart, sectionEnd, splitSectionSize), executor));
+        }
+
+        CompletableFuture.allOf(futures.toArray(new CompletableFuture[0])).join();
+        Map<String, Measurement> finalMap = new TreeMap<>();
+        for (CompletableFuture<Map<String, Measurement>> future : futures) {
+            for (Map.Entry<String, Measurement> entry : future.get().entrySet()) {
+                Measurement known = finalMap.putIfAbsent(entry.getKey(), entry.getValue());
+                if (known != null) {
+                    known.merge(entry.getValue());
+                }
+            }
+        }
+        System.out.println(finalMap);
+        // The pool threads are never shut down, so exit explicitly instead of waiting for them.
+        System.exit(0);
+    }
+
+    /** Aggregates every line that starts within {@code splitSectionSize} bytes of {@code sectionStart}. */
+    private static Map<String, Measurement> parseSection(File file, long sectionStart, long sectionEnd, int splitSectionSize) {
+        MappedByteBuffer currentBuffer;
+        // A mapping stays valid after its channel is closed, so the channel is released right away.
+        try (var fileChannel = FileChannel.open(file.toPath(), StandardOpenOption.READ)) {
+            currentBuffer = fileChannel.map(FileChannel.MapMode.READ_ONLY, sectionStart, sectionEnd - sectionStart);
+        }
+        catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+        // Skip till new line for unequal segments, not to be done for first section
+        if (sectionStart > 0) {
+            while (currentBuffer.get() != '\n') {
+                // bytes of a line that the previous section has consumed
+            }
+        }
+        Map<String, Measurement> map = new HashMap<>();
+        // "<=": a line starting exactly on the section boundary is skipped by the next section,
+        // so it is consumed here. hasRemaining() ends the last section at the end of the file.
+        while (currentBuffer.hasRemaining() && currentBuffer.position() <= splitSectionSize) {
+            String str = getStationFromBuffer(currentBuffer);
+            int value = getValueFromBuffer(currentBuffer);
+            Measurement known = map.get(str);
+            if (known == null) {
+                map.put(str, new Measurement(value));
+            }
+            else {
+                known.append(value);
+            }
+        }
+        return map;
+    }
+
+    /** Reads the station name and consumes the {@code ';'} that terminates it. */
+    private static String getStationFromBuffer(MappedByteBuffer currentBuffer) {
+        byte currentByte;
+        var byteCounter = 0;
+        var buffer = new byte[100];
+        while ((currentByte = currentBuffer.get()) != ';') {
+            buffer[byteCounter++] = currentByte;
+        }
+        return new String(buffer, 0, byteCounter, StandardCharsets.UTF_8);
+    }
+
+    /** Parses {@code n.n}, {@code nn.n}, {@code -n.n} or {@code -nn.n} into tenths and consumes the line break. */
+    private static int getValueFromBuffer(MappedByteBuffer currentBuffer) {
+        int value;
+        byte[] nums = new byte[4];
+        currentBuffer.get(nums);
+        if (nums[1] == '.') {
+            // case of n.n
+            value = (nums[0] * 10 + nums[2] - TWO_BYTE_TO_INT);
+        }
+        else {
+            if (nums[3] == '.') {
+                // case of -nn.n
+                value = -(nums[1] * 100 + nums[2] * 10 + currentBuffer.get() - THREE_BYTE_TO_INT);
+            }
+            else if (nums[0] == '-') {
+                // case of -n.n
+                value = -(nums[1] * 10 + nums[3] - TWO_BYTE_TO_INT);
+            }
+            else {
+                // case of nn.n
+                value = (nums[0] * 100 + nums[1] * 10 + nums[3] - THREE_BYTE_TO_INT);
+            }
+            currentBuffer.get(); // new line
+        }
+        return value;
+    }
+}

From b91c95a498c5959ae391c7ad4fdeb2162e31b73d Mon Sep 17 00:00:00 2001
From: Sudhir Tumati <mailofsudhir@gmail.com>
Date: Thu, 1 Feb 2024 04:57:32 +0800
Subject: [PATCH 236/268] sudhirtumati implementation (#598)

---
 calculate_average_sudhirtumati.sh             |  19 ++
 prepare_sudhirtumati.sh                       |  20 ++
 .../onebrc/CalculateAverage_sudhirtumati.java | 304 ++++++++++++++++++
 3 files changed, 343 insertions(+)
 create mode 100755 calculate_average_sudhirtumati.sh
 create mode 100755 prepare_sudhirtumati.sh
 create mode 100644 src/main/java/dev/morling/onebrc/CalculateAverage_sudhirtumati.java

diff --git a/calculate_average_sudhirtumati.sh b/calculate_average_sudhirtumati.sh
new file mode 100755
index 000000000..fb31f8672
--- /dev/null
+++ b/calculate_average_sudhirtumati.sh
@@ -0,0 +1,19 @@
+#!/bin/sh
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+JAVA_OPTS="--enable-preview -Xmx128m -XX:+UseSerialGC -XX:-TieredCompilation" # --enable-preview: the class uses STR string templates
+java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_sudhirtumati # JAVA_OPTS is unquoted on purpose so it splits into separate flags
diff --git a/prepare_sudhirtumati.sh b/prepare_sudhirtumati.sh
new file mode 100755
index 000000000..735bdab4c
--- /dev/null
+++ b/prepare_sudhirtumati.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+# Uncomment below to use sdk
+source "$HOME/.sdkman/bin/sdkman-init.sh"
+sdk use java 21.0.2-open 1>&2
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_sudhirtumati.java b/src/main/java/dev/morling/onebrc/CalculateAverage_sudhirtumati.java
new file mode 100644
index 000000000..813c561a5
--- /dev/null
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_sudhirtumati.java
@@ -0,0 +1,304 @@
+/*
+ *  Copyright 2023 The original authors
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package dev.morling.onebrc;
+
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.channels.FileChannel;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.TreeMap;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.Semaphore;
+
+public class CalculateAverage_sudhirtumati {
+
+    private static final String FILE = "./measurements.txt";
+    private static final int bufferSize = 8192; // bytes of fresh data per chunk; chunks after the first add 50 bytes of look-behind
+    private static final byte SEMICOLON = (byte) ';';
+    private static final byte NEW_LINE = (byte) '\n';
+    private static final int THREAD_COUNT = Runtime.getRuntime().availableProcessors();
+    private static final Semaphore PERMITS = new Semaphore(THREAD_COUNT); // one permit is held per running worker
+    private static final MeasurementAggregator globalAggregator = new MeasurementAggregator();
+    private static final Semaphore AGGREGATOR_PERMITS = new Semaphore(1); // serialises merges into globalAggregator
+    private static final Map<Integer, String> LOCATION_STORE = new ConcurrentHashMap<>(); // Arrays.hashCode(name bytes) -> station name
+
+    public static void main(String[] args) throws IOException, InterruptedException {
+        // Aggregates ./measurements.txt and prints the result to standard output.
+        new CalculateAverage_sudhirtumati().chunkProcess();
+    }
+
+    private void chunkProcess() throws IOException, InterruptedException {
+        try (FileInputStream is = new FileInputStream(FILE);
+             FileChannel fc = is.getChannel()) {
+            for (int i = 0; i < THREAD_COUNT; i++) {
+                PERMITS.acquire();
+                Thread t = new ChunkProcessingThread(i, fc);
+                t.setName(STR."T\{i}");
+                t.start();
+            }
+            do {
+                Thread.sleep(100);
+            } while (PERMITS.availablePermits() != THREAD_COUNT);
+        }
+        System.out.println(globalAggregator.getResult());
+    }
+
+    static class ChunkProcessingThread extends Thread {
+
+        private int index; // index of the chunk currently being processed; advanced by THREAD_COUNT after each chunk
+        private final FileChannel fc;
+        private final MeasurementAggregator aggregator; // per-thread totals, merged into globalAggregator at the end of run()
+
+        ChunkProcessingThread(int index, FileChannel fc) {
+            this.index = index;
+            this.fc = fc;
+            aggregator = new MeasurementAggregator();
+        }
+
+        @Override
+        public void run() {
+            // Chunks are dealt round-robin; every chunk except the file's first is read with 50 bytes of look-behind.
+            ByteBuffer buffer = ByteBuffer.allocate(index == 0 ? bufferSize : bufferSize + 50);
+            long fcPosition = index == 0 ? 0 : (((long) index * bufferSize) - 50);
+            try {
+                while (fc.read(buffer, fcPosition) != -1) {
+                    buffer.flip();
+                    if (index != 0) {
+                        seekStartPos(buffer);
+                    }
+                    processBuffer(buffer);
+                    index += THREAD_COUNT;
+                    fcPosition = ((long) index * bufferSize) - 50L;
+                    buffer = buffer.capacity() == bufferSize ? ByteBuffer.allocate(bufferSize + 50) : buffer.clear(); // clear() also restores the limit
+                }
+                AGGREGATOR_PERMITS.acquire(); // only one thread at a time may merge into the shared aggregator
+                globalAggregator.process(aggregator);
+                AGGREGATOR_PERMITS.release();
+            }
+            catch (IOException | InterruptedException e) {
+                throw new RuntimeException(e);
+            }
+            finally {
+                PERMITS.release(); // released even on failure so the permit count always returns to THREAD_COUNT
+            }
+        }
+
+        /** Scans from the buffer's current position to its limit and hands every complete name;value line to the aggregator. */
+        private void processBuffer(ByteBuffer buffer) throws IOException {
+            int lineStart = buffer.position();
+            int semicolonAt = -1;
+            int cursor = lineStart;
+            do {
+                switch (buffer.get(cursor)) {
+                    case SEMICOLON -> semicolonAt = cursor;
+                    case NEW_LINE -> {
+                        byte[] name = new byte[semicolonAt - lineStart];
+                        byte[] reading = new byte[cursor - semicolonAt]; // the value bytes plus the trailing newline
+                        buffer.get(lineStart, name);
+                        buffer.get(semicolonAt + 1, reading);
+                        aggregator.process(name, reading);
+                        lineStart = cursor + 1;
+                    }
+                }
+                cursor++;
+            } while (cursor < buffer.limit());
+        }
+
+        private void seekStartPos(ByteBuffer buffer) {
+            int probe = buffer.limit() > 50 ? 49 : buffer.limit() - 2; // 49 is the last byte of the 50-byte look-behind
+            while (probe >= 0 && buffer.get(probe) != NEW_LINE) {
+                probe--;
+            }
+            if (probe >= 0) {
+                buffer.position(probe + 1); // first byte after the newline found closest to the probe start
+            }
+        }
+    }
+
+    static final class MeasurementAggregator {
+        private static final long MAX_VALUE_DIVIDE_10 = Long.MAX_VALUE / 10; // parseDouble halves its accumulator while it is at or above this, before appending a digit
+        private final Map<Integer, Measurement> store = new HashMap<>(); // Arrays.hashCode(name bytes) -> running totals
+
+        /**
+         * Folds every entry of {@code other} into this aggregator,
+         * creating a fresh {@link Measurement} for keys not seen here yet.
+         */
+        public void process(MeasurementAggregator other) {
+            for (Map.Entry<Integer, Measurement> entry : other.store.entrySet()) {
+                Measurement target = store.computeIfAbsent(entry.getKey(), key -> new Measurement());
+                target.process(entry.getValue());
+            }
+        }
+
+        /**
+         * Parses one reading and adds it to the entry for {@code location}.
+         * NOTE(review): entries are keyed by Arrays.hashCode of the name bytes only, so two
+         * names with the same hash would be merged into one entry — confirm this is acceptable.
+         */
+        public void process(byte[] location, byte[] temperature) throws IOException {
+            Integer key = Arrays.hashCode(location);
+            LOCATION_STORE.computeIfAbsent(key, _ -> new String(location));
+            Measurement target = store.computeIfAbsent(key, _ -> new Measurement());
+            double reading = parseDouble(temperature);
+            target.process(reading);
+        }
+
+        public double parseDouble(byte[] bytes) { // parses ASCII text such as "-12.3"; bytes that are neither digit nor '.' are skipped
+            long value = 0; // all digits seen so far, read as one integer
+            int exp = 0; // binary exponent that asDouble applies to value
+            boolean negative = false;
+            int decimalPlaces = Integer.MIN_VALUE; // stays negative until a '.' is seen, then counts fractional digits
+            int index = 0;
+            int ch = bytes[index];
+            if (ch == '-') {
+                negative = true;
+                ch = bytes[++index];
+            }
+            while (index < bytes.length) {
+                if (ch >= '0' && ch <= '9') {
+                    while (value >= MAX_VALUE_DIVIDE_10) { // halve value (and bump exp) so appending a digit cannot overflow
+                        value >>>= 1;
+                        exp++;
+                    }
+                    value = value * 10 + (ch - '0');
+                    decimalPlaces++;
+
+                }
+                else if (ch == '.') {
+                    decimalPlaces = 0;
+                }
+                if (index == bytes.length - 1) { // last byte handled: stop before reading past the array
+                    break;
+                }
+                else {
+                    ch = bytes[++index];
+                }
+            }
+            return asDouble(value, exp, negative, decimalPlaces);
+        }
+
+        private static double asDouble(long value, int exp, boolean negative, int decimalPlaces) { // approx. value * 2^exp / 10^decimalPlaces, negated if requested
+            if (decimalPlaces > 0 && value < Long.MAX_VALUE / 2) { // shift value left as far as it fits, lowering exp by the same amount
+                if (value < Long.MAX_VALUE / (1L << 32)) {
+                    exp -= 32;
+                    value <<= 32;
+                }
+                if (value < Long.MAX_VALUE / (1L << 16)) {
+                    exp -= 16;
+                    value <<= 16;
+                }
+                if (value < Long.MAX_VALUE / (1L << 8)) {
+                    exp -= 8;
+                    value <<= 8;
+                }
+                if (value < Long.MAX_VALUE / (1L << 4)) {
+                    exp -= 4;
+                    value <<= 4;
+                }
+                if (value < Long.MAX_VALUE / (1L << 2)) {
+                    exp -= 2;
+                    value <<= 2;
+                }
+                if (value < Long.MAX_VALUE / (1L << 1)) {
+                    exp -= 1;
+                    value <<= 1;
+                }
+            }
+            for (; decimalPlaces > 0; decimalPlaces--) { // one pass per fractional digit: divide by 10 as (/5) plus (exp - 1)
+                exp--;
+                long mod = value % 5;
+                value /= 5;
+                int modDiv = 1; // tracks how far value is shifted below, so the remainder can be scaled to match
+                if (value < Long.MAX_VALUE / (1L << 4)) {
+                    exp -= 4;
+                    value <<= 4;
+                    modDiv <<= 4;
+                }
+                if (value < Long.MAX_VALUE / (1L << 2)) {
+                    exp -= 2;
+                    value <<= 2;
+                    modDiv <<= 2;
+                }
+                if (value < Long.MAX_VALUE / (1L << 1)) {
+                    exp -= 1;
+                    value <<= 1;
+                    modDiv <<= 1;
+                }
+                if (decimalPlaces > 1)
+                    value += modDiv * mod / 5; // add back the remainder dropped by the integer division
+                else
+                    value += (modDiv * mod + 4) / 5; // final pass: same correction, rounded up
+            }
+            final double d = Math.scalb((double) value, exp);
+            return negative ? -d : d;
+        }
+
+        public String getResult() {
+            Map<String, Measurement> byName = new TreeMap<>(); // TreeMap orders the output by station name
+            store.forEach((hash, totals) -> byName.put(LOCATION_STORE.get(hash), totals));
+            return byName.toString();
+        }
+    }
+
+    /**
+     * Running minimum, maximum, sum and count of the readings seen for one station.
+     * Used both for single readings and for merging per-thread partial results.
+     */
+    static final class Measurement {
+        private double min = Double.POSITIVE_INFINITY;
+        private double max = Double.NEGATIVE_INFINITY;
+        private double sum;
+        private long count;
+
+        /** Records a single reading. */
+        public void process(double value) {
+            // strict comparisons: an equal value (including -0.0 vs 0.0) never replaces the stored one
+            min = value < min ? value : min;
+            max = value > max ? value : max;
+            sum += value;
+            count++;
+        }
+
+        /** Merges the totals accumulated by {@code other} into this instance. */
+        public void process(Measurement other) {
+            min = other.min < min ? other.min : min;
+            max = other.max > max ? other.max : max;
+            sum += other.sum;
+            count += other.count;
+        }
+
+        /** Renders the totals as min/mean/max by delegating to {@link ResultRow}. */
+        @Override
+        public String toString() {
+            return new ResultRow(min, sum, count, max).toString();
+        }
+    }
+
+    private record ResultRow(double min, double sum, double count, double max) {
+        public String toString() {
+            double mean = round(round(sum) / count); // the sum is rounded to one decimal before dividing
+            return STR."\{round(min)}/\{mean}/\{round(max)}";
+        }
+
+        private double round(double value) {
+            return Math.round(value * 10.0) / 10.0; // one decimal place
+        }
+    }
+
+}

From e639e2a045371ab0be51404767a42f22f689cf2c Mon Sep 17 00:00:00 2001
From: Jamal Mulla <jamaldevacc@gmail.com>
Date: Wed, 31 Jan 2024 21:09:25 +0000
Subject: [PATCH 237/268] Second attempt with various improvements (#510)

* Initial chunked impl

* Bytes instead of chars

* Improved number parsing

* Custom hashmap

* Graal and some tuning

* Fix segmenting

* Fix casing

* Unsafe

* Inlining hash calc

* Improved loop

* Cleanup

* Speeding up equals

* Simplifying hash

* Replace concurrenthashmap with lock

* Small changes

* Script reorg

* Native

* Lots of inlining and improvements

* Add back length check

* Fixes

* Small changes

---------

Co-authored-by: Jamal Mulla <j.mulla@mwam.com>
---
 calculate_average_JamalMulla.sh               |  10 +-
 prepare_JamalMulla.sh                         |   8 +-
 .../onebrc/CalculateAverage_JamalMulla.java   | 346 ++++++++----------
 3 files changed, 176 insertions(+), 188 deletions(-)

diff --git a/calculate_average_JamalMulla.sh b/calculate_average_JamalMulla.sh
index 228d56bfb..119263bad 100755
--- a/calculate_average_JamalMulla.sh
+++ b/calculate_average_JamalMulla.sh
@@ -15,5 +15,11 @@
 #  limitations under the License.
 #
 
-JAVA_OPTS="--enable-preview -XX:+UnlockExperimentalVMOptions -XX:+TrustFinalNonStaticFields -XX:+UseTransparentHugePages"
-java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_JamalMulla
+
+
+if [ -f target/CalculateAverage_JamalMulla_image ]; then
+    target/CalculateAverage_JamalMulla_image
+else
+    JAVA_OPTS="--enable-preview -XX:+UnlockExperimentalVMOptions -XX:+TrustFinalNonStaticFields -XX:+UseTransparentHugePages -XX:-TieredCompilation"
+    java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_JamalMulla
+fi
\ No newline at end of file
diff --git a/prepare_JamalMulla.sh b/prepare_JamalMulla.sh
index ec0f35f1c..d950d43ce 100755
--- a/prepare_JamalMulla.sh
+++ b/prepare_JamalMulla.sh
@@ -16,4 +16,10 @@
 #
 
 source "$HOME/.sdkman/bin/sdkman-init.sh"
-sdk use java 21.0.1-graal 1>&2
\ No newline at end of file
+sdk use java 21.0.2-graal 1>&2
+
+# ./mvnw clean verify removes target/ and will re-trigger native image creation.
+if [ ! -f target/CalculateAverage_JamalMulla_image ]; then
+    NATIVE_IMAGE_OPTS="--gc=epsilon -O3 -march=native --enable-preview --strict-image-heap --link-at-build-time -R:MaxHeapSize=64m -da -dsa --no-fallback --initialize-at-build-time=dev.morling.onebrc.CalculateAverage_JamalMulla"
+    native-image $NATIVE_IMAGE_OPTS -cp target/average-1.0.0-SNAPSHOT.jar -o target/CalculateAverage_JamalMulla_image dev.morling.onebrc.CalculateAverage_JamalMulla
+fi
\ No newline at end of file
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_JamalMulla.java b/src/main/java/dev/morling/onebrc/CalculateAverage_JamalMulla.java
index 770588556..7daf1997f 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_JamalMulla.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_JamalMulla.java
@@ -21,21 +21,32 @@
 import java.io.RandomAccessFile;
 import java.lang.foreign.Arena;
 import java.lang.reflect.Field;
-import java.nio.MappedByteBuffer;
 import java.nio.channels.FileChannel;
 import java.nio.charset.StandardCharsets;
-import java.util.*;
+import java.util.Map;
+import java.util.TreeMap;
 import java.util.concurrent.locks.Lock;
 import java.util.concurrent.locks.ReentrantLock;
 
 public class CalculateAverage_JamalMulla {
 
-    private static final Map<String, ResultRow> global = new HashMap<>();
+    private static final long ALL_SEMIS = 0x3B3B3B3B3B3B3B3BL;
+    private static final Map<String, ResultRow> global = new TreeMap<>();
     private static final String FILE = "./measurements.txt";
     private static final Unsafe UNSAFE = initUnsafe();
     private static final Lock lock = new ReentrantLock();
-    private static final int FNV_32_INIT = 0x811c9dc5;
-    private static final int FNV_32_PRIME = 0x01000193;
+    private static final long FXSEED = 0x517cc1b727220a95L;
+
+    private static final long[] masks = {
+            0x0,
+            0x00000000000000FFL,
+            0x000000000000FFFFL,
+            0x0000000000FFFFFFL,
+            0x00000000FFFFFFFFL,
+            0x000000FFFFFFFFFFL,
+            0x0000FFFFFFFFFFFFL,
+            0x00FFFFFFFFFFFFFFL
+    };
 
     private static Unsafe initUnsafe() {
         try {
@@ -53,12 +64,16 @@ private static final class ResultRow {
         private int max;
         private long sum;
         private int count;
+        private final long keyStart;
+        private final byte keyLength;
 
-        private ResultRow(int v) {
+        private ResultRow(int v, final long keyStart, final byte keyLength) {
             this.min = v;
             this.max = v;
             this.sum = v;
             this.count = 1;
+            this.keyStart = keyStart;
+            this.keyLength = keyLength;
         }
 
         public String toString() {
@@ -68,236 +83,197 @@ public String toString() {
         private double round(double value) {
             return Math.round(value) / 10.0;
         }
+
     }
 
     private record Chunk(Long start, Long length) {
     }
 
-    static List<Chunk> getChunks(int numThreads, FileChannel channel) throws IOException {
+    static Chunk[] getChunks(int numThreads, FileChannel channel) throws IOException {
         // get all chunk boundaries
         final long filebytes = channel.size();
         final long roughChunkSize = filebytes / numThreads;
-        final List<Chunk> chunks = new ArrayList<>(numThreads);
+        final Chunk[] chunks = new Chunk[numThreads];
         final long mappedAddress = channel.map(FileChannel.MapMode.READ_ONLY, 0, filebytes, Arena.global()).address();
         long chunkStart = 0;
         long chunkLength = Math.min(filebytes - chunkStart - 1, roughChunkSize);
+        int i = 0;
         while (chunkStart < filebytes) {
-            // unlikely we need to read more than this many bytes to find the next newline
-            MappedByteBuffer mbb = channel.map(FileChannel.MapMode.READ_ONLY, chunkStart + chunkLength,
-                    Math.min(Math.min(filebytes - chunkStart - chunkLength, chunkLength), 100));
-
-            while (mbb.get() != 0xA /* \n */) {
+            while (UNSAFE.getByte(mappedAddress + chunkStart + chunkLength) != 0xA /* \n */) {
                 chunkLength++;
             }
 
-            chunks.add(new Chunk(mappedAddress + chunkStart, chunkLength + 1));
+            chunks[i++] = new Chunk(mappedAddress + chunkStart, chunkLength + 1);
             // to skip the nl in the next chunk
             chunkStart += chunkLength + 1;
             chunkLength = Math.min(filebytes - chunkStart - 1, roughChunkSize);
         }
+
         return chunks;
     }
 
-    private static class CalculateTask implements Runnable {
+    private static void run(Chunk chunk) {
 
-        private final SimplerHashMap results;
-        private final Chunk chunk;
+        // can't have more than 10000 unique keys but want to match max hash
+        final int MAPSIZE = 65536;
+        final ResultRow[] slots = new ResultRow[MAPSIZE];
 
-        public CalculateTask(Chunk chunk) {
-            this.results = new SimplerHashMap();
-            this.chunk = chunk;
-        }
+        byte nameLength;
+        int temp;
+        long hash;
+
+        long i = chunk.start;
+        final long cl = chunk.start + chunk.length;
+        long word;
+        long hs;
+        long start;
+        byte c;
+        int slot;
+        long n;
+        ResultRow slotValue;
+
+        while (i < cl) {
+            start = i;
+            hash = 0;
+
+            word = UNSAFE.getLong(i);
+
+            while (true) {
+                n = word ^ ALL_SEMIS;
+                hs = (n - 0x0101010101010101L) & (~n & 0x8080808080808080L);
+                if (hs != 0)
+                    break;
+                hash = (hash ^ word) * FXSEED;
+                i += 8;
+                word = UNSAFE.getLong(i);
+            }
 
-        @Override
-        public void run() {
-            // no names bigger than this
-            final byte[] nameBytes = new byte[100];
-            short nameIndex = 0;
-            int ot;
-            // fnv hash
-            int hash = FNV_32_INIT;
-
-            long i = chunk.start;
-            final long cl = chunk.start + chunk.length;
-            while (i < cl) {
-                byte c;
-                while ((c = UNSAFE.getByte(i++)) != 0x3B /* semi-colon */) {
-                    nameBytes[nameIndex++] = c;
-                    hash ^= c;
-                    hash *= FNV_32_PRIME;
+            i += Long.numberOfTrailingZeros(hs) >> 3;
+
+            // hash of what's left ((hs >>> 7) - 1) masks off the bytes from word that are before the semicolon
+            hash = (hash ^ word & (hs >>> 7) - 1) * FXSEED;
+            nameLength = (byte) (i++ - start);
+
+            // temperature value follows
+            c = UNSAFE.getByte(i++);
+            // we know the val has to be between -99.9 and 99.9
+            // always with a single fractional digit
+            // represented as a byte array of either 4 or 5 characters
+            if (c != 0x2D /* minus sign */) {
+                // could be either n.x or nn.x
+                if (UNSAFE.getByte(i + 2) == 0xA) {
+                    temp = (c - 48) * 10; // char 1
                 }
-
-                // temperature value follows
-                c = UNSAFE.getByte(i++);
-                // we know the val has to be between -99.9 and 99.8
-                // always with a single fractional digit
-                // represented as a byte array of either 4 or 5 characters
-                if (c == 0x2D /* minus sign */) {
-                    // could be either n.x or nn.x
-                    if (UNSAFE.getByte(i + 3) == 0xA) {
-                        ot = (UNSAFE.getByte(i++) - 48) * 10; // char 1
-                    }
-                    else {
-                        ot = (UNSAFE.getByte(i++) - 48) * 100; // char 1
-                        ot += (UNSAFE.getByte(i++) - 48) * 10; // char 2
-                    }
-                    i++; // skip dot
-                    ot += (UNSAFE.getByte(i++) - 48); // char 2
-                    ot = -ot;
+                else {
+                    temp = (c - 48) * 100; // char 1
+                    temp += (UNSAFE.getByte(i++) - 48) * 10; // char 2
+                }
+                temp += (UNSAFE.getByte(++i) - 48); // char 3
+            }
+            else {
+                // could be either n.x or nn.x
+                if (UNSAFE.getByte(i + 3) == 0xA) {
+                    temp = (UNSAFE.getByte(i) - 48) * 10; // char 1
+                    i += 2;
                 }
                 else {
-                    // could be either n.x or nn.x
-                    if (UNSAFE.getByte(i + 2) == 0xA) {
-                        ot = (c - 48) * 10; // char 1
-                    }
-                    else {
-                        ot = (c - 48) * 100; // char 1
-                        ot += (UNSAFE.getByte(i++) - 48) * 10; // char 2
-                    }
-                    i++; // skip dot
-                    ot += (UNSAFE.getByte(i++) - 48); // char 3
+                    temp = (UNSAFE.getByte(i) - 48) * 100; // char 1
+                    temp += (UNSAFE.getByte(i + 1) - 48) * 10; // char 2
+                    i += 3;
+                }
+                temp += (UNSAFE.getByte(i) - 48); // char 2
+                temp = -temp;
+            }
+            i += 2;
+
+            // xor folding
+            slot = (int) (hash ^ hash >> 32) & 65535;
+
+            // Linear probe for open slot
+            while ((slotValue = slots[slot]) != null && (slotValue.keyLength != nameLength || !unsafeEquals(slotValue.keyStart, start, nameLength))) {
+                slot = (slot + 1) % MAPSIZE;
+            }
+
+            // existing
+            if (slotValue != null) {
+                slotValue.sum += temp;
+                slotValue.count++;
+                if (temp > slotValue.max) {
+                    slotValue.max = temp;
+                    continue;
                 }
+                if (temp < slotValue.min)
+                    slotValue.min = temp;
 
-                i++;// nl
-                hash &= 65535;
-                results.putOrMerge(nameBytes, nameIndex, hash, ot);
-                // reset
-                nameIndex = 0;
-                hash = 0x811c9dc5;
             }
+            else {
+                // new value
+                slots[slot] = new ResultRow(temp, start, nameLength);
+            }
+        }
 
-            // merge results with overall results
-            List<MapEntry> all = results.getAll();
-            lock.lock();
-            try {
-                for (MapEntry me : all) {
-                    ResultRow rr;
-                    ResultRow lr = me.row;
-                    if ((rr = global.get(me.key)) != null) {
-                        rr.min = Math.min(rr.min, lr.min);
-                        rr.max = Math.max(rr.max, lr.max);
-                        rr.count += lr.count;
-                        rr.sum += lr.sum;
+        // merge results with overall results
+        ResultRow rr;
+        String key;
+        byte[] bytes;
+        lock.lock();
+        try {
+            for (ResultRow resultRow : slots) {
+                if (resultRow != null) {
+                    bytes = new byte[resultRow.keyLength];
+                    // copy the name bytes
+                    UNSAFE.copyMemory(null, resultRow.keyStart, bytes, Unsafe.ARRAY_BYTE_BASE_OFFSET, resultRow.keyLength);
+                    key = new String(bytes, StandardCharsets.UTF_8);
+                    if ((rr = global.get(key)) != null) {
+                        rr.min = Math.min(rr.min, resultRow.min);
+                        rr.max = Math.max(rr.max, resultRow.max);
+                        rr.count += resultRow.count;
+                        rr.sum += resultRow.sum;
                     }
                     else {
-                        global.put(me.key, lr);
+                        global.put(key, resultRow);
                     }
                 }
             }
-            finally {
-                lock.unlock();
+        }
+        finally {
+            lock.unlock();
+        }
+
+    }
+
+    static boolean unsafeEquals(final long a_address, final long b_address, final byte b_length) {
+        // byte by byte comparisons are slow, so do as big chunks as possible
+        byte i = 0;
+        for (; i < (b_length & -8); i += 8) {
+            if (UNSAFE.getLong(a_address + i) != UNSAFE.getLong(b_address + i)) {
+                return false;
             }
         }
+        if (i == b_length)
+            return true;
+        return (UNSAFE.getLong(a_address + i) & masks[b_length - i]) == (UNSAFE.getLong(b_address + i) & masks[b_length - i]);
     }
 
     public static void main(String[] args) throws IOException, InterruptedException {
-        FileChannel channel = new RandomAccessFile(FILE, "r").getChannel();
         int numThreads = 1;
+        FileChannel channel = new RandomAccessFile(FILE, "r").getChannel();
         if (channel.size() > 64000) {
             numThreads = Runtime.getRuntime().availableProcessors();
         }
-        List<Chunk> chunks = getChunks(numThreads, channel);
-        List<Thread> threads = new ArrayList<>();
-        for (Chunk chunk : chunks) {
-            Thread thread = new Thread(new CalculateTask(chunk));
+        Chunk[] chunks = getChunks(numThreads, channel);
+        Thread[] threads = new Thread[chunks.length];
+        for (int i = 0; i < chunks.length; i++) {
+            int finalI = i;
+            Thread thread = new Thread(() -> run(chunks[finalI]));
             thread.setPriority(Thread.MAX_PRIORITY);
             thread.start();
-            threads.add(thread);
+            threads[i] = thread;
         }
         for (Thread t : threads) {
             t.join();
         }
-        // create treemap just to sort
-        System.out.println(new TreeMap<>(global));
+        System.out.println(global);
+        channel.close();
     }
-
-    record MapEntry(String key, ResultRow row) {
-    }
-
-    static class SimplerHashMap {
-        // can't have more than 10000 unique keys but want to match max hash
-        final int MAPSIZE = 65536;
-        final ResultRow[] slots = new ResultRow[MAPSIZE];
-        final byte[][] keys = new byte[MAPSIZE][];
-
-        public void putOrMerge(final byte[] key, final short length, final int hash, final int temp) {
-            int slot = hash;
-            ResultRow slotValue;
-
-            // Linear probe for open slot
-            while ((slotValue = slots[slot]) != null && (keys[slot].length != length || !unsafeEquals(keys[slot], key, length))) {
-                slot++;
-            }
-
-            // existing
-            if (slotValue != null) {
-                slotValue.min = Math.min(slotValue.min, temp);
-                slotValue.max = Math.max(slotValue.max, temp);
-                slotValue.sum += temp;
-                slotValue.count++;
-                return;
-            }
-
-            // new value
-            slots[slot] = new ResultRow(temp);
-            byte[] bytes = new byte[length];
-            System.arraycopy(key, 0, bytes, 0, length);
-            keys[slot] = bytes;
-        }
-
-        static boolean unsafeEquals(final byte[] a, final byte[] b, final short length) {
-            // byte by byte comparisons are slow, so do as big chunks as possible
-            final int baseOffset = Unsafe.ARRAY_BYTE_BASE_OFFSET;
-
-            short i = 0;
-            // round down to nearest power of 8
-            for (; i < (length & -8); i += 8) {
-                if (UNSAFE.getLong(a, i + baseOffset) != UNSAFE.getLong(b, i + baseOffset)) {
-                    return false;
-                }
-            }
-            if (i == length) {
-                return true;
-            }
-            // leftover ints
-            for (; i < (length - i & -4); i += 4) {
-                if (UNSAFE.getInt(a, i + baseOffset) != UNSAFE.getInt(b, i + baseOffset)) {
-                    return false;
-                }
-            }
-            if (i == length) {
-                return true;
-            }
-            // leftover shorts
-            for (; i < (length - i & -2); i += 2) {
-                if (UNSAFE.getShort(a, i + baseOffset) != UNSAFE.getShort(b, i + baseOffset)) {
-                    return false;
-                }
-            }
-            if (i == length) {
-                return true;
-            }
-            // leftover bytes
-            for (; i < (length - i); i++) {
-                if (UNSAFE.getByte(a, i + baseOffset) != UNSAFE.getByte(b, i + baseOffset)) {
-                    return false;
-                }
-            }
-
-            return true;
-        }
-
-        // Get all pairs
-        public List<MapEntry> getAll() {
-            final List<MapEntry> result = new ArrayList<>(slots.length);
-            for (int i = 0; i < slots.length; i++) {
-                ResultRow slotValue = slots[i];
-                if (slotValue != null) {
-                    result.add(new MapEntry(new String(keys[i], StandardCharsets.UTF_8), slotValue));
-                }
-            }
-            return result;
-        }
-    }
-
 }

From a11e5a12470c8c5edad09622f3d23b660853798b Mon Sep 17 00:00:00 2001
From: Elliot Barlas <elliotbarlas@gmail.com>
Date: Wed, 31 Jan 2024 13:12:35 -0800
Subject: [PATCH 238/268] Use GraalVM native image for ebarlas submission.
 (#698)

---
 calculate_average_ebarlas.sh | 9 +++++++--
 prepare_ebarlas.sh           | 7 ++++++-
 2 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/calculate_average_ebarlas.sh b/calculate_average_ebarlas.sh
index 2bd59d4ba..c73cb1abf 100755
--- a/calculate_average_ebarlas.sh
+++ b/calculate_average_ebarlas.sh
@@ -15,5 +15,10 @@
 #  limitations under the License.
 #
 
-JAVA_OPTS="--enable-preview"
-java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_ebarlas
+if [ -f target/CalculateAverage_ebarlas_image ]; then
+    echo "Picking up existing native image 'target/CalculateAverage_ebarlas_image', delete the file to select JVM mode." 1>&2
+    target/CalculateAverage_ebarlas_image
+else
+    echo "Choosing to run the app in JVM mode as no native image was found, use prepare_ebarlas.sh to generate." 1>&2
+    java --enable-preview --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_ebarlas
+fi
diff --git a/prepare_ebarlas.sh b/prepare_ebarlas.sh
index f83a3ff69..64b2bea81 100755
--- a/prepare_ebarlas.sh
+++ b/prepare_ebarlas.sh
@@ -16,4 +16,9 @@
 #
 
 source "$HOME/.sdkman/bin/sdkman-init.sh"
-sdk use java 21.0.1-graal 1>&2
+sdk use java 21.0.2-graal 1>&2
+
+if [ ! -f target/CalculateAverage_ebarlas_image ]; then
+    NATIVE_IMAGE_OPTS="-H:+UnlockExperimentalVMOptions --initialize-at-build-time=dev.morling.onebrc.CalculateAverage_ebarlas --gc=epsilon -O3 -march=native -R:MaxHeapSize=128m -H:-GenLoopSafepoints --enable-preview"
+    native-image $NATIVE_IMAGE_OPTS -cp target/average-1.0.0-SNAPSHOT.jar -o target/CalculateAverage_ebarlas_image dev.morling.onebrc.CalculateAverage_ebarlas
+fi

From 540ef2c8633994c62354d10d013a25048f79932c Mon Sep 17 00:00:00 2001
From: gonix <d.giedrius+github@gmail.com>
Date: Wed, 31 Jan 2024 23:17:08 +0200
Subject: [PATCH 239/268] CalculateAverage_gonixunsafe: an attempt in the
 unsafe category (#695)

Co-authored-by: Giedrius D <d.giedrius@gmail.com>
---
 calculate_average_gonixunsafe.sh              |  31 +
 .../onebrc/CalculateAverage_gonixunsafe.java  | 553 ++++++++++++++++++
 2 files changed, 584 insertions(+)
 create mode 100755 calculate_average_gonixunsafe.sh
 create mode 100644 src/main/java/dev/morling/onebrc/CalculateAverage_gonixunsafe.java

diff --git a/calculate_average_gonixunsafe.sh b/calculate_average_gonixunsafe.sh
new file mode 100755
index 000000000..24bee2797
--- /dev/null
+++ b/calculate_average_gonixunsafe.sh
@@ -0,0 +1,31 @@
+#!/bin/bash
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+
+JAVA_OPTS="--enable-preview"
+# Copied from @serkan-ozal
+# Unsure if it helps (maybe something within ~10ms),
+# but at least it doesn't seem to make anything worse.
+JAVA_OPTS="$JAVA_OPTS -XX:+UnlockExperimentalVMOptions -XX:+UnlockDiagnosticVMOptions"
+JAVA_OPTS="$JAVA_OPTS -XX:-TieredCompilation -XX:MaxInlineSize=10000 -XX:InlineSmallCode=10000 -XX:FreqInlineSize=10000"
+JAVA_OPTS="$JAVA_OPTS -XX:-UseCountedLoopSafepoints -XX:GuaranteedSafepointInterval=0"
+JAVA_OPTS="$JAVA_OPTS -XX:+TrustFinalNonStaticFields -da -dsa -XX:+UseNUMA -XX:-EnableJVMCI"
+if [[ ! "$(uname -s)" = "Darwin" ]]; then # the huge-pages flag is only added when not running on macOS
+  JAVA_OPTS="$JAVA_OPTS -XX:+UseTransparentHugePages"
+fi
+# The shell is replaced by cat, which ends at EOF on java's stdout (presumably to avoid waiting for JVM exit - TODO confirm).
+exec cat < <(exec java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_gonixunsafe)
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_gonixunsafe.java b/src/main/java/dev/morling/onebrc/CalculateAverage_gonixunsafe.java
new file mode 100644
index 000000000..bf75389a9
--- /dev/null
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_gonixunsafe.java
@@ -0,0 +1,553 @@
+/*
+ *  Copyright 2023 The original authors
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package dev.morling.onebrc;
+
+import java.io.IOException;
+import java.io.RandomAccessFile;
+import java.lang.foreign.Arena;
+import java.lang.foreign.MemorySegment;
+import java.lang.reflect.Field;
+import java.nio.channels.FileChannel;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.concurrent.atomic.AtomicReference;
+
+import sun.misc.Unsafe;
+
+public class CalculateAverage_gonixunsafe {
+
+    private static final String FILE = "./measurements.txt";
+    private static final int MAX_THREADS = Runtime.getRuntime().availableProcessors();
+
+    public static void main(String[] args) throws Exception {
+        // Splits the file into chunks, aggregates every chunk on its own thread and prints the merged result.
+        var file = new RandomAccessFile(FILE, "r");
+
+        var chunks = Aggregator.buildChunks(file, MAX_THREADS);
+        var chunksCount = chunks.size();
+        var threads = new Thread[chunksCount];
+        var result = new AtomicReference<Aggregator>(); // null, or an aggregator that finished and was published
+        for (int i = 0; i < chunksCount; ++i) {
+            var agg = new Aggregator();
+            var chunk = chunks.get(i);
+            var thread = new Thread(() -> {
+                agg.processChunk(chunk);
+                while (!result.compareAndSet(null, agg)) { // try to publish agg into the empty slot
+                    Aggregator other = result.getAndSet(null); // slot was occupied: try to claim its occupant
+                    if (other != null) {
+                        agg.merge(other); // fold the claimed aggregator into agg, then retry publishing
+                    }
+                }
+            });
+            thread.start();
+            threads[i] = thread;
+        }
+        for (int i = 0; i < chunksCount; ++i) {
+            threads[i].join();
+        }
+        System.out.println(result.get().toString());
+        System.out.close();
+    }
+
+    private static class Aggregator {
+        private static final int MAX_STATIONS = 10_000;
+        private static final int INDEX_SIZE = 256 * 1024 * 8;
+        private static final int INDEX_MASK = (INDEX_SIZE - 1) & ~7;
+
+        private static final int HEADER_SIZE = 8;
+        private static final int MAX_KEY_SIZE = 100;
+        private static final int FLD_COUNT = 0; // long
+        private static final int FLD_SUM = 8; // long
+        private static final int FLD_MIN = 16; // int
+        private static final int FLD_MAX = 20; // int
+        private static final int FLD_HASH = 24; // int
+        private static final int FIELDS_SIZE = 28 + 4; // +padding to align to 8 bytes
+        private static final int MAX_STATION_SIZE = HEADER_SIZE + MAX_KEY_SIZE + FIELDS_SIZE;
+
+        private static final Unsafe UNSAFE;
+
+        static { // obtains the Unsafe instance reflectively from the declared field "theUnsafe"
+            try {
+                Field unsafe = Unsafe.class.getDeclaredField("theUnsafe");
+                unsafe.setAccessible(true);
+                UNSAFE = (Unsafe) unsafe.get(Unsafe.class);
+            }
+            catch (Throwable e) {
+                throw new RuntimeException(e); // any failure aborts class initialization, keeping the cause
+            }
+        }
+
+        private static long alloc(long size) { // returns the address of `size` freshly allocated, zero-filled bytes
+            long addr = UNSAFE.allocateMemory(size);
+            UNSAFE.setMemory(addr, size, (byte) 0);
+            return addr;
+        }
+
+        // Poor man's hash map: hash code to offset in `mem`.
+        private final long indexAddr = alloc(INDEX_SIZE);
+
+        // Contiguous storage of key (station name) and stats fields of all
+        // unique stations.
+        // The idea here is to improve locality so that stats fields would
+        // possibly be already in the CPU cache after we are done comparing
+        // the key.
+        private final long memAddr = alloc(MAX_STATIONS * MAX_STATION_SIZE);
+        private long memUsed = memAddr;
+        private int count = 0;
+
+        static List<Chunk> buildChunks(RandomAccessFile file, int count) throws IOException { // maps the file and cuts it into line-aligned chunks
+            var fileSize = file.length();
+            var chunkSize = Math.min(Integer.MAX_VALUE - 512, fileSize / count); // target chunk length, capped at Integer.MAX_VALUE - 512
+            if (chunkSize <= 0) {
+                chunkSize = fileSize; // file has fewer bytes than `count`: aim for one chunk
+            }
+            var chunks = new ArrayList<Chunk>((int) (fileSize / chunkSize) + 1);
+            var mmap = file.getChannel().map(FileChannel.MapMode.READ_ONLY, 0, fileSize, Arena.global()); // whole file, read-only
+            var fileStartAddr = mmap.address();
+            var fileEndAddr = mmap.address() + mmap.byteSize();
+            var chunkStartAddr = fileStartAddr;
+            while (chunkStartAddr < fileEndAddr) {
+                var pos = chunkStartAddr + chunkSize; // tentative chunk end
+                if (pos < fileEndAddr) {
+                    while (UNSAFE.getByte(pos) != '\n') { // NOTE(review): not bounded by fileEndAddr; relies on a later '\n' existing
+                        pos += 1;
+                    }
+                    pos += 1; // end is exclusive: one past the '\n'
+                }
+                else {
+                    pos = fileEndAddr;
+                }
+                chunks.add(new Chunk(mmap, chunkStartAddr, pos, fileStartAddr, fileEndAddr));
+                chunkStartAddr = pos;
+            }
+            return chunks;
+        }
+
+        Aggregator processChunk(Chunk chunk) {
+            // Aggregates one chunk and returns this. Parsing reads whole longs, i.e. up to WANT_PADDING
+            // bytes past the chunk end; assumed safe while that stays within the file's last page.
+            final int WANT_PADDING = 8;
+            final int PAGE_SIZE = UNSAFE.pageSize();
+            if (((chunk.chunkEndAddr + WANT_PADDING) / PAGE_SIZE) <= (chunk.fileEndAddr / PAGE_SIZE)) {
+                return processChunk(chunk.chunkStartAddr, chunk.chunkEndAddr);
+            }
+            // Otherwise the last line(s) are copied to a zero-padded buffer and parsed from there.
+            // Addresses are absolute, so the backwards search for the start of that tail has to be
+            // bounded by the chunk start (not by 0): a chunk shorter than WANT_PADDING + 1 bytes
+            // would otherwise be scanned past its beginning.
+            long pos = Math.max(chunk.chunkStartAddr - 1, chunk.chunkEndAddr - WANT_PADDING - 1);
+            while (pos >= chunk.chunkStartAddr && UNSAFE.getByte(pos) != '\n') {
+                pos--;
+            }
+            pos++;
+            if (pos > chunk.chunkStartAddr) {
+                processChunk(chunk.chunkStartAddr, pos);
+            }
+            long tailLen = chunk.chunkEndAddr - pos;
+            var tailAddr = alloc(tailLen + WANT_PADDING);
+            UNSAFE.copyMemory(pos, tailAddr, tailLen);
+            processChunk(tailAddr, tailAddr + tailLen);
+            return this;
+        }
+
+        private Aggregator processChunk(long startAddr, long endAddr) {
+            long pos = startAddr;
+            while (pos < endAddr) {
+                // One line per iteration. Case 1: the ';' lies within the first 8 bytes of the line.
+                long start = pos;
+                long keyLong = UNSAFE.getLong(pos);
+                long valueSepMark = valueSepMark(keyLong);
+                if (valueSepMark != 0) {
+                    int tailBits = tailBits(valueSepMark);
+                    pos += valueOffset(tailBits);
+                    // assert (UNSAFE.getByte(pos - 1) == ';') : "Expected ';' (1), pos=" + (pos - startAddr);
+                    long tailAndLen = tailAndLen(tailBits, keyLong, pos - start - 1);
+
+                    long valueLong = UNSAFE.getLong(pos);
+                    int decimalSepMark = decimalSepMark(valueLong);
+                    pos += nextKeyOffset(decimalSepMark);
+                    // assert (UNSAFE.getByte(pos - 1) == '\n') : "Expected '\\n' (1), pos=" + (pos - startAddr);
+                    int measurement = decimalValue(decimalSepMark, valueLong);
+
+                    add1(start, tailAndLen, hash(hash1(tailAndLen)), measurement);
+                    continue;
+                }
+                // Case 2: the ';' lies within the second 8 bytes of the line.
+                pos += 8;
+                long keyLong1 = keyLong;
+                keyLong = UNSAFE.getLong(pos);
+                valueSepMark = valueSepMark(keyLong);
+                if (valueSepMark != 0) {
+                    int tailBits = tailBits(valueSepMark);
+                    pos += valueOffset(tailBits);
+                    // assert (UNSAFE.getByte(pos - 1) == ';') : "Expected ';' (2), pos=" + (pos - startAddr);
+                    long tailAndLen = tailAndLen(tailBits, keyLong, pos - start - 1);
+
+                    long valueLong = UNSAFE.getLong(pos);
+                    int decimalSepMark = decimalSepMark(valueLong);
+                    pos += nextKeyOffset(decimalSepMark);
+                    // assert (UNSAFE.getByte(pos - 1) == '\n') : "Expected '\\n' (2), pos=" + (pos - startAddr);
+                    int measurement = decimalValue(decimalSepMark, valueLong);
+
+                    add2(start, keyLong1, tailAndLen, hash(hash(hash1(keyLong1), tailAndLen)), measurement);
+                    continue;
+                }
+                // General case: fold each full 8-byte key word into the hash until the word holding ';' is read.
+                long hash = hash1(keyLong1);
+                do {
+                    pos += 8;
+                    hash = hash(hash, keyLong);
+                    keyLong = UNSAFE.getLong(pos);
+                    valueSepMark = valueSepMark(keyLong);
+                } while (valueSepMark == 0);
+                int tailBits = tailBits(valueSepMark);
+                pos += valueOffset(tailBits);
+                // assert (UNSAFE.getByte(pos - 1) == ';') : "Expected ';' (N), pos=" + (pos - startAddr);
+                long tailAndLen = tailAndLen(tailBits, keyLong, pos - start - 1);
+                hash = hash(hash, tailAndLen);
+
+                long valueLong = UNSAFE.getLong(pos);
+                int decimalSepMark = decimalSepMark(valueLong);
+                pos += nextKeyOffset(decimalSepMark);
+                // assert (UNSAFE.getByte(pos - 1) == '\n') : "Expected '\\n' (N), pos=" + (pos - startAddr);
+                int measurement = decimalValue(decimalSepMark, valueLong);
+
+                addN(start, tailAndLen, hash(hash), measurement);
+            }
+
+            return this;
+        }
+
+        private static long hash1(long value) {
+            return value; // the first key word seeds the running hash unchanged
+        }
+
+        private static long hash(long hash, long value) {
+            return value ^ hash; // fold the next key word into the running hash with XOR
+        }
+
+        private static int hash(long hash) {
+            // Final mix: multiply by the Fibonacci hashing constant and keep the top 25 bits.
+            return (int) ((hash * 0x9E3779B97F4A7C15L) >>> 39);
+        }
+
+        private static long valueSepMark(long keyLong) {
+            // XOR with ';' in every byte turns separator bytes into zero, then the "zero byte in word"
+            // trick marks them: https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
+            final long xored = keyLong ^ 0x3B3B3B3B_3B3B3B3BL; // 3B == ';'
+            final long candidates = ~xored & 0x80808080_80808080L;
+            return (xored - 0x01010101_01010101L) & candidates;
+        }
+
+        private static int tailBits(long valueSepMark) {
+            return Long.numberOfTrailingZeros(valueSepMark >>> 7); // bit offset of the lowest marked byte (64 if none is marked)
+        }
+
+        private static int valueOffset(int tailBits) {
+            return (int) (tailBits >>> 3) + 1; // bit offset -> byte index, plus one more byte
+        }
+
+        private static long tailAndLen(int tailBits, long keyLong, long keyLen) {
+            // Packs the low `tailBits` bits of keyLong (shifted up by one byte) together with
+            // the low 8 bits of keyLen, which end up in the lowest byte of the result.
+            return ((keyLong & ~(-1L << tailBits)) << 8) | (keyLen & 0xFF);
+        }
+
+        private static int decimalSepMark(long value) {
+            // Seen this trick used in multiple other solutions.
+            // Looks like the original author is @merykitty.
+            // Returns the position of the lowest tested bit that is 0 in `value`.
+            // The 4th binary digit of the ascii of a digit is 1 while
+            // that of the '.' is 0. This finds the decimal separator
+            // The value can be 12, 20, 28
+            return Long.numberOfTrailingZeros(~value & 0x10101000); // the mask tests bits 12, 20 and 28 only
+        }
+
+        private static int decimalValue(int decimalSepMark, long value) {
+            // Seen this trick used in multiple other solutions.
+            // Looks like the original author is @merykitty.
+
+            int shift = 28 - decimalSepMark;
+            // signed is -1 if negative, 0 otherwise
+            long signed = (~value << 59) >> 63; // driven by bit 4 of the first byte: 0 for '-' (0x2D), 1 for a digit
+            long designMask = ~(signed & 0xFF); // clears the first byte (the '-') when signed == -1
+            // Align the number to a specific position and transform the ascii code
+            // to actual digit value in each byte
+            long digits = ((value & designMask) << shift) & 0x0F000F0F00L;
+
+            // Now digits is in the form 0xUU00TTHH00 (UU: units digit, TT: tens digit, HH: hundreds digit)
+            // 0xUU00TTHH00 * (100 * 0x1000000 + 10 * 0x10000 + 1) =
+            // 0x000000UU00TTHH00 +
+            // 0x00UU00TTHH000000 * 10 +
+            // 0xUU00TTHH00000000 * 100
+            // Now TT * 100 has 2 trailing zeroes and HH * 100 + TT * 10 + UU < 0x400
+            // This results in our value lies in the bit 32 to 41 of this product
+            // That was close :)
+            long absValue = ((digits * 0x640a0001) >>> 32) & 0x3FF;
+            return (int) ((absValue ^ signed) - signed); // negates absValue when signed == -1, no-op when 0
+        }
+
+        private static int nextKeyOffset(int decimalSepMark) {
+            return (decimalSepMark >>> 3) + 3; // byte index for the marked bit, plus three more bytes
+        }
+
+        private void add1(long keyStartAddr, long tailAndLen, int hash, int measurement) { // add for keys whose ';' was in the first word
+            int idx = hash & INDEX_MASK; // byte offset of an 8-byte slot inside the index
+            for (long entryAddr; (entryAddr = UNSAFE.getLong(indexAddr + idx)) != 0; idx = (idx + 8) & INDEX_MASK) { // 0 == empty slot
+                if (update1(entryAddr, tailAndLen, measurement)) {
+                    return;
+                }
+            }
+            UNSAFE.putLong(indexAddr + idx, create(keyStartAddr, tailAndLen, hash, measurement, '1')); // no match: store a new entry here
+        }
+
+        private void add2(long keyStartAddr, long keyLong, long tailAndLen, int hash, int measurement) { // add for keys whose ';' was in the second word
+            int idx = hash & INDEX_MASK; // byte offset of an 8-byte slot inside the index
+            for (long entryAddr; (entryAddr = UNSAFE.getLong(indexAddr + idx)) != 0; idx = (idx + 8) & INDEX_MASK) { // 0 == empty slot
+                if (update2(entryAddr, keyLong, tailAndLen, measurement)) {
+                    return;
+                }
+            }
+            UNSAFE.putLong(indexAddr + idx, create(keyStartAddr, tailAndLen, hash, measurement, '2')); // no match: store a new entry here
+        }
+
+        private void addN(long keyStartAddr, long tailAndLen, int hash, int measurement) { // add for keys spanning more than two words
+            int idx = hash & INDEX_MASK; // byte offset of an 8-byte slot inside the index
+            for (long entryAddr; (entryAddr = UNSAFE.getLong(indexAddr + idx)) != 0; idx = (idx + 8) & INDEX_MASK) { // 0 == empty slot
+                if (updateN(entryAddr, keyStartAddr, tailAndLen, measurement)) {
+                    return;
+                }
+            }
+            UNSAFE.putLong(indexAddr + idx, create(keyStartAddr, tailAndLen, hash, measurement, 'N')); // no match: store a new entry here
+        }
+
+        private long create(long keyStartAddr, long tailAndLen, int hash, int measurement, char _origin) {
+            // assert (memUsed + MAX_STATION_SIZE < memAddr + MAX_STATION_SIZE * MAX_STATIONS) : "Too many stations";
+            // Appends a new entry at memUsed and returns its address; `_origin` is not used in the body.
+            final long entryAddr = memUsed;
+            // Entry layout: 8-byte header (tailAndLen), keySize bytes copied from the input, then the stats fields.
+            int keySize = (int) (tailAndLen & 0xF8); // low header byte with its 3 lowest bits cleared: a multiple of 8
+            long fieldsAddr = entryAddr + HEADER_SIZE + keySize;
+            memUsed += HEADER_SIZE + keySize + FIELDS_SIZE;
+            count++;
+
+            UNSAFE.putLong(entryAddr, tailAndLen);
+            UNSAFE.copyMemory(keyStartAddr, entryAddr + HEADER_SIZE, keySize);
+            UNSAFE.putLong(fieldsAddr + FLD_COUNT, 1);
+            UNSAFE.putLong(fieldsAddr + FLD_SUM, measurement);
+            UNSAFE.putInt(fieldsAddr + FLD_MIN, measurement);
+            UNSAFE.putInt(fieldsAddr + FLD_MAX, measurement);
+            UNSAFE.putInt(fieldsAddr + FLD_HASH, hash); // kept so merge() can locate the index slot without rehashing
+
+            return entryAddr;
+        }
+
+        private static boolean update1(long entryAddr, long tailAndLen, int measurement) {
+            if (UNSAFE.getLong(entryAddr) != tailAndLen) {
+                return false; // header differs: not this entry
+            }
+            // Header matches; on this path the stats fields are taken to follow the header directly.
+            updateStats(entryAddr + HEADER_SIZE, measurement);
+            return true;
+        }
+
+        private static boolean update2(long entryAddr, long keyLong, long tailAndLen, int measurement) {
+            if (UNSAFE.getLong(entryAddr) != tailAndLen) {
+                return false; // header differs: not this entry
+            }
+            if (UNSAFE.getLong(entryAddr + 8) != keyLong) {
+                return false; // first stored key word differs
+            }
+            // Header and first key word match; the stats fields follow that single key word.
+            updateStats(entryAddr + HEADER_SIZE + 8, measurement);
+            return true;
+        }
+
+        private static boolean updateN(long entryAddr, long keyStartAddr, long tailAndLen, int measurement) {
+            if (UNSAFE.getLong(entryAddr) != tailAndLen) {
+                return false; // header differs: not this entry
+            }
+            long memPos = entryAddr + HEADER_SIZE;
+            long memEnd = memPos + ((int) (tailAndLen & 0xF8)); // end of the stored key words
+            long bufPos = keyStartAddr;
+            while (memPos != memEnd) { // compare stored key words against the input, 8 bytes at a time
+                if (UNSAFE.getLong(memPos) != UNSAFE.getLong(bufPos)) {
+                    return false;
+                }
+                memPos += 8;
+                bufPos += 8;
+            }
+            // memPos == memEnd here, which is where the stats fields start.
+            updateStats(memPos, measurement);
+            return true;
+        }
+
+        private static void updateStats(long addr, int measurement) { // folds one measurement into the stats fields at addr
+            long oldCount = UNSAFE.getLong(addr + FLD_COUNT);
+            long oldSum = UNSAFE.getLong(addr + FLD_SUM);
+            long oldMin = UNSAFE.getInt(addr + FLD_MIN);
+            long oldMax = UNSAFE.getInt(addr + FLD_MAX);
+            // Count and sum are always rewritten; min and max only when the measurement beats them.
+            UNSAFE.putLong(addr + FLD_COUNT, oldCount + 1);
+            UNSAFE.putLong(addr + FLD_SUM, oldSum + measurement);
+            if (measurement < oldMin) {
+                UNSAFE.putInt(addr + FLD_MIN, measurement);
+            }
+            if (measurement > oldMax) {
+                UNSAFE.putInt(addr + FLD_MAX, measurement);
+            }
+        }
+
+        private static void updateStats(long addr, long count, long sum, int min, int max) { // folds pre-aggregated stats into the fields at addr
+            long oldCount = UNSAFE.getLong(addr + FLD_COUNT);
+            long oldSum = UNSAFE.getLong(addr + FLD_SUM);
+            long oldMin = UNSAFE.getInt(addr + FLD_MIN);
+            long oldMax = UNSAFE.getInt(addr + FLD_MAX);
+            // Count and sum are added; min and max are rewritten only when the incoming value beats them.
+            UNSAFE.putLong(addr + FLD_COUNT, oldCount + count);
+            UNSAFE.putLong(addr + FLD_SUM, oldSum + sum);
+            if (min < oldMin) {
+                UNSAFE.putInt(addr + FLD_MIN, min);
+            }
+            if (max > oldMax) {
+                UNSAFE.putInt(addr + FLD_MAX, max);
+            }
+        }
+
+        public Aggregator merge(Aggregator other) { // folds every entry of `other` into this aggregator and returns this
+            var otherMemPos = other.memAddr;
+            var otherMemEnd = other.memUsed;
+            merge: for (long entrySize; otherMemPos < otherMemEnd; otherMemPos += entrySize) { // walk other's entries in storage order
+                int keySize = (int) (UNSAFE.getLong(otherMemPos) & 0xF8);
+                long otherKeyEnd = otherMemPos + HEADER_SIZE + keySize; // also the start of other's stats fields
+                entrySize = HEADER_SIZE + keySize + FIELDS_SIZE;
+                int hash = UNSAFE.getInt(otherKeyEnd + FLD_HASH); // hash stored when the entry was created
+                int idx = hash & INDEX_MASK;
+                search: for (long entryAddr; (entryAddr = UNSAFE.getLong(indexAddr + idx)) != 0; idx = (idx + 8) & INDEX_MASK) {
+                    var thisPos = entryAddr;
+                    var otherPos = otherMemPos;
+                    while (otherPos < otherKeyEnd) { // compare header and key words, 8 bytes at a time
+                        if (UNSAFE.getLong(thisPos) != UNSAFE.getLong(otherPos)) {
+                            continue search;
+                        }
+                        thisPos += 8;
+                        otherPos += 8;
+                    }
+                    updateStats(
+                            thisPos,
+                            UNSAFE.getLong(otherPos + FLD_COUNT),
+                            UNSAFE.getLong(otherPos + FLD_SUM),
+                            UNSAFE.getInt(otherPos + FLD_MIN),
+                            UNSAFE.getInt(otherPos + FLD_MAX));
+                    continue merge;
+                }
+                // No matching entry in this index: copy other's entry verbatim and register it in the empty slot.
+                // create
+                // assert (memUsed + MAX_STATION_SIZE < memAddr + MAX_STATION_SIZE * MAX_STATIONS) : "Too many stations (merge)";
+                long entryAddr = memUsed;
+                memUsed += entrySize;
+                count++;
+                UNSAFE.copyMemory(otherMemPos, entryAddr, entrySize);
+                UNSAFE.putLong(indexAddr + idx, entryAddr);
+            }
+            return this;
+        }
+
+        @Override
+        public String toString() { // renders all entries, sorted by key, as "{key=min/mean/max, ...}"
+            if (count == 0) {
+                return "{}";
+            }
+            var entries = new Entry[count];
+            int i = 0;
+            for (long pos = memAddr; pos < memUsed; pos += (int) (UNSAFE.getLong(pos) & 0xF8) + HEADER_SIZE + FIELDS_SIZE) { // step by each entry's size
+                entries[i++] = new Entry(pos);
+            }
+            Arrays.sort(entries); // Entry compares by its decoded key string
+            var sb = new StringBuilder(count * 50);
+            sb.append('{');
+            entries[0].appendTo(sb);
+            for (int j = 1; j < entries.length; ++j) {
+                sb.append(", ");
+                entries[j].appendTo(sb);
+            }
+            sb.append('}');
+            return sb.toString();
+        }
+
+        static class Chunk { // immutable holder: the mapped segment, one chunk's [start, end) addresses and the file's bounds
+            final MemorySegment file;
+            final long chunkStartAddr;
+            final long chunkEndAddr;
+            final long fileStartAddr;
+            final long fileEndAddr;
+            // Plain field-by-field initialization; no validation is performed.
+            Chunk(MemorySegment file, long chunkStartAddr, long chunkEndAddr, long fileStartAddr, long fileEndAddr) {
+                this.file = file;
+                this.chunkStartAddr = chunkStartAddr;
+                this.chunkEndAddr = chunkEndAddr;
+                this.fileStartAddr = fileStartAddr;
+                this.fileEndAddr = fileEndAddr;
+            }
+        }
+
+        static class Entry implements Comparable<Entry> { // read-only view of one stored entry, ordered by its key string
+            private final long entryAddr;
+            private final int keySize;
+            private final String key;
+            // Decodes the key once: the stored key words followed by the 7 header bytes after the first one.
+            Entry(long entryAddr) {
+                this.entryAddr = entryAddr;
+                this.keySize = (int) UNSAFE.getLong(entryAddr) & 0xF8;
+                try (var arena = Arena.ofConfined()) {
+                    var ms = arena.allocate(keySize + 8);
+                    UNSAFE.copyMemory(entryAddr + HEADER_SIZE, ms.address(), keySize);
+                    UNSAFE.copyMemory(entryAddr + 1, ms.address() + keySize, 7); // NOTE(review): assumes header byte 0 is the length (little-endian) - confirm
+                    this.key = ms.getUtf8String(0); // presumably stops at the first zero byte of the buffer - TODO confirm
+                }
+            }
+
+            @Override
+            public int compareTo(Entry other) {
+                return key.compareTo(other.key);
+            }
+
+            @Override
+            public String toString() { // "min/mean/max" without the key
+                long pos = entryAddr + HEADER_SIZE + keySize;
+                return round(UNSAFE.getInt(pos + FLD_MIN))
+                        + "/" + round(((double) UNSAFE.getLong(pos + FLD_SUM)) / UNSAFE.getLong(pos + FLD_COUNT))
+                        + "/" + round(UNSAFE.getInt(pos + FLD_MAX));
+            }
+
+            void appendTo(StringBuilder sb) { // appends "key=min/mean/max"
+                long pos = entryAddr + HEADER_SIZE + keySize;
+                sb.append(key);
+                sb.append('=');
+                sb.append(round(UNSAFE.getInt(pos + FLD_MIN)));
+                sb.append('/');
+                sb.append(round(((double) UNSAFE.getLong(pos + FLD_SUM)) / UNSAFE.getLong(pos + FLD_COUNT)));
+                sb.append('/');
+                sb.append(round(UNSAFE.getInt(pos + FLD_MAX)));
+            }
+
+            private static double round(double value) {
+                return Math.round(value) / 10.0; // rounds to the nearest integer, then divides by ten
+            }
+        }
+    }
+}

From d1cdb8587ccf89ee5df1e3b6e1e34e33f353dce7 Mon Sep 17 00:00:00 2001
From: Guruprasad Sridharan <guruprasad.sridharan@gmail.com>
Date: Thu, 1 Feb 2024 02:52:39 +0530
Subject: [PATCH 240/268] 1brc submission by godofwharf (#658)

* 1brc submission by godofwharf

* Fix prepare script

* Modify shebang

* Fix formatting

* Remove unused FastHashMap implementation
---
 calculate_average_godofwharf.sh               |  19 +
 prepare_godofwharf.sh                         |  19 +
 .../onebrc/CalculateAverage_godofwharf.java   | 588 ++++++++++++++++++
 3 files changed, 626 insertions(+)
 create mode 100755 calculate_average_godofwharf.sh
 create mode 100755 prepare_godofwharf.sh
 create mode 100644 src/main/java/dev/morling/onebrc/CalculateAverage_godofwharf.java

diff --git a/calculate_average_godofwharf.sh b/calculate_average_godofwharf.sh
new file mode 100755
index 000000000..b8df7a052
--- /dev/null
+++ b/calculate_average_godofwharf.sh
@@ -0,0 +1,19 @@
+#!/bin/sh
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+JAVA_OPTS="--enable-preview --add-modules jdk.incubator.vector -DpageSize=262144 -XX:+UseParallelGC -Xms2600m -XX:ParallelGCThreads=8 -XX:Tier4CompileThreshold=1000 -XX:Tier3CompileThreshold=250 -Dthreads=9 -Djava.util.concurrent.ForkJoinPool.common.parallelism=9"
+java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_godofwharf 2>/dev/null # stderr is discarded
\ No newline at end of file
diff --git a/prepare_godofwharf.sh b/prepare_godofwharf.sh
new file mode 100755
index 000000000..907c86d8d
--- /dev/null
+++ b/prepare_godofwharf.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+source "$HOME/.sdkman/bin/sdkman-init.sh"
+sdk use java 21.0.2-tem 1>&2
\ No newline at end of file
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_godofwharf.java b/src/main/java/dev/morling/onebrc/CalculateAverage_godofwharf.java
new file mode 100644
index 000000000..3d3e0a75b
--- /dev/null
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_godofwharf.java
@@ -0,0 +1,588 @@
+package dev.morling.onebrc;
+
+/*
+ *  Copyright 2023 The original authors
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+import jdk.incubator.vector.ByteVector;
+import jdk.incubator.vector.Vector;
+import jdk.incubator.vector.VectorSpecies;
+
+import java.io.IOException;
+import java.io.RandomAccessFile;
+import java.lang.foreign.Arena;
+import java.lang.foreign.MemorySegment;
+import java.lang.foreign.ValueLayout;
+import java.lang.management.ManagementFactory;
+import java.nio.ByteBuffer;
+import java.nio.channels.FileChannel;
+import java.util.*;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+import java.util.function.BiConsumer;
+import java.util.stream.IntStream;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+public class CalculateAverage_godofwharf {
+    private static final String FILE = "./measurements.txt";
+    private static final boolean DEBUG = Boolean.parseBoolean(System.getProperty("debug", "false"));
+    private static final int NCPU = Runtime.getRuntime().availableProcessors();
+
+    private static final VectorSpecies<Byte> PREFERRED_SPECIES = VectorSpecies.ofPreferred(byte.class);
+
+    private static final Vector<Byte> NEW_LINE_VEC = PREFERRED_SPECIES.broadcast('\n');
+    // This array is used for quick conversion of fractional part
+    private static final double[] DOUBLES = new double[]{ 0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9 };
+    // This array is used for quick conversion from ASCII to digit
+    private static final int[] DIGIT_LOOKUP = new int[]{
+            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+            -1, -1, -1, -1, -1, -1, -1, -1, 0, 1,
+            2, 3, 4, 5, 6, 7, 8, 9, -1, -1 };
+    private static final int MAX_STR_LEN = 108;
+    private static final int DEFAULT_HASH_TBL_SIZE = 4096;
+    private static final int DEFAULT_PAGE_SIZE = 8_388_608; // 8 MB
+    private static final int PAGE_SIZE = Integer.parseInt(System.getProperty("pageSize", STR."\{DEFAULT_PAGE_SIZE}"));
+
+    public static void main(String[] args) throws Exception {
+        long startTimeMs = System.currentTimeMillis();
+        Map<String, MeasurementAggregator> measurements = compute();
+        long time1 = System.nanoTime();
+        System.out.println(measurements);
+        printDebugMessage("Print took %d ns%n", (System.nanoTime() - time1));
+        printDebugMessage("Took %d ms%n", System.currentTimeMillis() - startTimeMs);
+        printDebugMessage("Time spent on GC=%d ms%n", ManagementFactory.getGarbageCollectorMXBeans().get(0).getCollectionTime());
+        System.exit(0);
+    }
+
+    private static Map<String, MeasurementAggregator> compute() throws Exception {
+        int nThreads = Integer.parseInt(
+                System.getProperty("threads", STR."\{NCPU}"));
+        printDebugMessage("Running program with %d threads %n", nThreads);
+        Job job = new Job(nThreads - 1);
+        job.compute(FILE);
+        return job.sort();
+    }
+
+    public static class Job {
+        private final int nThreads;
+        private final State[] threadLocalStates;
+        private final Map<String, MeasurementAggregator> globalMap = new ConcurrentHashMap<>(DEFAULT_HASH_TBL_SIZE);
+        private final ExecutorService executorService;
+
+        public Job(final int nThreads) {
+            this.threadLocalStates = new State[(nThreads << 4)];
+            IntStream.range(0, nThreads << 4)
+                    .forEach(i -> threadLocalStates[i] = new State());
+            this.nThreads = nThreads;
+            this.executorService = Executors.newFixedThreadPool(nThreads);
+        }
+
+        public void compute(final String path) throws Exception {
+            // Maps the whole file into native memory, cuts it into newline-aligned splits and aggregates each split on the thread pool
+            try (RandomAccessFile file = new RandomAccessFile(path, "r")) {
+                // Create a memory segment for the entire file
+                MemorySegment globalSegment = file.getChannel().map(
+                        FileChannel.MapMode.READ_ONLY, 0, file.length(), Arena.global());
+                long fileLength = file.length();
+                // Ensure that the split length never exceeds Integer.MAX_VALUE. This is because ByteBuffers cannot
+                // be larger than 2 GiB.
+                int splitLength = (int) Math.min(Integer.MAX_VALUE, Math.max(PAGE_SIZE, Math.rint(fileLength * 1.0 / nThreads)));
+                printDebugMessage("fileLength = %d, splitLength = %d%n", file.length(), splitLength);
+                long time1 = System.nanoTime();
+                // Break the file into multiple splits. One task would process one split.
+                // This routine makes sure that the splits are uniformly sized to the best extent possible.
+                // Each split would either end with a '\n' character or EOF
+                List<Split> splits = breakFileIntoSplits(file, splitLength, PAGE_SIZE, globalSegment, false);
+                printDebugMessage("Number of splits = %d, splits = [%s]%n", splits.size(), splits);
+                printDebugMessage("Splits calculation took %d ns%n", System.nanoTime() - time1);
+                // consume splits in parallel on this job's fixed thread pool
+                long time = System.nanoTime();
+                List<Future<?>> futures = new ArrayList<>(splits.size() * 2);
+                splits
+                        .forEach(split -> {
+                            // process splits concurrently using a thread pool
+                            futures.add(executorService.submit(() -> {
+                                MemorySegment splitSegment = globalSegment.asSlice(split.offset, split.length);
+                                splitSegment.load();
+                                State localState = new State(); // one private state per task, so a pool thread that runs several splits never merges the same aggregates twice
+                                byte[] currentPage = new byte[PAGE_SIZE + MAX_STR_LEN];
+                                // iterate over each page in split
+                                for (Page page : split.pages) {
+                                    // the page is copied on-heap; it should end with '\n' or EOF
+                                    MemorySegment segment = globalSegment.asSlice(page.offset, page.length);
+                                    MemorySegment.copy(segment, ValueLayout.JAVA_BYTE, 0L, currentPage, 0, (int) page.length);
+                                    SearchResult searchResult = findNewLinesVectorized(currentPage, (int) page.length);
+                                    int prevOffset = 0;
+                                    int j = 0;
+                                    // iterate over search results (one newline offset per record)
+                                    while (j < searchResult.len) {
+                                        int curOffset = searchResult.offsets[j];
+                                        byte ch1 = currentPage[curOffset - 4]; // ';' here means the temperature is 3 chars long
+                                        byte ch2 = currentPage[curOffset - 5]; // ';' here means the temperature is 4 chars long
+                                        int temperatureLen = 5;
+                                        if (ch1 == ';') {
+                                            temperatureLen = 3;
+                                        }
+                                        else if (ch2 == ';') {
+                                            temperatureLen = 4;
+                                        }
+                                        int lineLength = curOffset - prevOffset;
+                                        int stationLen = lineLength - temperatureLen - 1;
+                                        byte[] station = new byte[stationLen];
+                                        System.arraycopy(currentPage, prevOffset, station, 0, stationLen);
+                                        int hashcode = Arrays.hashCode(station);
+                                        double temperature = NumberUtils.parseDouble2(currentPage, prevOffset + stationLen + 1, temperatureLen);
+                                        Measurement m = new Measurement(station, temperature, hashcode);
+                                        localState.update(m);
+                                        prevOffset = curOffset + 1;
+                                        j++;
+                                    }
+                                    // Explicitly commented out because unload seems to take a lot of time
+                                    // segment.unload();
+                                }
+                                mergeInternal(localState); // publish this task's aggregates into the shared map exactly once
+                            }));
+                        });
+                for (Future<?> future : futures) {
+                    future.get();
+                }
+                printDebugMessage("Aggregate took %d ns%n", (System.nanoTime() - time));
+            }
+        }
+
+        private void mergeInternal(final State state) {
+            state.state.forEach((k, v) -> {
+                globalMap.compute(k.toString(), (ignored, agg) -> {
+                    if (agg == null) {
+                        agg = v;
+                    }
+                    else {
+                        agg.merge(v);
+                    }
+                    return agg;
+                });
+            });
+        }
+
+        public Map<String, MeasurementAggregator> sort() {
+            long time = System.nanoTime();
+            Map<String, MeasurementAggregator> sortedMap = new TreeMap<>(globalMap);
+            printDebugMessage("Tree map construction took %d ns%n", (System.nanoTime() - time));
+            return sortedMap;
+        }
+
+        private static LineMetadata findNextOccurrenceOfNewLine(final ByteBuffer buffer,
+                                                                final int capacity,
+                                                                final int offset) {
+            int maxLen = capacity - offset; // bytes available from offset to the end of the buffer
+            byte[] src = new byte[Math.min(MAX_STR_LEN, maxLen)]; // scan window: at most MAX_STR_LEN bytes
+            byte[] station = new byte[src.length];
+            byte[] temperature = new byte[5];
+            buffer.position(offset);
+            buffer.get(src);
+            int i = 0; // index into the scan window
+            int j = 0; // number of station bytes collected
+            int k = 0; // number of temperature bytes collected
+            boolean isAscii = true;
+            boolean afterDelim = false;
+            int hashCode = 0;
+            for (; i < src.length; i++) {
+                byte b = src[i];
+                if (b < 0) { // a negative byte is outside the 7-bit ASCII range
+                    isAscii = false;
+                }
+                if (!afterDelim && b != '\n') {
+                    if (b == ';') {
+                        afterDelim = true;
+                    }
+                    else {
+                        hashCode = hashCode * 31 + b;
+                        station[j++] = b;
+                    }
+                }
+                else if (b != '\n') {
+                    temperature[k++] = b;
+                }
+                else {
+                    return new LineMetadata( // newline found: the reported offset points just past it
+                            station, temperature, j, k, offset + i + 1, hashCode, isAscii);
+                }
+            }
+            if (i == 0 && j == 0 && k == 0) { // nothing was scanned at all
+                hashCode = -1;
+            }
+            return new LineMetadata( // no newline in the window: the reported offset is the end of the window
+                    station, temperature, j, k, offset + i, hashCode, isAscii);
+        }
+
+        private static SearchResult findNewLinesVectorized(final byte[] page, // collects the index of every '\n' in page[0, pageLen)
+                                                           final int pageLen) {
+            SearchResult ret = new SearchResult(new int[pageLen / 5], 0); // NOTE(review): sizing assumes at most one newline per 5 bytes plus slack for the surplus writes below — confirm for tiny pages
+            VectorSpecies<Byte> species = PREFERRED_SPECIES;
+            int loopBound = pageLen - species.length() * 4; // the vector loop needs four full vectors ahead of j
+            int i = 0; // number of newline offsets stored so far
+            int j = 0; // current read position in page
+            while (j < loopBound) {
+                Vector<Byte> v1 = ByteVector.fromArray(species, page, j);
+                Vector<Byte> v2 = ByteVector.fromArray(species, page, j + species.length());
+                Vector<Byte> v3 = ByteVector.fromArray(species, page, j + species.length() * 2);
+                Vector<Byte> v4 = ByteVector.fromArray(species, page, j + species.length() * 3);
+                long l1 = NEW_LINE_VEC.eq(v1).toLong(); // bit n is set when lane n holds '\n'
+                long l2 = NEW_LINE_VEC.eq(v2).toLong();
+                long l3 = NEW_LINE_VEC.eq(v3).toLong();
+                long l4 = NEW_LINE_VEC.eq(v4).toLong();
+                long r1 = l1 & 0xFFFFFFFFL | (l2 << species.length()); // NOTE(review): packs two lane masks into one long; looks correct only for species.length() <= 32 (a shift by 64 is a shift by 0) — confirm target hardware
+                long r2 = l3 & 0xFFFFFFFFL | (l4 << (species.length()));
+                int b1 = Long.bitCount(r1); // newlines found in v1 + v2
+                int b2 = Long.bitCount(r2); // newlines found in v3 + v4
+                int k = i; // write cursor; i itself only advances by the true count
+                int it = b1;
+                while (it > 0) { // unrolled: every pass writes six slots even if fewer bits remain
+                    int idx = Long.numberOfTrailingZeros(r1);
+                    ret.offsets[k++] = j + idx;
+                    r1 &= (r1 - 1); // clear the lowest set bit
+                    it--;
+                    idx = Long.numberOfTrailingZeros(r1); // once r1 is 0 this yields 64, so surplus slots hold j + 64
+                    ret.offsets[k++] = j + idx;
+                    r1 &= (r1 - 1);
+                    it--;
+                    idx = Long.numberOfTrailingZeros(r1);
+                    ret.offsets[k++] = j + idx;
+                    r1 &= (r1 - 1);
+                    it--;
+                    idx = Long.numberOfTrailingZeros(r1);
+                    ret.offsets[k++] = j + idx;
+                    r1 &= (r1 - 1);
+                    it--;
+                    idx = Long.numberOfTrailingZeros(r1);
+                    ret.offsets[k++] = j + idx;
+                    r1 &= (r1 - 1);
+                    it--;
+                    idx = Long.numberOfTrailingZeros(r1);
+                    ret.offsets[k++] = j + idx;
+                    r1 &= (r1 - 1);
+                    it--;
+                }
+                i += b1; // surplus slots sit at index >= i, so later writes overwrite them or ret.len excludes them
+                j += species.length() * 2; // advance past v1 and v2
+                k = i;
+                it = b2;
+                while (it > 0) { // same unrolled extraction for the second pair of vectors
+                    int idx = Long.numberOfTrailingZeros(r2);
+                    ret.offsets[k++] = j + idx;
+                    r2 &= (r2 - 1);
+                    it--;
+                    idx = Long.numberOfTrailingZeros(r2);
+                    ret.offsets[k++] = j + idx;
+                    r2 &= (r2 - 1);
+                    it--;
+                    idx = Long.numberOfTrailingZeros(r2);
+                    ret.offsets[k++] = j + idx;
+                    r2 &= (r2 - 1);
+                    it--;
+                    idx = Long.numberOfTrailingZeros(r2);
+                    ret.offsets[k++] = j + idx;
+                    r2 &= (r2 - 1);
+                    it--;
+                    idx = Long.numberOfTrailingZeros(r2);
+                    ret.offsets[k++] = j + idx;
+                    r2 &= (r2 - 1);
+                    it--;
+                    idx = Long.numberOfTrailingZeros(r2);
+                    ret.offsets[k++] = j + idx;
+                    r2 &= (r2 - 1);
+                    it--;
+                }
+                i += b2;
+                j += species.length() * 2; // advance past v3 and v4
+            }
+
+            // tail loop: scalar scan of the bytes the vector loop did not cover
+            while (j < pageLen) {
+                byte b = page[j];
+                if (b == '\n') {
+                    ret.offsets[i++] = j;
+                }
+                j++;
+            }
+            ret.len = i; // only offsets[0, len) are valid results
+            return ret;
+        }
+
+        private static List<Split> breakFileIntoSplits(final RandomAccessFile file,
+                                                       final int splitLength,
+                                                       final int pageLength,
+                                                       final MemorySegment memorySegment,
+                                                       final boolean enableChecks)
+                throws IOException {
+            final List<Split> splits = new ArrayList<>();
+            // Try to break the file into multiple splits while ensuring that each split has at least splitLength bytes
+            // and ends with '\n' or EOF
+            for (long i = 0; i < file.length();) {
+                long splitStartOffset = i;
+                long splitEndOffset = Math.min(file.length(), splitStartOffset + splitLength); // not inclusive
+                if (splitEndOffset == file.length()) { // reached EOF
+                    List<Page> pages = breakSplitIntoPages(splitStartOffset, splitEndOffset, pageLength, memorySegment, enableChecks);
+                    splits.add(new Split(splitStartOffset, splitEndOffset - splitStartOffset, pages));
+                    break;
+                }
+                // Look past the end offset to find next '\n' or EOF; the window is clamped to the bytes left after splitEndOffset
+                long segmentLength = Math.min(MAX_STR_LEN, file.length() - splitEndOffset);
+                // Create a new memory segment for reading contents beyond splitEndOffset
+                MemorySegment lookahead = memorySegment.asSlice(splitEndOffset, segmentLength);
+                ByteBuffer bb = lookahead.asByteBuffer();
+                // Find the next offset which has either '\n' or EOF
+                LineMetadata lineMetadata = findNextOccurrenceOfNewLine(bb, (int) segmentLength, 0);
+                splitEndOffset += lineMetadata.offset;
+                if (enableChecks &&
+                        memorySegment.asSlice(splitEndOffset - 1, 1).asByteBuffer().get(0) != '\n') {
+                    throw new IllegalStateException("Page doesn't end with NL char");
+                }
+                // Break the split further into multiple pages based on pageLength
+                List<Page> pages = breakSplitIntoPages(splitStartOffset, splitEndOffset, pageLength, memorySegment, enableChecks);
+                splits.add(new Split(splitStartOffset, splitEndOffset - splitStartOffset, pages));
+                i = splitEndOffset;
+                lookahead.unload();
+            }
+            return splits;
+        }
+
+        private static List<Page> breakSplitIntoPages(final long splitStartOffset,
+                                                      final long splitEndOffset,
+                                                      final int pageLength,
+                                                      final MemorySegment memorySegment,
+                                                      final boolean enableChecks) {
+            List<Page> pages = new ArrayList<>();
+            for (long i = splitStartOffset; i < splitEndOffset;) {
+                long pageStartOffset = i;
+                long pageEndOffset = Math.min(splitEndOffset, pageStartOffset + pageLength); // not inclusive
+                if (pageEndOffset == splitEndOffset) { // last page of the split
+                    pages.add(new Page(pageStartOffset, pageEndOffset - pageStartOffset));
+                    break;
+                }
+                // Look past the end offset to find next '\n'; the window is clamped to the bytes left before the end of split
+                long lookaheadLength = Math.min(MAX_STR_LEN, splitEndOffset - pageEndOffset);
+                MemorySegment lookahead = memorySegment.asSlice(pageEndOffset, lookaheadLength);
+                ByteBuffer bb = lookahead.asByteBuffer();
+                // Find next offset which has either '\n' or the end of split
+                LineMetadata lineMetadata = findNextOccurrenceOfNewLine(bb, (int) lookaheadLength, 0);
+                pageEndOffset += lineMetadata.offset;
+                if (enableChecks &&
+                        memorySegment.asSlice(pageEndOffset - 1, 1).asByteBuffer().get(0) != '\n') {
+                    throw new IllegalStateException("Page doesn't end with NL char");
+                }
+                pages.add(new Page(pageStartOffset, pageEndOffset - pageStartOffset));
+                i = pageEndOffset;
+                lookahead.unload();
+            }
+            return pages;
+        }
+    }
+
+    public static class State {
+        private final Map<AggregationKey, MeasurementAggregator> state;
+
+        public State() {
+            this.state = new HashMap<>(DEFAULT_HASH_TBL_SIZE);
+            // insert a DUMMY key to prime the hashmap for usage
+            AggregationKey dummy = new AggregationKey("DUMMY".getBytes(UTF_8), -1);
+            this.state.put(dummy, null);
+            this.state.remove(dummy);
+        }
+
+        public void update(final Measurement m) {
+            MeasurementAggregator agg = state.get(m.aggregationKey);
+            if (agg == null) {
+                state.put(m.aggregationKey, new MeasurementAggregator(m.temperature, m.temperature, m.temperature, 1L));
+                return;
+            }
+            agg.count++;
+            agg.min = m.temperature <= agg.min ? m.temperature : agg.min;
+            agg.max = m.temperature >= agg.max ? m.temperature : agg.max;
+            agg.sum += m.temperature;
+        }
+
+        public static class AggregationKey {
+            private final byte[] station;
+            private final int hashCode;
+
+            public AggregationKey(final byte[] station,
+                                  final int hashCode) {
+                this.station = station;
+                this.hashCode = hashCode;
+            }
+
+            @Override
+            public String toString() {
+                return new String(station, UTF_8);
+            }
+
+            @Override
+            public int hashCode() {
+                return hashCode;
+            }
+
+            @Override
+            public boolean equals(Object other) {
+                if (!(other instanceof AggregationKey)) {
+                    return false;
+                }
+                AggregationKey sk = (AggregationKey) other;
+                return station.length == sk.station.length && Arrays.mismatch(station, sk.station) < 0;
+            }
+        }
+    }
+
+    public static class MeasurementAggregator {
+        private double min;
+        private double max;
+        private double sum;
+        private long count;
+
+        public MeasurementAggregator(final double min,
+                                     final double max,
+                                     final double sum,
+                                     final long count) {
+            this.min = min;
+            this.max = max;
+            this.sum = sum;
+            this.count = count;
+        }
+
+        public String toString() {
+            double min1 = round(min);
+            double max1 = round(max);
+            double mean = round(round(sum) / count);
+            return min1 + "/" + mean + "/" + max1;
+        }
+
+        private double round(double value) {
+            return Math.round(value * 10.0) / 10.0;
+        }
+
+        private void merge(final MeasurementAggregator m2) {
+            count += m2.count;
+            min = Math.min(min, m2.min);
+            max = Math.max(max, m2.max);
+            sum += m2.sum;
+        }
+    }
+
+    public static class NumberUtils {
+        public static int toDigit(final char c) {
+            return DIGIT_LOOKUP[c];
+        }
+
+        public static int fastMul10(final int i) {
+            return (i << 1) + (i << 3);
+        }
+
+        public static double parseDouble2(final byte[] b, // parses the len-byte temperature field starting at b[offset]
+                                          final int offset,
+                                          final int len) {
+            try {
+                char ch0 = (char) b[offset];
+                char ch1 = (char) b[offset + 1];
+                char ch2 = (char) b[offset + 2];
+                char ch3 = len > 3 ? (char) b[offset + 3] : ' ';
+                char ch4 = len > 4 ? (char) b[offset + 4] : ' ';
+                if (len == 3) {
+                    // 1.2
+                    int decimal = toDigit(ch0);
+                    double fractional = DOUBLES[toDigit(ch2)];
+                    return decimal + fractional;
+                }
+                else if (len == 4) {
+                    // -1.2 or 11.2
+                    int decimal = (ch0 == '-' ? toDigit(ch1) : (fastMul10(toDigit(ch0)) + toDigit(ch1)));
+                    double fractional = DOUBLES[toDigit(ch3)];
+                    if (ch0 == '-') {
+                        return Math.negateExact(decimal) - fractional;
+                    }
+                    else {
+                        return decimal + fractional;
+                    }
+                }
+                else { // any other length is treated as the 5-char form -11.2
+                    int decimal = fastMul10(toDigit(ch1)) + toDigit(ch2);
+                    double fractional = DOUBLES[toDigit(ch4)];
+                    return Math.negateExact(decimal) - fractional;
+                }
+            }
+            catch (ArrayIndexOutOfBoundsException e) { // a lookup table was indexed with a non-digit, or the field ran past b
+                printDebugMessage("Array index out of bounds for string: %s%n", new String(b, offset, len, UTF_8));
+                throw new RuntimeException(e);
+            }
+            catch (StringIndexOutOfBoundsException e) {
+                printDebugMessage("String index out of bounds for string: %s%n", new String(b, offset, len, UTF_8));
+                throw new RuntimeException(e);
+            }
+        }
+    }
+
+    // record classes
+    record Measurement(byte[] station,
+                       double temperature,
+                       int hash,
+                       State.AggregationKey aggregationKey) {
+
+    public Measurement(byte[] station,
+                       double temperature,
+                       int hashCode) {
+            this(station,
+                    temperature,
+                    hashCode,
+                    new State.AggregationKey(station, hashCode));
+        }
+
+    }
+
+    record LineMetadata(byte[] station,
+                        byte[] temperature,
+                        int stationLen,
+                        int temperatureLen,
+                        int offset,
+                        int precomputedHashCode, boolean isAscii) {
+    }
+
+    record Split(long offset, long length, List<Page> pages) {
+    }
+
+    record Page(long offset, long length) {
+    }
+
+    public static class SearchResult {
+        private int[] offsets;
+        private int len;
+
+        public SearchResult(final int[] offsets,
+                            final int len) {
+            this.offsets = offsets;
+            this.len = len;
+        }
+    }
+
+    private static void printDebugMessage(final String message,
+                                          final Object... args) {
+        if (DEBUG) {
+            System.err.printf(message, args);
+        }
+    }
+}

From 722773378a431b8d257532fdcbfe4363063ef226 Mon Sep 17 00:00:00 2001
From: Gunnar Morling <gunnar.morling@googlemail.com>
Date: Wed, 31 Jan 2024 22:41:52 +0100
Subject: [PATCH 241/268] Leader board update

---
 README.md | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index 61a791790..afd721ec8 100644
--- a/README.md
+++ b/README.md
@@ -48,26 +48,28 @@ These are the results from running all entries into the challenge on eight cores
 | # | Result (m:s.ms) | Implementation     | JDK | Submitter     | Notes     |
 |---|-----------------|--------------------|-----|---------------|-----------|
 | 1 | 00:01.645 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jerrinot.java)| 21.0.2-graal | [Jaromir Hamala](https://github.com/jerrinot) | GraalVM native binary, uses Unsafe |
-| 2 | 00:01.832 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_thomaswue.java)| 21.0.2-graal | [Thomas Wuerthinger](https://github.com/thomaswue), [Quan Anh Mai](https://github.com/merykitty), [Alfonso² Peterssen](https://github.com/mukel) | GraalVM native binary, uses Unsafe |
+| 2* | 00:01.832 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_thomaswue.java)| 21.0.2-graal | [Thomas Wuerthinger](https://github.com/thomaswue), [Quan Anh Mai](https://github.com/merykitty), [Alfonso² Peterssen](https://github.com/mukel) | GraalVM native binary, uses Unsafe |
+| 2* | 00:01.851 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java)| 21.0.2-graal | [Artsiom Korzun](https://github.com/artsiomkorzun) | GraalVM native binary, uses Unsafe |
 | 3 | 00:01.880 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_serkan_ozal.java)| 21.0.1-open | [Serkan ÖZAL](https://github.com/serkan-ozal) | uses Unsafe |
-|   | 00:01.926 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java)| 21.0.2-graal | [Artsiom Korzun](https://github.com/artsiomkorzun) | GraalVM native binary, uses Unsafe |
-|   | 00:01.970 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java)| 21.0.2-graal | [Van Phu DO](https://github.com/abeobk) | GraalVM native binary, uses Unsafe |
+|   | 00:01.921 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java)| 21.0.2-graal | [Van Phu DO](https://github.com/abeobk) | GraalVM native binary, uses Unsafe |
+|   | 00:02.018 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_stephenvonworley.java)| 21.0.2-graal | [Stephen Von Worley](https://github.com/stephenvonworley) | GraalVM native binary, uses Unsafe |
 |   | 00:02.157 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_royvanrijn.java)| 21.0.2-graal | [Roy van Rijn](https://github.com/royvanrijn) | GraalVM native binary, uses Unsafe |
 |   | 00:02.205 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_tivrfoa.java)| 21.0.2-graal | [tivrfoa](https://github.com/tivrfoa) | GraalVM native binary, uses Unsafe |
 |   | 00:02.319 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_yavuztas.java)| 21.0.2-graal | [Yavuz Tas](https://github.com/yavuztas) | GraalVM native binary, uses Unsafe |
 |   | 00:02.332 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_mtopolnik.java)| 21.0.2-graal | [Marko Topolnik](https://github.com/mtopolnik) | GraalVM native binary, uses Unsafe |
 |   | 00:02.367 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_merykittyunsafe.java)| 21.0.1-open | [Quan Anh Mai](https://github.com/merykitty) | uses Unsafe |
+|   | 00:02.507 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_gonixunsafe.java)| 21.0.1-open | [gonix](https://github.com/gonix) | uses Unsafe |
 |   | 00:02.984 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_yourwass.java)| 21.0.1-open | [yourwass](https://github.com/yourwass) | uses Unsafe |
 |   | 00:03.013 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_linl33.java)| 22.ea.31-open | [Li Lin](https://github.com/linl33) | uses Unsafe |
+|   | 00:03.095 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_JamalMulla.java)| 21.0.2-graal | [Jamal Mulla](https://github.com/JamalMulla) | GraalVM native binary, uses Unsafe |
 |   | 00:03.210 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_merykitty.java)| 21.0.1-open | [Quan Anh Mai](https://github.com/merykitty) |  |
 |   | 00:03.298 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_vemanaNonIdiomatic.java)| 21.0.1-graal | [Subrahmanyam (non-idiomatic)](https://github.com/vemana) | uses Unsafe |
 |   | 00:03.431 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_roman_r_m.java)| 21.0.1-graal | [Roman Musin](https://github.com/roman-r-m) | GraalVM native binary, uses Unsafe |
-|   | 00:03.518 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_JamalMulla.java)| 21.0.1-graal | [Jamal Mulla](https://github.com/JamalMulla) | GraalVM native binary, uses Unsafe |
+|   | 00:03.469 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ebarlas.java)| 21.0.2-graal | [Elliot Barlas](https://github.com/ebarlas) | GraalVM native binary, uses Unsafe |
 |   | 00:03.698 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_hundredwatt.java)| 21.0.1-graal | [Jason Nochlin](https://github.com/hundredwatt) |  |
 |   | 00:03.785 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_zerninv.java)| 21.0.2-graal | [zerninv](https://github.com/zerninv) | GraalVM native binary, uses Unsafe |
 |   | 00:03.820 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_iziamos.java)| 21.0.2-graal | [John Ziamos](https://github.com/iziamos) | GraalVM native binary, uses Unsafe |
 |   | 00:03.824 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_gonix.java)| 21.0.1-open | [gonix](https://github.com/gonix) |  |
-|   | 00:03.854 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ebarlas.java)| 21.0.1-graal | [Elliot Barlas](https://github.com/ebarlas) | uses Unsafe |
 |   | 00:03.902 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jparera.java)| 21.0.1-open | [Juan Parera](https://github.com/jparera) |  |
 |   | 00:03.966 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jincongho.java)| 21.0.1-open | [Jin Cong Ho](https://github.com/jincongho) | uses Unsafe |
 |   | 00:04.066 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_JesseVanRooy.java)| 21.0.1-open | [JesseVanRooy](https://github.com/JesseVanRooy) | uses Unsafe |
@@ -76,10 +78,10 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:04.474 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_gamlerhart.java)| 21.0.1-open | [Roman Stoffel](https://github.com/gamlerhart) |  |
 |   | 00:04.676 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_plevart.java)| 21.0.2-tem | [Peter Levart](https://github.com/plevart) |  |
 |   | 00:04.684 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_gigiblender.java)| 21.0.1-open | [Florin Blanaru](https://github.com/gigiblender) | uses Unsafe |
+|   | 00:04.701 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ianopolousfast.java)| 21.0.1-open | [Dr Ian Preston](https://github.com/ianopolousfast) |  |
 |   | 00:04.741 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_cliffclick.java)| 21.0.1-open | [Cliff Click](https://github.com/cliffclick) | uses Unsafe |
 |   | 00:04.800 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_parkertimmins.java)| 21.0.1-open | [Parker Timmins](https://github.com/parkertimmins) |  |
 |   | 00:04.884 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_shipilev.java)| 21.0.1-open | [Aleksey Shipilëv](https://github.com/shipilev) |  |
-|   | 00:04.886 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ianopolousfast.java)| 21.0.1-open | [Dr Ian Preston](https://github.com/ianopolousfast) |  |
 |   | 00:04.920 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_vemana.java)| 21.0.1-graal | [Subrahmanyam](https://github.com/vemana) |  |
 |   | 00:05.077 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jonathanaotearoa.java)| 21.0.2-graal | [Jonathan Wright](https://github.com/jonathan-aotearoa) | GraalVM native binary, uses Unsafe |
 |   | 00:05.142 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_arjenw.java)| 21.0.1-open | [Arjen Wisse](https://github.com/arjenw) |  |
@@ -126,10 +128,12 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:11.156 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_YannMoisan.java)| java | [Yann Moisan](https://github.com/YannMoisan) |  |
 |   | 00:11.167 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_palmr.java)| 21.0.1-open | [Nick Palmer](https://github.com/palmr) |  |
 |   | 00:11.352 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_karthikeyan97.java)| 21.0.1-open | [karthikeyan97](https://github.com/karthikeyan97) | uses Unsafe |
+|   | 00:11.363 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_godofwharf.java)| 21.0.2-tem | [Guruprasad Sridharan](https://github.com/godofwharf) |  |
 |   | 00:11.405 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_imrafaelmerino.java)| 21.0.1-graal | [Rafael Merino García](https://github.com/imrafaelmerino) |  |
 |   | 00:11.406 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_gabrielfoo.java)| 21.0.1-graal | [gabrielfoo](https://github.com/gabrielfoo) |  |
 |   | 00:11.433 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jatingala.java)| 21.0.1-graal | [Jatin Gala](https://github.com/jatingala) |  |
 |   | 00:11.505 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_bufistov.java)| 21.0.1-open | [Dmitry Bufistov](https://github.com/dmitry-midokura) | uses Unsafe |
+|   | 00:11.744 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_slovdahl.java)| 21.0.2-tem | [Sebastian Lövdahl](https://github.com/slovdahl) |  |
 |   | 00:11.805 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_coolmineman.java)| 21.0.1-graal | [Cool_Mineman](https://github.com/coolmineman) |  |
 |   | 00:11.934 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_arjenvaneerde.java)| 21.0.1-open | [arjenvaneerde](https://github.com/arjenvaneerde) |  |
 |   | 00:12.220 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_richardstartin.java)| 21.0.1-open | [Richard Startin](https://github.com/richardstartin) |  |
@@ -138,6 +142,7 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:12.800 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_yonatang.java)| java | [Yonatan Graber](https://github.com/yonatang) |  |
 |   | 00:13.013 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_thanhtrinity.java)| 21.0.1-graal | [Thanh Duong](https://github.com/thanhtrinity) |  |
 |   | 00:13.071 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ianopolous.java)| 21.0.1-open | [Dr Ian Preston](https://github.com/ianopolous) |  |
+|   | 00:13.729 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_cb0s.java)| java | [Cedric Boes](https://github.com/cb0s) |  |
 |   | 00:13.817 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_entangled90.java)| 21.0.1-open | [Carlo](https://github.com/entangled90) |  |
 |   | 00:14.502 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_eriklumme.java)| 21.0.1-graal | [eriklumme](https://github.com/eriklumme) |  |
 |   | 00:14.772 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_kevinmcmurtrie.java)| 21.0.1-open | [Kevin McMurtrie](https://github.com/kevinmcmurtrie) |  |
@@ -149,6 +154,7 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:16.457 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_bytesfellow.java)| 21.0.1-open | [Aleksei](https://github.com/bytesfellow) |  |
 |   | 00:16.953 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_gauravdeshmukh.java)| 21.0.1-open | [Gaurav Anantrao Deshmukh](https://github.com/gauravdeshmukh) |  |
 |   | 00:17.046 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_dkarampi.java)| 21.0.1-open | [Dimitris Karampinas](https://github.com/dkarampi) |  |
+|   | 00:17.086 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_breejesh.java)| java | [Breejesh Rathod](https://github.com/breejesh) |  |
 |   | 00:17.490 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_kgeri.java)| 21.0.1-open | [Gergely Kiss](https://github.com/kgeri) |  |
 |   | 00:17.255 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_tkosachev.java)| 21.0.1-open | [tkosachev](https://github.com/tkosachev) |  |
 |   | 00:17.520 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_faridtmammadov.java)| 21.0.1-open | [Farid](https://github.com/faridtmammadov) |  |
@@ -168,6 +174,7 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:22.471 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_0xshivamagarwal.java)| 21.0.1-open | [Shivam Agarwal](https://github.com/0xshivamagarwal) |  |
 |   | 00:22.687 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_PanagiotisDrakatos.java)| 21.0.1-graal | [Panagiotis Drakatos](https://github.com/PanagiotisDrakatos) | GraalVM native binary |
 |   | 00:24.986 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_kumarsaurav123.java)| 21.0.1-open | [kumarsaurav123](https://github.com/kumarsaurav123) |  |
+|   | 00:25.064 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_sudhirtumati.java)| 21.0.2-open | [Sudhir Tumati](https://github.com/sudhirtumati) |  |
 |   | 00:26.500 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_felix19350.java)| 21.0.1-open | [Bruno Félix](https://github.com/felix19350) |  |
 |   | 00:28.381 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_bjhara.java)| 21.0.1-open | [Hampus](https://github.com/bjhara) |  |
 |   | 00:29.741 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_xpmatteo.java)| 21.0.1-open | [Matteo Vaccari](https://github.com/xpmatteo) |  |

From 7dcf7071ee435bcf5b9321ff7fdaa4f92f0ff681 Mon Sep 17 00:00:00 2001
From: Gunnar Morling <gunnar.morling@googlemail.com>
Date: Wed, 31 Jan 2024 22:53:37 +0100
Subject: [PATCH 242/268] Update pull_request_template.md

---
 .github/pull_request_template.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md
index 9b55c8f63..2035158dd 100644
--- a/.github/pull_request_template.md
+++ b/.github/pull_request_template.md
@@ -1,3 +1,7 @@
+**NOTE:** This challenge closes for submissions on Jan 31 23:59:59 UTC. No new pull requests for adding submissions will be accepted after that time.
+Any pending pull requests will be reviewed over the next few days, as described [here](https://github.com/gunnarmorling/1brc/discussions/687).
+The final leader board will be published by Feb 5.
+
 #### Check List:
 
 - [ ] You have run `./mvnw verify` and the project builds successfully

From 4debc7c5dd1b00f0dbc1822425cda727b250cad8 Mon Sep 17 00:00:00 2001
From: Gunnar Morling <gunnar.morling@googlemail.com>
Date: Thu, 1 Feb 2024 10:45:19 +0100
Subject: [PATCH 243/268] 10 runs for evaluating top entries

---
 evaluate.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/evaluate.sh b/evaluate.sh
index c0be8b50b..5f89b6cf8 100755
--- a/evaluate.sh
+++ b/evaluate.sh
@@ -34,7 +34,7 @@ BOLD_YELLOW='\033[1;33m'
 RESET='\033[0m' # No Color
 
 MEASUREMENTS_FILE="measurements_1B.txt"
-RUNS=5
+RUNS=10
 DEFAULT_JAVA_VERSION="21.0.1-open"
 : "${BUILD_JAVA_VERSION:=21.0.1-open}"
 RUN_TIME_LIMIT=300 # seconds

From 241d42ca6609b6bc32b403b1f4ee4d1fe6e325f8 Mon Sep 17 00:00:00 2001
From: Thomas Wuerthinger <thomas.wuerthinger@oracle.com>
Date: Thu, 1 Feb 2024 10:57:05 +0100
Subject: [PATCH 244/268] One last improvement for thomaswue (#702)

* Combine <8 and 8-16 cases into one case.

* Adopt mask-based approach for the <16 length city fast path (idea of Van Phu Do).

* Slightly improved code layout.

* Update perf number.
---
 .../onebrc/CalculateAverage_thomaswue.java    | 127 +++++++++---------
 1 file changed, 66 insertions(+), 61 deletions(-)

diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_thomaswue.java b/src/main/java/dev/morling/onebrc/CalculateAverage_thomaswue.java
index dc4df0cc9..8e311fa89 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_thomaswue.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_thomaswue.java
@@ -27,11 +27,14 @@
  * split into 3 parts and cursors for each of those parts are processing the segment simultaneously in the same thread.
  * Results are accumulated into {@link Result} objects and a tree map is used to sequentially accumulate the results in
  * the end.
- * Runs in 0.39s on an Intel i9-13900K.
+ * Runs in 0.31s on an Intel i9-13900K while the reference implementation takes 120.37s.
  * Credit:
  *  Quan Anh Mai for branchless number parsing code
  *  Alfonso² Peterssen for suggesting memory mapping with unsafe and the subprocess idea
  *  Artsiom Korzun for showing the benefits of work stealing at 2MB segments instead of equal split between workers
+ *  Jaromir Hamala for showing that avoiding the branch misprediction between <8 and 8-16 cases is a big win even if
+ *  more work is performed
+ *  Van Phu DO for demonstrating the lookup tables based on masks instead of bit shifting
  */
 public class CalculateAverage_thomaswue {
     private static final String FILE = "./measurements.txt";
@@ -141,9 +144,15 @@ private static void parseLoop(AtomicLong counter, long fileEnd, long fileStart,
                 long delimiterMask1 = findDelimiter(word1);
                 long delimiterMask2 = findDelimiter(word2);
                 long delimiterMask3 = findDelimiter(word3);
-                Result existingResult1 = findResult(word1, delimiterMask1, scanner1, results, collectedResults);
-                Result existingResult2 = findResult(word2, delimiterMask2, scanner2, results, collectedResults);
-                Result existingResult3 = findResult(word3, delimiterMask3, scanner3, results, collectedResults);
+                long word1b = scanner1.getLongAt(scanner1.pos() + 8);
+                long word2b = scanner2.getLongAt(scanner2.pos() + 8);
+                long word3b = scanner3.getLongAt(scanner3.pos() + 8);
+                long delimiterMask1b = findDelimiter(word1b);
+                long delimiterMask2b = findDelimiter(word2b);
+                long delimiterMask3b = findDelimiter(word3b);
+                Result existingResult1 = findResult(word1, delimiterMask1, word1b, delimiterMask1b, scanner1, results, collectedResults);
+                Result existingResult2 = findResult(word2, delimiterMask2, word2b, delimiterMask2b, scanner2, results, collectedResults);
+                Result existingResult3 = findResult(word3, delimiterMask3, word3b, delimiterMask3b, scanner3, results, collectedResults);
                 long number1 = scanNumber(scanner1);
                 long number2 = scanNumber(scanner2);
                 long number3 = scanNumber(scanner3);
@@ -155,76 +164,70 @@ private static void parseLoop(AtomicLong counter, long fileEnd, long fileStart,
             while (scanner1.hasNext()) {
                 long word = scanner1.getLong();
                 long pos = findDelimiter(word);
-                record(findResult(word, pos, scanner1, results, collectedResults), scanNumber(scanner1));
+                long wordB = scanner1.getLongAt(scanner1.pos() + 8);
+                long posB = findDelimiter(wordB);
+                record(findResult(word, pos, wordB, posB, scanner1, results, collectedResults), scanNumber(scanner1));
             }
             while (scanner2.hasNext()) {
                 long word = scanner2.getLong();
                 long pos = findDelimiter(word);
-                record(findResult(word, pos, scanner2, results, collectedResults), scanNumber(scanner2));
+                long wordB = scanner2.getLongAt(scanner2.pos() + 8);
+                long posB = findDelimiter(wordB);
+                record(findResult(word, pos, wordB, posB, scanner2, results, collectedResults), scanNumber(scanner2));
             }
             while (scanner3.hasNext()) {
                 long word = scanner3.getLong();
                 long pos = findDelimiter(word);
-                record(findResult(word, pos, scanner3, results, collectedResults), scanNumber(scanner3));
+                long wordB = scanner3.getLongAt(scanner3.pos() + 8);
+                long posB = findDelimiter(wordB);
+                record(findResult(word, pos, wordB, posB, scanner3, results, collectedResults), scanNumber(scanner3));
             }
         }
     }
 
-    private static Result findResult(long initialWord, long initialDelimiterMask, Scanner scanner, Result[] results, List<Result> collectedResults) {
+    private static final long[] MASK1 = new long[]{ 0xFFL, 0xFFFFL, 0xFFFFFFL, 0xFFFFFFFFL, 0xFFFFFFFFFFL, 0xFFFFFFFFFFFFL, 0xFFFFFFFFFFFFFFL, 0xFFFFFFFFFFFFFFFFL,
+            0xFFFFFFFFFFFFFFFFL };
+    private static final long[] MASK2 = new long[]{ 0x00L, 0x00L, 0x00L, 0x00L, 0x00L, 0x00L, 0x00L, 0x00L, 0xFFFFFFFFFFFFFFFFL };
+
+    private static Result findResult(long initialWord, long initialDelimiterMask, long wordB, long delimiterMaskB, Scanner scanner, Result[] results,
+                                     List<Result> collectedResults) {
         Result existingResult;
         long word = initialWord;
         long delimiterMask = initialDelimiterMask;
         long hash;
         long nameAddress = scanner.pos();
-
-        // Search for ';', one long at a time. There are two common cases that a specially treated:
-        // (b) the ';' is found in the first 16 bytes
-        if (delimiterMask != 0) {
-            // Special case for when the ';' is found in the first 8 bytes.
-            int trailingZeros = Long.numberOfTrailingZeros(delimiterMask);
-            word = (word << (63 - trailingZeros));
-            scanner.add(trailingZeros >>> 3);
-            hash = word;
+        long word2 = wordB;
+        long delimiterMask2 = delimiterMaskB;
+        if ((delimiterMask | delimiterMask2) != 0) {
+            int letterCount1 = Long.numberOfTrailingZeros(delimiterMask) >>> 3; // value between 1 and 8
+            int letterCount2 = Long.numberOfTrailingZeros(delimiterMask2) >>> 3; // value between 0 and 8
+            long mask = MASK2[letterCount1];
+            word = word & MASK1[letterCount1];
+            word2 = mask & word2 & MASK1[letterCount2];
+            hash = word ^ word2;
             existingResult = results[hashToIndex(hash, results)];
-            if (existingResult != null && existingResult.lastNameLong == word) {
+            scanner.add(letterCount1 + (letterCount2 & mask));
+            if (existingResult != null && existingResult.firstNameWord == word && existingResult.secondNameWord == word2) {
                 return existingResult;
             }
         }
         else {
-            // Special case for when the ';' is found in bytes 9-16.
-            hash = word;
-            long prevWord = word;
-            scanner.add(8);
-            word = scanner.getLong();
-            delimiterMask = findDelimiter(word);
-            if (delimiterMask != 0) {
-                int trailingZeros = Long.numberOfTrailingZeros(delimiterMask);
-                word = (word << (63 - trailingZeros));
-                scanner.add(trailingZeros >>> 3);
-                hash ^= word;
-                existingResult = results[hashToIndex(hash, results)];
-                if (existingResult != null && existingResult.lastNameLong == word && existingResult.secondLastNameLong == prevWord) {
-                    return existingResult;
+            // Slow-path for when the ';' could not be found in the first 16 bytes.
+            hash = word ^ word2;
+            scanner.add(16);
+            while (true) {
+                word = scanner.getLong();
+                delimiterMask = findDelimiter(word);
+                if (delimiterMask != 0) {
+                    int trailingZeros = Long.numberOfTrailingZeros(delimiterMask);
+                    word = (word << (63 - trailingZeros));
+                    scanner.add(trailingZeros >>> 3);
+                    hash ^= word;
+                    break;
                 }
-            }
-            else {
-                // Slow-path for when the ';' could not be found in the first 16 bytes.
-                scanner.add(8);
-                hash ^= word;
-                while (true) {
-                    word = scanner.getLong();
-                    delimiterMask = findDelimiter(word);
-                    if (delimiterMask != 0) {
-                        int trailingZeros = Long.numberOfTrailingZeros(delimiterMask);
-                        word = (word << (63 - trailingZeros));
-                        scanner.add(trailingZeros >>> 3);
-                        hash ^= word;
-                        break;
-                    }
-                    else {
-                        scanner.add(8);
-                        hash ^= word;
-                    }
+                else {
+                    scanner.add(8);
+                    hash ^= word;
                 }
             }
         }
@@ -249,8 +252,8 @@ private static Result findResult(long initialWord, long initialDelimiterMask, Sc
                 }
             }
 
-            int remainingShift = (64 - (nameLength + 1 - i) << 3);
-            if (existingResult.lastNameLong == (scanner.getLongAt(nameAddress + i) << remainingShift)) {
+            int remainingShift = (64 - ((nameLength + 1 - i) << 3));
+            if (((scanner.getLongAt(existingResult.nameAddress + i) ^ (scanner.getLongAt(nameAddress + i))) << remainingShift) == 0) {
                 break;
             }
             else {
@@ -297,7 +300,7 @@ private static void record(Result existingResult, long number) {
     }
 
     private static int hashToIndex(long hash, Result[] results) {
-        long hashAsInt = hash ^ (hash >>> 37) ^ (hash >>> 17);
+        long hashAsInt = hash ^ (hash >>> 33) ^ (hash >>> 15);
         return (int) (hashAsInt & (results.length - 1));
     }
 
@@ -324,21 +327,23 @@ private static long findDelimiter(long word) {
     private static Result newEntry(Result[] results, long nameAddress, int hash, int nameLength, Scanner scanner, List<Result> collectedResults) {
         Result r = new Result();
         results[hash] = r;
-        int i = 0;
-        for (; i < nameLength + 1 - Long.BYTES; i += Long.BYTES) {
+        int totalLength = nameLength + 1;
+        r.firstNameWord = scanner.getLongAt(nameAddress);
+        r.secondNameWord = scanner.getLongAt(nameAddress + 8);
+        if (totalLength <= 8) {
+            r.firstNameWord = r.firstNameWord & MASK1[totalLength - 1];
+            r.secondNameWord = 0;
         }
-        if (nameLength + 1 > 8) {
-            r.secondLastNameLong = scanner.getLongAt(nameAddress + i - 8);
+        else if (totalLength < 16) {
+            r.secondNameWord = r.secondNameWord & MASK1[totalLength - 9];
         }
-        int remainingShift = (64 - (nameLength + 1 - i) << 3);
-        r.lastNameLong = (scanner.getLongAt(nameAddress + i) << remainingShift);
         r.nameAddress = nameAddress;
         collectedResults.add(r);
         return r;
     }
 
     private static final class Result {
-        long lastNameLong, secondLastNameLong;
+        long firstNameWord, secondNameWord;
         short min, max;
         int count;
         long sum;

From 9e2199a5d71624e2fd252c1c0080f35897459b97 Mon Sep 17 00:00:00 2001
From: Jaromir Hamala <jaromir.hamala@gmail.com>
Date: Thu, 1 Feb 2024 11:01:18 +0100
Subject: [PATCH 245/268] nobody should try this at home (#709)

---
 .../onebrc/CalculateAverage_jerrinot.java     | 167 +++++++++++-------
 1 file changed, 104 insertions(+), 63 deletions(-)

diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_jerrinot.java b/src/main/java/dev/morling/onebrc/CalculateAverage_jerrinot.java
index 6997f4896..88173dac5 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_jerrinot.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_jerrinot.java
@@ -344,7 +344,7 @@ private void transferToHeap(long fastMap) {
             }
         }
 
-        private void doOne(long cursor, long end) {
+        private void doOne(long cursor, long end, long fastMap) {
             while (cursor < end) {
                 // it seems that when pulling just from a single chunk
                 // then bit-twiddling is faster than lookup tables
@@ -361,6 +361,7 @@ private void doOne(long cursor, long end) {
 
                 long maskedFirstWord = currentWord & firstWordMask;
                 int hash = hash(maskedFirstWord);
+                int mapIndex = hash & MAP_MASK;
                 while (mask == 0) {
                     cursor += 8;
                     currentWord = UNSAFE.getLong(cursor);
@@ -371,9 +372,16 @@ private void doOne(long cursor, long end) {
                 final long maskedWord = currentWord & ((mask - 1) ^ mask) >>> 8;
 
                 int len = (int) (semicolon - start);
-                long baseEntryPtr = getOrCreateEntryBaseOffsetSlow(len, start, hash, maskedWord);
-                long temperatureWord = UNSAFE.getLong(semicolon + 1);
-                cursor = parseAndStoreTemperature(semicolon + 1, baseEntryPtr, temperatureWord);
+                if (len > 15) {
+                    long baseEntryPtr = getOrCreateEntryBaseOffsetSlow(len, start, hash, maskedWord);
+                    long temperatureWord = UNSAFE.getLong(semicolon + 1);
+                    cursor = parseAndStoreTemperature(semicolon + 1, baseEntryPtr, temperatureWord);
+                }
+                else {
+                    long baseEntryPtr = getOrCreateEntryBaseOffsetFast(mapIndex, len, maskedWord, maskedFirstWord, fastMap);
+                    long temperatureWord = UNSAFE.getLong(semicolon + 1);
+                    cursor = parseAndStoreTemperature(semicolon + 1, baseEntryPtr, temperatureWord);
+                }
             }
         }
 
@@ -415,8 +423,8 @@ public void run() {
                 }
                 setCursors(startingPtr);
                 mainLoop(fastMap);
-                doOne(cursorA, endA);
-                doOne(cursorB, endB);
+                doOne(cursorA, endA, fastMap);
+                doOne(cursorB, endB, fastMap);
             }
             transferToHeap(fastMap);
         }
@@ -454,20 +462,25 @@ private void mainLoop(long fastMap) {
                 long wordMaskA = HASH_MASKS[trailingZerosA];
                 long wordMaskB = HASH_MASKS[trailingZerosB];
 
+                long maskedMaskA = advanceMaskA & 8;
+                long maskedMaskB = advanceMaskB & 8;
+
                 long negAdvanceMaskA = ~advanceMaskA;
                 long negAdvanceMaskB = ~advanceMaskB;
 
-                cursorA += advanceMaskA & 8;
-                cursorB += advanceMaskB & 8;
+                cursorA += maskedMaskA;
+                cursorB += maskedMaskB;
 
                 long nextWordA = (advanceMaskA & candidateWordA) | (negAdvanceMaskA & currentWordA);
                 long nextWordB = (advanceMaskB & candidateWordB) | (negAdvanceMaskB & currentWordB);
 
-                long nextDelimiterMaskA = getDelimiterMask(nextWordA);
-                long nextDelimiterMaskB = getDelimiterMask(nextWordB);
+                delimiterMaskA = getDelimiterMask(nextWordA);
+                delimiterMaskB = getDelimiterMask(nextWordB);
 
-                boolean slowA = nextDelimiterMaskA == 0;
-                boolean slowB = nextDelimiterMaskB == 0;
+                boolean slowA = delimiterMaskA == 0;
+                boolean slowB = delimiterMaskB == 0;
+                trailingZerosA = Long.numberOfTrailingZeros(delimiterMaskA) >> 3;
+                trailingZerosB = Long.numberOfTrailingZeros(delimiterMaskB) >> 3;
                 boolean slowSome = (slowA || slowB);
 
                 long maskedFirstWordA = wordMaskA & currentWordA;
@@ -479,73 +492,101 @@ private void mainLoop(long fastMap) {
                 currentWordA = nextWordA;
                 currentWordB = nextWordB;
 
-                delimiterMaskA = nextDelimiterMaskA;
-                delimiterMaskB = nextDelimiterMaskB;
                 if (slowSome) {
-                    while (delimiterMaskA == 0) {
-                        cursorA += 8;
-                        currentWordA = UNSAFE.getLong(cursorA);
-                        delimiterMaskA = getDelimiterMask(currentWordA);
-                    }
-
-                    while (delimiterMaskB == 0) {
-                        cursorB += 8;
-                        currentWordB = UNSAFE.getLong(cursorB);
-                        delimiterMaskB = getDelimiterMask(currentWordB);
-                    }
+                    doSlow(fastMap, delimiterMaskA, currentWordA, delimiterMaskB, currentWordB, startA, startB, hashA, hashB, slowA, maskedFirstWordA, slowB,
+                            maskedFirstWordB);
                 }
+                else {
+                    final long semicolonA = cursorA + trailingZerosA;
+                    final long semicolonB = cursorB + trailingZerosB;
 
-                trailingZerosA = Long.numberOfTrailingZeros(delimiterMaskA) >> 3;
-                trailingZerosB = Long.numberOfTrailingZeros(delimiterMaskB) >> 3;
+                    long digitStartA = semicolonA + 1;
+                    long digitStartB = semicolonB + 1;
 
-                final long semicolonA = cursorA + trailingZerosA;
-                final long semicolonB = cursorB + trailingZerosB;
+                    long lastWordMaskA = HASH_MASKS[trailingZerosA];
+                    long lastWordMaskB = HASH_MASKS[trailingZerosB];
 
-                long digitStartA = semicolonA + 1;
-                long digitStartB = semicolonB + 1;
+                    long temperatureWordA = UNSAFE.getLong(digitStartA);
+                    long temperatureWordB = UNSAFE.getLong(digitStartB);
 
-                long lastWordMaskA = HASH_MASKS[trailingZerosA];
-                long lastWordMaskB = HASH_MASKS[trailingZerosB];
+                    final long maskedLastWordA = currentWordA & lastWordMaskA;
+                    final long maskedLastWordB = currentWordB & lastWordMaskB;
 
-                long temperatureWordA = UNSAFE.getLong(digitStartA);
-                long temperatureWordB = UNSAFE.getLong(digitStartB);
+                    int lenA = (int) (semicolonA - startA);
+                    int lenB = (int) (semicolonB - startB);
 
-                final long maskedLastWordA = currentWordA & lastWordMaskA;
-                final long maskedLastWordB = currentWordB & lastWordMaskB;
+                    int mapIndexA = hashA & MAP_MASK;
+                    int mapIndexB = hashB & MAP_MASK;
 
-                int lenA = (int) (semicolonA - startA);
-                int lenB = (int) (semicolonB - startB);
+                    long baseEntryPtrA;
+                    long baseEntryPtrB;
 
-                int mapIndexA = hashA & MAP_MASK;
-                int mapIndexB = hashB & MAP_MASK;
+                    baseEntryPtrA = getOrCreateEntryBaseOffsetFast(mapIndexA, lenA, maskedLastWordA, maskedFirstWordA, fastMap);
+                    baseEntryPtrB = getOrCreateEntryBaseOffsetFast(mapIndexB, lenB, maskedLastWordB, maskedFirstWordB, fastMap);
 
-                long baseEntryPtrA;
-                long baseEntryPtrB;
+                    cursorA = parseAndStoreTemperature(digitStartA, baseEntryPtrA, temperatureWordA);
+                    cursorB = parseAndStoreTemperature(digitStartB, baseEntryPtrB, temperatureWordB);
+                }
+            }
+        }
 
-                if (slowSome) {
-                    if (slowA) {
-                        baseEntryPtrA = getOrCreateEntryBaseOffsetSlow(lenA, startA, hashA, maskedLastWordA);
-                    }
-                    else {
-                        baseEntryPtrA = getOrCreateEntryBaseOffsetFast(mapIndexA, lenA, maskedLastWordA, maskedFirstWordA, fastMap);
-                    }
+        private void doSlow(long fastMap, long delimiterMaskA, long currentWordA, long delimiterMaskB, long currentWordB, long startA, long startB, int hashA, int hashB,
+                            boolean slowA, long maskedFirstWordA, boolean slowB, long maskedFirstWordB) {
+            int trailingZerosB;
+            int trailingZerosA;
+            while (delimiterMaskA == 0) {
+                cursorA += 8;
+                currentWordA = UNSAFE.getLong(cursorA);
+                delimiterMaskA = getDelimiterMask(currentWordA);
+            }
 
-                    if (slowB) {
-                        baseEntryPtrB = getOrCreateEntryBaseOffsetSlow(lenB, startB, hashB, maskedLastWordB);
-                    }
-                    else {
-                        baseEntryPtrB = getOrCreateEntryBaseOffsetFast(mapIndexB, lenB, maskedLastWordB, maskedFirstWordB, fastMap);
-                    }
+            while (delimiterMaskB == 0) {
+                cursorB += 8;
+                currentWordB = UNSAFE.getLong(cursorB);
+                delimiterMaskB = getDelimiterMask(currentWordB);
+            }
+            trailingZerosA = Long.numberOfTrailingZeros(delimiterMaskA) >> 3;
+            trailingZerosB = Long.numberOfTrailingZeros(delimiterMaskB) >> 3;
 
-                }
-                else {
-                    baseEntryPtrA = getOrCreateEntryBaseOffsetFast(mapIndexA, lenA, maskedLastWordA, maskedFirstWordA, fastMap);
-                    baseEntryPtrB = getOrCreateEntryBaseOffsetFast(mapIndexB, lenB, maskedLastWordB, maskedFirstWordB, fastMap);
-                }
+            final long semicolonA = cursorA + trailingZerosA;
+            final long semicolonB = cursorB + trailingZerosB;
+
+            long digitStartA = semicolonA + 1;
+            long digitStartB = semicolonB + 1;
+
+            long lastWordMaskA = HASH_MASKS[trailingZerosA];
+            long lastWordMaskB = HASH_MASKS[trailingZerosB];
+
+            long temperatureWordA = UNSAFE.getLong(digitStartA);
+            long temperatureWordB = UNSAFE.getLong(digitStartB);
 
-                cursorA = parseAndStoreTemperature(digitStartA, baseEntryPtrA, temperatureWordA);
-                cursorB = parseAndStoreTemperature(digitStartB, baseEntryPtrB, temperatureWordB);
+            final long maskedLastWordA = currentWordA & lastWordMaskA;
+            final long maskedLastWordB = currentWordB & lastWordMaskB;
+
+            int lenA = (int) (semicolonA - startA);
+            int lenB = (int) (semicolonB - startB);
+
+            int mapIndexA = hashA & MAP_MASK;
+            int mapIndexB = hashB & MAP_MASK;
+
+            long baseEntryPtrA;
+            long baseEntryPtrB;
+
+            if (slowA) {
+                baseEntryPtrA = getOrCreateEntryBaseOffsetSlow(lenA, startA, hashA, maskedLastWordA);
+            }
+            else {
+                baseEntryPtrA = getOrCreateEntryBaseOffsetFast(mapIndexA, lenA, maskedLastWordA, maskedFirstWordA, fastMap);
+            }
+
+            if (slowB) {
+                baseEntryPtrB = getOrCreateEntryBaseOffsetSlow(lenB, startB, hashB, maskedLastWordB);
+            }
+            else {
+                baseEntryPtrB = getOrCreateEntryBaseOffsetFast(mapIndexB, lenB, maskedLastWordB, maskedFirstWordB, fastMap);
             }
+            cursorA = parseAndStoreTemperature(digitStartA, baseEntryPtrA, temperatureWordA);
+            cursorB = parseAndStoreTemperature(digitStartB, baseEntryPtrB, temperatureWordB);
         }
 
         private void setCursors(long current) {

From da26f61137105532e3ac62248f6520fc4232c940 Mon Sep 17 00:00:00 2001
From: Artsiom Korzun <72259616+artsiomkorzun@users.noreply.github.com>
Date: Thu, 1 Feb 2024 11:14:31 +0100
Subject: [PATCH 246/268] handling 16 at once (#704)

---
 .../CalculateAverage_artsiomkorzun.java       | 129 +++++++++---------
 1 file changed, 68 insertions(+), 61 deletions(-)

diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java b/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java
index d899c3d72..cc6e3b95a 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java
@@ -39,6 +39,8 @@ public class CalculateAverage_artsiomkorzun {
     private static final long LINE_PATTERN = 0x0A0A0A0A0A0A0A0AL;
     private static final long DOT_BITS = 0x10101000;
     private static final long MAGIC_MULTIPLIER = (100 * 0x1000000 + 10 * 0x10000 + 1);
+    private static final long[] WORD_MASK = { 0, 0, 0, 0, 0, 0, 0, 0, -1 };
+    private static final int[] LENGTH_MASK = { 0, 0, 0, 0, 0, 0, 0, 0, -1 };
 
     private static final Unsafe UNSAFE;
 
@@ -190,12 +192,6 @@ public Aggregates() {
             UNSAFE.setMemory(pointer, SIZE, (byte) 0);
         }
 
-        public long find(long word, long hash) {
-            long address = pointer + offset(hash);
-            long w = word(address + 24);
-            return (w == word) ? address : 0;
-        }
-
         public long find(long word1, long word2, long hash) {
             long address = pointer + offset(hash);
             long w1 = word(address + 24);
@@ -393,14 +389,20 @@ public void run() {
                     long word1 = word(chunk1.position);
                     long word2 = word(chunk2.position);
                     long word3 = word(chunk3.position);
+                    long word4 = word(chunk1.position + 8);
+                    long word5 = word(chunk2.position + 8);
+                    long word6 = word(chunk3.position + 8);
 
                     long separator1 = separator(word1);
                     long separator2 = separator(word2);
                     long separator3 = separator(word3);
+                    long separator4 = separator(word4);
+                    long separator5 = separator(word5);
+                    long separator6 = separator(word6);
 
-                    long pointer1 = find(aggregates, chunk1, word1, separator1);
-                    long pointer2 = find(aggregates, chunk2, word2, separator2);
-                    long pointer3 = find(aggregates, chunk3, word3, separator3);
+                    long pointer1 = find(aggregates, chunk1, word1, word4, separator1, separator4);
+                    long pointer2 = find(aggregates, chunk2, word2, word5, separator2, separator5);
+                    long pointer3 = find(aggregates, chunk3, word3, word6, separator3, separator6);
 
                     long value1 = value(chunk1);
                     long value2 = value(chunk2);
@@ -413,26 +415,41 @@ public void run() {
 
                 while (chunk1.has()) {
                     long word1 = word(chunk1.position);
+                    long word2 = word(chunk1.position + 8);
+
                     long separator1 = separator(word1);
-                    long pointer1 = find(aggregates, chunk1, word1, separator1);
-                    long value1 = value(chunk1);
-                    Aggregates.update(pointer1, value1);
+                    long separator2 = separator(word2);
+
+                    long pointer = find(aggregates, chunk1, word1, word2, separator1, separator2);
+                    long value = value(chunk1);
+
+                    Aggregates.update(pointer, value);
                 }
 
                 while (chunk2.has()) {
-                    long word2 = word(chunk2.position);
+                    long word1 = word(chunk2.position);
+                    long word2 = word(chunk2.position + 8);
+
+                    long separator1 = separator(word1);
                     long separator2 = separator(word2);
-                    long pointer2 = find(aggregates, chunk2, word2, separator2);
-                    long value2 = value(chunk2);
-                    Aggregates.update(pointer2, value2);
+
+                    long pointer = find(aggregates, chunk2, word1, word2, separator1, separator2);
+                    long value = value(chunk2);
+
+                    Aggregates.update(pointer, value);
                 }
 
                 while (chunk3.has()) {
-                    long word3 = word(chunk3.position);
-                    long separator3 = separator(word3);
-                    long pointer3 = find(aggregates, chunk3, word3, separator3);
-                    long value3 = value(chunk3);
-                    Aggregates.update(pointer3, value3);
+                    long word1 = word(chunk3.position);
+                    long word2 = word(chunk3.position + 8);
+
+                    long separator1 = separator(word1);
+                    long separator2 = separator(word2);
+
+                    long pointer = find(aggregates, chunk3, word1, word2, separator1, separator2);
+                    long value = value(chunk3);
+
+                    Aggregates.update(pointer, value);
                 }
             }
 
@@ -456,60 +473,50 @@ private static long next(long position) {
                     continue;
                 }
 
-                return position + (Long.numberOfTrailingZeros(line) >>> 3) + 1;
+                return position + length(line) + 1;
             }
         }
 
-        private static long find(Aggregates aggregates, Chunk chunk, long word, long separator) {
+        private static long find(Aggregates aggregates, Chunk chunk, long word1, long word2, long separator1, long separator2) {
+            boolean small = (separator1 | separator2) != 0;
             long start = chunk.position;
             long hash;
+            long word;
 
-            if (separator != 0) {
-                word = mask(word, separator);
-                hash = mix(word);
+            if (small) {
+                int length1 = length(separator1);
+                int length2 = length(separator2);
+                word1 = mask(word1, separator1);
+                word2 = mask(word2 & WORD_MASK[length1], separator2);
+                hash = mix(word1 ^ word2);
 
-                chunk.position += length(separator);
-                long pointer = aggregates.find(word, hash);
+                chunk.position += length1 + (length2 & LENGTH_MASK[length1]) + 1;
+                long pointer = aggregates.find(word1, word2, hash);
 
                 if (pointer != 0) {
                     return pointer;
                 }
+
+                word = (separator1 == 0) ? word2 : word1;
             }
             else {
-                long word0 = word;
-                word = word(start + 8);
-                separator = separator(word);
-
-                if (separator != 0) {
-                    word = mask(word, separator);
-                    hash = mix(word ^ word0);
+                chunk.position += 16;
+                hash = word1 ^ word2;
 
-                    chunk.position += length(separator) + 8;
-                    long pointer = aggregates.find(word0, word, hash);
+                while (true) {
+                    word = word(chunk.position);
+                    long separator = separator(word);
 
-                    if (pointer != 0) {
-                        return pointer;
-                    }
-                }
-                else {
-                    chunk.position += 16;
-                    hash = word ^ word0;
-
-                    while (true) {
-                        word = word(chunk.position);
-                        separator = separator(word);
-
-                        if (separator == 0) {
-                            chunk.position += 8;
-                            hash ^= word;
-                            continue;
-                        }
-
-                        word = mask(word, separator);
-                        hash = mix(hash ^ word);
-                        chunk.position += length(separator);
-                        break;
+                    if (separator == 0) {
+                        chunk.position += 8;
+                        hash ^= word;
+                        continue;
                     }
+
+                    word = mask(word, separator);
+                    hash = mix(hash ^ word);
+                    chunk.position += length(separator) + 1;
+                    break;
                 }
             }
 
@@ -535,8 +542,8 @@ private static long mask(long word, long separator) {
             return word & mask;
         }
 
-        private static long length(long separator) {
-            return (Long.numberOfTrailingZeros(separator) >>> 3) + 1;
+        private static int length(long separator) {
+            return Long.numberOfTrailingZeros(separator) >>> 3;
         }
 
         private static long mix(long x) {

From 9a27939f87536abe774ac338d306dd6be3508bfb Mon Sep 17 00:00:00 2001
From: Li Lin <linl33@users.noreply.github.com>
Date: Thu, 1 Feb 2024 18:27:56 +0800
Subject: [PATCH 247/268] Add linl33 v2 (#678)

---
 .github/workflows/maven.yml                   |   7 +-
 prepare_linl33.sh                             |   3 +-
 .../onebrc/CalculateAverage_linl33.java       | 103 +++++++++---------
 3 files changed, 57 insertions(+), 56 deletions(-)

diff --git a/.github/workflows/maven.yml b/.github/workflows/maven.yml
index 2014739f5..859795578 100644
--- a/.github/workflows/maven.yml
+++ b/.github/workflows/maven.yml
@@ -53,8 +53,13 @@ jobs:
         id: sdkman
 
       - name: 'Build project'
+        shell: bash
         run: |
           source "$HOME/.sdkman/bin/sdkman-init.sh"
+          if [ -f ${{ format('src/main/java-22/dev/morling/onebrc/CalculateAverage_{0}.java', github.event.pull_request.user.login || '') }} ]; then
+            sdk install java 22.ea.32-open || true
+            sdk use java 22.ea.32-open
+          fi
           ./mvnw --version
           ./mvnw -B clean verify -Pci
 
@@ -63,5 +68,3 @@ jobs:
         run: |
           ./test_ci.sh ${{ github.event.pull_request.user.login }}
         if: github.event_name == 'pull_request'
-
-
diff --git a/prepare_linl33.sh b/prepare_linl33.sh
index 5fdf640a6..f943c90ef 100755
--- a/prepare_linl33.sh
+++ b/prepare_linl33.sh
@@ -17,8 +17,7 @@
 
 
 source "$HOME/.sdkman/bin/sdkman-init.sh"
-# TODO: bump to ea 32 when available
-sdk use java 22.ea.31-open 1>&2
+sdk use java 22.ea.32-open 1>&2
 
 CLASS_NAME="CalculateAverage_linl33"
 
diff --git a/src/main/java-22/dev/morling/onebrc/CalculateAverage_linl33.java b/src/main/java-22/dev/morling/onebrc/CalculateAverage_linl33.java
index 62d54546e..dc9fd23af 100644
--- a/src/main/java-22/dev/morling/onebrc/CalculateAverage_linl33.java
+++ b/src/main/java-22/dev/morling/onebrc/CalculateAverage_linl33.java
@@ -71,7 +71,7 @@ public static void main() throws InterruptedException, IOException {
             final var inputMapped = channel.map(FileChannel.MapMode.READ_ONLY, 0, channel.size(), Arena.global());
 
             final var chunkBounds = calcChunkBounds(inputMapped.address(), inputMapped.byteSize());
-            final var maps = new SparseMap[N_THREADS];
+            final var maps = new HashTable[N_THREADS];
 
             try (final var threadPool = Executors.newFixedThreadPool(N_THREADS, THREAD_BUILDER.factory());
                     final var singleThreadExecutor = Executors.newSingleThreadExecutor(Thread.ofVirtual().factory())) {
@@ -104,11 +104,12 @@ private static long[] calcChunkBounds(final long mappedAddr, final long fileSize
         return chunkBounds;
     }
 
-    private static void printSorted(final SparseMap temperatureMeasurements) {
+    private static void printSorted(final HashTable temperatureMeasurements) {
         final var weatherStations = new AggregatedMeasurement[(int) temperatureMeasurements.size];
         final var nameBuffer = new byte[WEATHER_STATION_LENGTH_MAX];
-        var offset = temperatureMeasurements.denseAddress;
-        for (int i = 0; i < weatherStations.length; i++, offset += SparseMap.DATA_SCALE * Long.BYTES) {
+
+        for (int i = 0; i < weatherStations.length; i++) {
+            final var offset = temperatureMeasurements.getOffset(i);
             final var nameAddr = UNSAFE.getLong(offset);
             final var nameLength = UNSAFE.getInt(offset + Integer.BYTES * 7);
             MemorySegment.copy(ALL, ValueLayout.JAVA_BYTE, nameAddr, nameBuffer, 0, nameLength);
@@ -129,8 +130,8 @@ private static void printSorted(final SparseMap temperatureMeasurements) {
     }
 
     private static void printAggMeasurement(final AggregatedMeasurement aggMeasurement,
-                                            final SparseMap temperatureMeasurements) {
-        final var offset = temperatureMeasurements.denseAddress + SparseMap.DATA_SCALE * Long.BYTES * aggMeasurement.id();
+                                            final HashTable temperatureMeasurements) {
+        final var offset = temperatureMeasurements.getOffset(aggMeasurement.id());
 
         // name
         System.out.print(aggMeasurement.name());
@@ -162,15 +163,15 @@ private static double round(final double d) {
     private static class CalculateAverageTask implements Runnable {
         public static final int BATCH_SIZE_BYTES = BYTE_SPECIES.vectorByteSize();
 
-        private final SparseMap[] maps;
+        private final HashTable[] maps;
         private final long[] chunkBounds;
         private final long chunkStart;
         private final long chunkEnd;
         private final int t;
 
-        private SparseMap map;
+        private HashTable map;
 
-        public CalculateAverageTask(SparseMap[] maps, long[] chunkBounds, int t) {
+        public CalculateAverageTask(HashTable[] maps, long[] chunkBounds, int t) {
             this.maps = maps;
             this.chunkBounds = chunkBounds;
             this.chunkStart = chunkBounds[t];
@@ -180,7 +181,7 @@ public CalculateAverageTask(SparseMap[] maps, long[] chunkBounds, int t) {
 
         @Override
         public void run() {
-            this.maps[this.t] = new SparseMap();
+            this.maps[this.t] = new HashTable();
             this.map = this.maps[this.t];
 
             var lineStart = this.chunkBounds[0];
@@ -192,8 +193,8 @@ public void run() {
                 }
             }
 
-            final var vectorLimit = this.chunkStart + ((this.chunkEnd - this.chunkStart) & -BYTE_SPECIES.vectorByteSize());
-            for (long i = this.chunkStart; i < vectorLimit; i += BYTE_SPECIES.vectorByteSize()) {
+            final var vectorLimit = this.chunkStart + ((this.chunkEnd - this.chunkStart) & -BATCH_SIZE_BYTES);
+            for (long i = this.chunkStart; i < vectorLimit; i += BATCH_SIZE_BYTES) {
                 var lfMask = ByteVector.fromMemorySegment(BYTE_SPECIES, ALL, i, ByteOrder.nativeOrder())
                         .eq((byte) '\n')
                         .toLong();
@@ -272,31 +273,34 @@ private void processLine(final long lineStart, final long lfAddress) {
     /**
      * Open addressing, linear probing hash map backed by off-heap memory
      */
-    private static class SparseMap {
+    private static class HashTable {
         private static final int TRUNCATED_HASH_BITS = 26;
         // max # of unique keys
         private static final long DENSE_SIZE = WEATHER_STATION_DISTINCT_MAX;
         // max hash code (exclusive)
         private static final long SPARSE_SIZE = 1L << (TRUNCATED_HASH_BITS + 1);
-        private static final long DATA_SCALE = 4;
+        public static final long SPARSE_SCALE = 32;
+        public static final long DENSE_SCALE = 8;
 
         public final long sparseAddress;
         public final long denseAddress;
         public long size;
 
-        public SparseMap() {
+        public HashTable() {
             var arena = new MallocArena(Arena.global());
             var callocArena = new CallocArena(Arena.global());
 
-            this.size = 0L;
-
-            final var sparse = callocArena.allocate(ValueLayout.JAVA_LONG, SPARSE_SIZE);
+            final var sparse = callocArena.allocate(ValueLayout.JAVA_BYTE, SPARSE_SIZE * SPARSE_SCALE);
             this.sparseAddress = (sparse.address() + MallocArena.MAX_ALIGN) & -MallocArena.MAX_ALIGN;
 
-            final var dense = arena.allocate(ValueLayout.JAVA_LONG, DENSE_SIZE * DATA_SCALE);
+            final var dense = arena.allocate(ValueLayout.JAVA_BYTE, DENSE_SIZE * DENSE_SCALE);
             this.denseAddress = (dense.address() + MallocArena.MAX_ALIGN) & -MallocArena.MAX_ALIGN;
         }
 
+        public long getOffset(final long index) {
+            return UNSAFE.getLong(this.denseAddress + index * DENSE_SCALE);
+        }
+
         public void putEntry(final long keyAddress, final int keyLength, final int value) {
             final var hash = hash(keyAddress, keyLength);
             this.putEntryInternal(hash, keyAddress, keyLength, value, 1, value, value);
@@ -309,43 +313,46 @@ private void putEntryInternal(final long hash,
                                       final int count,
                                       final int temperatureMin,
                                       final int temperatureMax) {
-            final var sparseOffset = this.sparseAddress + truncateHash(hash) * Long.BYTES;
+            final var sparseOffset = this.sparseAddress + truncateHash(hash) * SPARSE_SCALE;
+
+            for (long n = 0, sparseLinearOffset = sparseOffset; n < WEATHER_STATION_DISTINCT_MAX; n++, sparseLinearOffset += SPARSE_SCALE) {
+                final var entryKeyAddress = UNSAFE.getLong(sparseLinearOffset);
 
-            for (long n = 0, sparseLinearOffset = sparseOffset; n < WEATHER_STATION_DISTINCT_MAX; n++, sparseLinearOffset += Long.BYTES) {
-                final var denseOffset = UNSAFE.getLong(sparseLinearOffset);
-                if (denseOffset == 0L) {
+                if (entryKeyAddress == 0L) {
                     this.add(sparseLinearOffset, keyAddress, keyLength, temperature, count, temperatureMin, temperatureMax);
                     this.size++;
                     return;
                 }
 
-                if (isCollision(keyAddress, keyLength, denseOffset)) {
+                if (mismatch(keyAddress, entryKeyAddress, keyLength)) {
                     continue;
                 }
 
-                final var currTotal = UNSAFE.getLong(denseOffset + Integer.BYTES * 2);
-                UNSAFE.putLong(denseOffset + Integer.BYTES * 2, currTotal + temperature); // total
+                final var currMin = UNSAFE.getInt(sparseLinearOffset + Integer.BYTES * 5);
+                final var currMax = UNSAFE.getInt(sparseLinearOffset + Integer.BYTES * 6);
+                final var currTotal = UNSAFE.getLong(sparseLinearOffset + Integer.BYTES * 2);
+                final var currCount = UNSAFE.getInt(sparseLinearOffset + Integer.BYTES * 4);
 
-                final var currCount = UNSAFE.getInt(denseOffset + Integer.BYTES * 4);
-                UNSAFE.putInt(denseOffset + Integer.BYTES * 4, currCount + count); // count
+                UNSAFE.putLong(sparseLinearOffset + Integer.BYTES * 2, currTotal + temperature);
+                UNSAFE.putInt(sparseLinearOffset + Integer.BYTES * 4, currCount + count);
 
-                final var currMin = UNSAFE.getInt(denseOffset + Integer.BYTES * 5);
                 if (temperatureMin < currMin) {
-                    UNSAFE.putInt(denseOffset + Integer.BYTES * 5, temperatureMin); // min
+                    UNSAFE.putInt(sparseLinearOffset + Integer.BYTES * 5, temperatureMin);
                 }
 
-                final var currMax = UNSAFE.getInt(denseOffset + Integer.BYTES * 6);
                 if (temperatureMax > currMax) {
-                    UNSAFE.putInt(denseOffset + Integer.BYTES * 6, temperatureMax); // max
+                    UNSAFE.putInt(sparseLinearOffset + Integer.BYTES * 6, temperatureMax);
                 }
 
                 return;
             }
         }
 
-        public void merge(final SparseMap other) {
+        public void merge(final HashTable other) {
             final var otherSize = other.size;
-            for (long i = 0, offset = other.denseAddress; i < otherSize; i++, offset += DATA_SCALE * Long.BYTES) {
+            for (long i = 0; i < otherSize; i++) {
+                final var offset = other.getOffset(i);
+
                 final var keyAddress = UNSAFE.getLong(offset);
                 final var keyLength = UNSAFE.getInt(offset + Integer.BYTES * 7);
                 final var hash = hash(keyAddress, keyLength);
@@ -369,22 +376,15 @@ private void add(final long sparseOffset,
                          final int temperatureMin,
                          final int temperatureMax) {
             // new entry, initialize sparse and dense
-            final var denseOffset = this.denseAddress + this.size * DATA_SCALE * Long.BYTES;
-            UNSAFE.putLong(sparseOffset, denseOffset);
-
-            UNSAFE.putLong(denseOffset, keyAddress);
-            UNSAFE.putLong(denseOffset + Integer.BYTES * 2, temperature);
-            UNSAFE.putInt(denseOffset + Integer.BYTES * 4, count);
-            UNSAFE.putInt(denseOffset + Integer.BYTES * 5, temperatureMin);
-            UNSAFE.putInt(denseOffset + Integer.BYTES * 6, temperatureMax);
-            UNSAFE.putInt(denseOffset + Integer.BYTES * 7, keyLength);
-        }
-
-        private static boolean isCollision(final long keyAddress, final int keyLength, final long denseOffset) {
-            // key length compare is unnecessary
-
-            final var entryKeyAddress = UNSAFE.getLong(denseOffset);
-            return mismatch(keyAddress, entryKeyAddress, keyLength);
+            final var denseOffset = this.denseAddress + this.size * DENSE_SCALE;
+            UNSAFE.putLong(denseOffset, sparseOffset);
+
+            UNSAFE.putLong(sparseOffset, keyAddress);
+            UNSAFE.putLong(sparseOffset + Integer.BYTES * 2, temperature);
+            UNSAFE.putInt(sparseOffset + Integer.BYTES * 4, count);
+            UNSAFE.putInt(sparseOffset + Integer.BYTES * 5, temperatureMin);
+            UNSAFE.putInt(sparseOffset + Integer.BYTES * 6, temperatureMax);
+            UNSAFE.putInt(sparseOffset + Integer.BYTES * 7, keyLength);
         }
 
         private static boolean mismatch(final long leftAddr, final long rightAddr, final int length) {
@@ -404,8 +404,7 @@ private static boolean mismatch(final long leftAddr, final long rightAddr, final
             final var r = ByteVector.fromMemorySegment(BYTE_SPECIES, ALL, rightAddr + loopBound, ByteOrder.nativeOrder());
             final var eqMask = l.eq(r).toLong();
 
-            // LE compare to add 1 to length
-            return Long.numberOfTrailingZeros(~eqMask) <= (length - loopBound);
+            return Long.numberOfTrailingZeros(~eqMask) < ((length + 1) & (BYTE_SPECIES.vectorByteSize() - 1));
             // to support platforms without TZCNT, the check can be replaced with
             // a comparison to lowestZero = ~eqMask & (eqMask + 1)
         }

From e7c92094bd1315115a38b8ddb1cec239d252f9ec Mon Sep 17 00:00:00 2001
From: Sumit Chaudhary <EduardoSaverin@users.noreply.github.com>
Date: Thu, 1 Feb 2024 16:11:13 +0530
Subject: [PATCH 248/268] EduardoSaverin (#689)

* EduardoSaverin

UserName : EduardoSaverin
Total Time : 15.408
CPU : 8 Core (Apple M1 Pro)
RAM : 16GB

* Update CalculateAverage_EduardoSaverin.java

Replaced ConcurrentHashMap with ReentrantLock + HashMap, since multiple threads were causing problems.
---
 calculate_average_EduardoSaverin.sh           |  19 ++
 prepare_EduardoSaverin.sh                     |  20 ++
 .../CalculateAverage_EduardoSaverin.java      | 319 ++++++++++++++++++
 3 files changed, 358 insertions(+)
 create mode 100755 calculate_average_EduardoSaverin.sh
 create mode 100755 prepare_EduardoSaverin.sh
 create mode 100644 src/main/java/dev/morling/onebrc/CalculateAverage_EduardoSaverin.java

diff --git a/calculate_average_EduardoSaverin.sh b/calculate_average_EduardoSaverin.sh
new file mode 100755
index 000000000..d94e7f77d
--- /dev/null
+++ b/calculate_average_EduardoSaverin.sh
@@ -0,0 +1,19 @@
+#!/bin/sh
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+JAVA_OPTS="--enable-preview"
+java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_EduardoSaverin
diff --git a/prepare_EduardoSaverin.sh b/prepare_EduardoSaverin.sh
new file mode 100755
index 000000000..4cda7b411
--- /dev/null
+++ b/prepare_EduardoSaverin.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+# Uncomment below to use sdk
+# source "$HOME/.sdkman/bin/sdkman-init.sh"
+# sdk use java 21.0.1-graal 1>&2
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_EduardoSaverin.java b/src/main/java/dev/morling/onebrc/CalculateAverage_EduardoSaverin.java
new file mode 100644
index 000000000..e33e4cf04
--- /dev/null
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_EduardoSaverin.java
@@ -0,0 +1,319 @@
+/*
+ *  Copyright 2023 The original authors
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package dev.morling.onebrc;
+
+import sun.misc.Unsafe;
+
+import java.io.IOException;
+import java.lang.foreign.Arena;
+import java.lang.reflect.Field;
+import java.nio.MappedByteBuffer;
+import java.nio.channels.FileChannel;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Path;
+import java.util.*;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.locks.Lock;
+import java.util.concurrent.locks.ReentrantLock;
+
+import static java.nio.file.StandardOpenOption.READ;
+
+public class CalculateAverage_EduardoSaverin {
+    private static final Path FILE = Path.of("./measurements.txt");
+    private static final int NO_OF_THREADS = Runtime.getRuntime().availableProcessors();
+    private static final Unsafe UNSAFE = initUnsafe();
+    private static final int FNV_32_OFFSET = 0x811c9dc5;
+    private static final int FNV_32_PRIME = 0x01000193;
+    private static final Map<String, ResultRow> resultRowMap = new HashMap<>();
+    private static final Lock lock = new ReentrantLock();
+
+    private static Unsafe initUnsafe() {
+        try {
+            Field theUnsafe = Unsafe.class.getDeclaredField("theUnsafe"); // reflective lookup of the field holding the Unsafe instance
+            theUnsafe.setAccessible(true); // lift the Java access check before reading the field
+            return (Unsafe) theUnsafe.get(Unsafe.class);
+        }
+        catch (NoSuchFieldException | IllegalAccessException e) {
+            throw new RuntimeException(e); // rethrown unchecked so the static initializer of UNSAFE can call this method
+        }
+    }
+
+    public record Chunk(long start, long length) { // start: raw memory address of the chunk's first byte; length: its size in bytes
+    }
+
+    record MapEntry(String key, ResultRow row) { // station name (decoded as UTF-8 by SimplerHashMap.getAll) paired with its aggregate
+    }
+
+    private static final class ResultRow { // running min/max/sum/count for one station; values are in tenths of a degree
+        private double min; // smallest value folded in so far
+        private double max; // largest value folded in so far
+        private double sum; // total of all values folded in so far
+        private int count; // number of values folded in so far
+
+        private ResultRow(double v) { // starts the aggregate from a single first measurement
+            this.min = v;
+            this.max = v;
+            this.sum = v;
+            this.count = 1;
+        }
+
+        public String toString() { // renders "min/mean/max", each passed through round()
+            return round(min) + "/" + round(sum / count) + "/" + round(max);
+        }
+
+        private double round(double value) { // rounds to a whole number of tenths, then divides by 10.0 to get degrees
+            return Math.round(value) / 10.0;
+        }
+    }
+
+    /**
+     * Splits the file into newline-aligned chunks, one per worker thread (a single
+     * chunk for files of at most 64000 bytes).
+     * <p>
+     * The whole file is mapped once and boundaries are located by reading that mapping
+     * directly, so a line of any length may straddle a boundary. Each tentative chunk end
+     * is advanced to the next newline (0xA); the final chunk always ends at the last byte.
+     *
+     * @param fileChannel channel opened for reading on the measurements file
+     * @return chunks in file order; {@code start} is an absolute address inside the mapping
+     * @throws IOException if the size cannot be read or the file cannot be mapped
+     */
+    static List<Chunk> getChunks(FileChannel fileChannel) throws IOException {
+        final long fileBytes = fileChannel.size();
+        final int numThreads = fileBytes > 64000 ? NO_OF_THREADS : 1;
+        final long chunkSize = fileBytes / numThreads;
+        final List<Chunk> chunks = new ArrayList<>(numThreads);
+        final long mappedAddress = fileChannel.map(FileChannel.MapMode.READ_ONLY, 0, fileBytes, Arena.global()).address();
+        final long lastByte = fileBytes - 1;
+        long chunkStart = 0;
+        while (chunkStart < fileBytes) {
+            // Tentative end one chunkSize ahead, clamped to the last byte of the file.
+            long chunkEnd = Math.min(chunkStart + chunkSize, lastByte);
+            // Walk forward to the newline that closes the current line; never read past the mapping.
+            while (chunkEnd < lastByte && UNSAFE.getByte(mappedAddress + chunkEnd) != 0xA) {
+                chunkEnd++;
+            }
+            chunks.add(new Chunk(mappedAddress + chunkStart, chunkEnd - chunkStart + 1));
+            chunkStart = chunkEnd + 1;
+        }
+        return chunks;
+    }
+
+    /**
+     * Fixed-capacity, open-addressing hash table that maps station-name bytes to the
+     * {@link ResultRow} aggregate of that station. The caller supplies the hash; a key
+     * matches a slot when both the length and every byte agree. Collisions are resolved
+     * by linear probing with wrap-around.
+     * <p>
+     * The table never grows, so it must hold fewer than {@code MAPSIZE} distinct keys;
+     * otherwise probing for a new key cannot terminate. Instances are not thread-safe.
+     */
+    static class SimplerHashMap {
+        // Slot count. Must stay a power of two because indices are reduced with a bit mask.
+        final int MAPSIZE = 65536;
+        // Aggregates, indexed by slot; null marks an empty slot.
+        final ResultRow[] slots = new ResultRow[MAPSIZE];
+        // Exact-length copy of the key held by the slot with the same index.
+        final byte[][] keys = new byte[MAPSIZE][];
+
+        /**
+         * Folds one measurement into the row stored under {@code key}, creating the row
+         * the first time the key is seen.
+         *
+         * @param key    buffer whose first {@code length} bytes are the station name; it may be
+         *               longer than that and is copied on insert, so the caller can reuse it
+         * @param length number of meaningful bytes in {@code key}
+         * @param hash   hash of the name; only its low 16 bits select the home slot
+         * @param temp   measurement to fold into the row's min, max, sum and count
+         */
+        public void putOrMerge(final byte[] key, final short length, final int hash, final int temp) {
+            final int mask = MAPSIZE - 1;
+            // Masking keeps a negative or oversized hash inside the table.
+            int slot = hash & mask;
+            ResultRow slotValue;
+
+            // Linear probing on collision. The index wraps to 0 after the last slot, so a probe
+            // run that reaches the end of the arrays continues at the start instead of throwing
+            // ArrayIndexOutOfBoundsException.
+            while ((slotValue = slots[slot]) != null && (keys[slot].length != length || !unsafeEquals(keys[slot], key, length))) {
+                slot = (slot + 1) & mask;
+            }
+
+            // Existing Key
+            if (slotValue != null) {
+                slotValue.min = Math.min(slotValue.min, temp);
+                slotValue.max = Math.max(slotValue.max, temp);
+                slotValue.sum += temp;
+                slotValue.count++;
+                return;
+            }
+
+            // New Key: claim the empty slot the probe stopped on. The name is kept as an
+            // exact-length private copy because the caller goes on reusing its key buffer.
+            slots[slot] = new ResultRow(temp);
+            keys[slot] = Arrays.copyOf(key, length);
+        }
+
+        /**
+         * Tells whether the first {@code length} bytes of both arrays are identical. The name
+         * is historical and kept for existing callers; the comparison does not use Unsafe.
+         * <p>
+         * Deliberately delegates to {@link Arrays#equals(byte[], int, int, byte[], int, int)}
+         * rather than comparing wider words through Unsafe. Reading words as {@code double}
+         * makes identical NaN bit patterns compare unequal and +0.0/-0.0 compare equal, and
+         * hand-written tail loops can leave trailing bytes unchecked; either mistake splits
+         * one station into several rows or merges different stations into one.
+         *
+         * @param a      first array, at least {@code length} bytes long
+         * @param b      second array, at least {@code length} bytes long
+         * @param length number of leading bytes to compare
+         * @return {@code true} if both prefixes hold exactly the same bytes
+         */
+        static boolean unsafeEquals(final byte[] a, final byte[] b, final short length) {
+            return Arrays.equals(a, 0, length, b, 0, length);
+        }
+
+        /**
+         * Collects every occupied slot as a {@link MapEntry}, decoding each key as UTF-8.
+         * Entries are produced in slot order, which is unrelated to the order of the names.
+         *
+         * @return a new list with one entry per stored key
+         */
+        public List<MapEntry> getAll() {
+            final List<MapEntry> result = new ArrayList<>(slots.length);
+            for (int i = 0; i < slots.length; i++) {
+                ResultRow slotValue = slots[i];
+                if (slotValue != null) {
+                    result.add(new MapEntry(new String(keys[i], StandardCharsets.UTF_8), slotValue));
+                }
+            }
+            return result;
+        }
+    }
+
+    private static class Task implements Runnable {
+        // Parses one chunk into a task-private table, then merges that table into resultRowMap.
+        private final SimplerHashMap results;
+        private final Chunk chunk;
+
+        public Task(Chunk chunk) {
+            this.results = new SimplerHashMap();
+            this.chunk = chunk;
+        }
+
+        @Override
+        public void run() {
+            // Scratch buffer for the current station name; sized for names of up to 100 bytes
+            final byte[] nameBytes = new byte[100];
+            short nameIndex = 0;
+            int ot; // temperature of the current line in tenths of a degree
+            int hash = FNV_32_OFFSET;
+
+            long i = chunk.start;
+            final long cl = chunk.start + chunk.length; // exclusive end address of the chunk
+            while (i < cl) {
+                byte c;
+                // Copy and hash the name bytes up to the ';' separator (0x3B)
+                while ((c = UNSAFE.getByte(i++)) != 0x3B) {
+                    nameBytes[nameIndex++] = c;
+                    // FNV-1a hash : https://en.wikipedia.org/wiki/Fowler–Noll–Vo_hash_function
+                    hash ^= c;
+                    hash *= FNV_32_PRIME;
+                }
+
+                // First byte after the semicolon: either the sign or the leading digit
+                c = UNSAFE.getByte(i++);
+                // 0x2D is Minus(-)
+                // Subtracting 48 (ASCII '0') below converts a digit byte to its numeric value
+                if (c == 0x2D) {
+                    // i is on the first digit: a newline (0xA) 3 bytes ahead means X.X, otherwise XX.X
+                    if (UNSAFE.getByte(i + 3) == 0xA) {
+                        ot = (UNSAFE.getByte(i++) - 48) * 10;
+                    }
+                    else {
+                        ot = (UNSAFE.getByte(i++) - 48) * 100;
+                        ot += (UNSAFE.getByte(i++) - 48) * 10;
+                    }
+                    // i is now on the decimal point
+                    i++; // Skipping Dot
+                    ot += (UNSAFE.getByte(i++) - 48);
+                    // Negate, since a leading '-' was seen
+                    ot = -ot;
+                }
+                else {
+                    // c is the first digit and i is just past it: a newline 2 bytes ahead means X.X, otherwise XX.X
+                    if (UNSAFE.getByte(i + 2) == 0xA) {
+                        ot = (c - 48) * 10;
+                    }
+                    else {
+                        ot = (c - 48) * 100;
+                        ot += (UNSAFE.getByte(i++) - 48) * 10;
+                    }
+                    // i is now on the decimal point
+                    i++; // Skipping Dot
+                    // Fractional digit after the dot
+                    ot += (UNSAFE.getByte(i++) - 48);
+                }
+                // The line is fully parsed; step over the terminating newline
+                i++;
+                hash &= 65535; // keep the low 16 bits, matching the 65536 slots of SimplerHashMap
+                results.putOrMerge(nameBytes, nameIndex, hash, ot);
+                // Reset the per-line state for the next record
+                nameIndex = 0;
+                hash = FNV_32_OFFSET;
+            }
+            List<MapEntry> all = results.getAll();
+            lock.lock(); // resultRowMap is a plain HashMap shared by all tasks, so merging is serialized
+            try {
+                for (MapEntry me : all) {
+                    ResultRow rr;
+                    ResultRow lr = me.row;
+                    if ((rr = resultRowMap.get(me.key)) != null) {
+                        rr.min = Math.min(rr.min, lr.min);
+                        rr.max = Math.max(rr.max, lr.max);
+                        rr.count += lr.count;
+                        rr.sum += lr.sum;
+                    }
+                    else {
+                        resultRowMap.put(me.key, lr);
+                    }
+                }
+            }
+            catch (Exception e) {
+                e.printStackTrace(); // NOTE(review): a failure here is only logged, so the output could miss some of this chunk's rows
+            }
+            finally {
+                lock.unlock();
+            }
+        }
+    }
+
+    public static void main(String[] args) throws IOException, InterruptedException {
+        final List<Thread> workers = new ArrayList<>();
+        // Start one max-priority worker per chunk; each merges its rows into resultRowMap.
+        for (Chunk chunk : getChunks(FileChannel.open(FILE, READ))) {
+            final Thread worker = new Thread(new Task(chunk));
+            worker.setPriority(Thread.MAX_PRIORITY);
+            workers.add(worker);
+            worker.start();
+        }
+        // All workers must finish before the shared map is read.
+        for (Thread worker : workers) {
+            worker.join();
+        }
+        System.out.println(new TreeMap<>(resultRowMap)); // TreeMap: output sorted by station name
+    }
+}

From fdd539e1f950bae64829036764c15869a00cd475 Mon Sep 17 00:00:00 2001
From: tivrfoa <lescoutinhovr@gmail.com>
Date: Thu, 1 Feb 2024 07:49:47 -0300
Subject: [PATCH 249/268] Exit earlier from loop when a new Result is created
 (#668)

* Exit earlier from loop when a new Result is created

 *   3) Make a cache of long[] name to String, to avoid `ByteBuffer.allocate`
 * and creating new UTF-8 strings. I didn't profile, so it's just a guess
 * that this map will be a bit faster. Although it's outside the main loop, so
 * not a big difference ...;
 *   4) Exit earlier from loop if a new entry was created.

* revert: Remove cache to city name

* As I was not able to make it faster... make it slower

As I was not able to make it faster ... so I'll make it slower,
because my current solution should *not* stay at the top, as it added
basically nothing.
---
 .../onebrc/CalculateAverage_tivrfoa.java      | 327 +++++++-----------
 1 file changed, 127 insertions(+), 200 deletions(-)

diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_tivrfoa.java b/src/main/java/dev/morling/onebrc/CalculateAverage_tivrfoa.java
index 54f13cbea..e6e963281 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_tivrfoa.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_tivrfoa.java
@@ -38,34 +38,44 @@
  * already, and maybe even 1st place for the 10k too.
  * See: https://github.com/gunnarmorling/1brc/pull/606
  * 
- * But as I was already coding something, I'll submit just to
- * see if it will be faster than his *previous* 10k time of
- * 00:04.516
- * 
- * Changes:
- *   It's a similar idea of my previous solution, that if you split
- * the chunks evenly, some threads might finish much faster and
- * stay idle, so:
- *   1) Create more chunks than threads, so the ones that finish first
- * can do something;
- *   2) Decrease chunk sizes as we get closer to the end of the file.
+ * As I was not able to make it faster ... so I'll make it slower,
+ * because my current solution should *not* stay at the top, as it added
+ * basically nothing.
  */
 public class CalculateAverage_tivrfoa {
     private static final String FILE = "./measurements.txt";
-    private static final int MIN_TEMP = -999;
-    private static final int MAX_TEMP = 999;
+
+    private static final int MAX_CITIES = 10_000;
+    private static final int BUCKETS_LEN = 1 << 17;
+    private static final int LAST_BUCKET_ENTRY = BUCKETS_LEN - 1;
+    private static final int NUM_CPUS = Runtime.getRuntime().availableProcessors();
+    private static final AtomicInteger chunkIdx = new AtomicInteger();
+    private static long[] chunks;
+    private static int numChunks;
 
     // Holding the current result for a single city.
     private static class Result {
-        long lastNameLong, secondLastNameLong;
+        long lastNameLong;
         long[] name;
         int count;
         short min, max;
         long sum;
 
-        private Result() {
-            this.min = MAX_TEMP;
-            this.max = MIN_TEMP;
+        private Result(short number, long nameAddress, byte nameLength, Scanner scanner) {
+            this.min = number;
+            this.max = number;
+            this.sum = number;
+            this.count = 1;
+
+            name = new long[(nameLength / Long.BYTES) + 1];
+            int pos = 0, i = 0;
+            for (; i < nameLength + 1 - Long.BYTES; i += Long.BYTES) {
+                name[pos++] = scanner.getLongAt(nameAddress + i);
+            }
+
+            int remainingShift = (64 - (nameLength + 1 - i) << 3);
+            lastNameLong = (scanner.getLongAt(nameAddress + i) << remainingShift);
+            name[pos] = lastNameLong >> remainingShift;
         }
 
         public String toString() {
@@ -88,6 +98,17 @@ private void add(Result other) {
             count += other.count;
         }
 
+        private void add(short number) {
+            if (number < min) {
+                min = number;
+            }
+            if (number > max) {
+                max = number;
+            }
+            sum += number;
+            count++;
+        }
+
         public String calcName() {
             ByteBuffer bb = ByteBuffer.allocate(name.length * Long.BYTES).order(ByteOrder.nativeOrder());
             bb.asLongBuffer().put(name);
@@ -99,139 +120,95 @@ public String calcName() {
         }
     }
 
-    private static final int NUM_CPUS = Runtime.getRuntime().availableProcessors();
-    private static final AtomicInteger chunkIdx = new AtomicInteger();
-    private static long[] chunks;
-    private static int numChunks;
+    /**
+     * From:
+     * https://github.com/OpenHFT/Zero-Allocation-Hashing/blob/ea/src/main/java/net/openhft/hashing/XXH3.java
+     * 
+     * Less collisions, but it will make the code slower. xD
+     * 
+     * One interesting thing about Thomas' solution that I
+     * started to work with (d0a28599), is that it basically does not have
+     * any collision for the small data set (sometimes none!), but it
+     * has lots of collisions for the 10k, hence its poor performance.
+     * 
+     */
+    private static long XXH3_avalanche(long h64) {
+        h64 ^= h64 >>> 37;
+        h64 *= 0x165667919E3779F9L;
+        return h64 ^ (h64 >>> 32);
+    }
 
     private static final class SolveChunk extends Thread {
-        private long chunkStart, chunkEnd;
-        private Result[] results = new Result[10_000];
-        private Result[] buckets = new Result[1 << 17];
+        private int chunkStartIdx;
+        private Result[] results = new Result[MAX_CITIES];
+        private Result[] buckets = new Result[BUCKETS_LEN];
         private int resIdx = 0;
 
-        public SolveChunk(long chunkStart, long chunkEnd) {
-            this.chunkStart = chunkStart;
-            this.chunkEnd = chunkEnd;
+        public SolveChunk(int chunkStartIdx) {
+            this.chunkStartIdx = chunkStartIdx;
         }
 
         @Override
         public void run() {
-            parseLoop();
-            int chunk = chunkIdx.getAndIncrement();
-            if (chunk < numChunks) {
-                chunkStart = chunks[chunk];
-                chunkEnd = chunks[chunk + 1];
-                run();
-            }
-        }
+            for (; chunkStartIdx < numChunks; chunkStartIdx = chunkIdx.getAndIncrement()) {
+                Scanner scanner = new Scanner(chunks[chunkStartIdx], chunks[chunkStartIdx + 1]);
+                long word = scanner.getLong();
+                long pos = findDelimiter(word);
+                while (scanner.hasNext()) {
+                    long nameAddress = scanner.pos();
+                    long hash = 0;
+
+                    while (true) {
+                        if (pos != 0) {
+                            pos = Long.numberOfTrailingZeros(pos) >>> 3;
+                            scanner.add(pos);
+                            word = mask(word, pos);
+                            hash ^= XXH3_avalanche(word);
+                            break;
+                        }
+                        else {
+                            scanner.add(8);
+                            hash ^= XXH3_avalanche(word);
+                        }
 
-        private void parseLoop() {
-            Scanner scanner = new Scanner(chunkStart, chunkEnd);
-            long word = scanner.getLong();
-            long pos = findDelimiter(word);
-            while (scanner.hasNext()) {
-                long nameAddress = scanner.pos();
-                long hash = 0;
-
-                // Search for ';', one long at a time.
-                if (pos != 0) {
-                    pos = Long.numberOfTrailingZeros(pos) >>> 3;
-                    scanner.add(pos);
-                    word = mask(word, pos);
-                    hash = word;
-
-                    int number = scanNumber(scanner);
-                    long nextWord = scanner.getLong();
-                    long nextPos = findDelimiter(nextWord);
-
-                    Result existingResult = buckets[hashToIndex(hash, buckets)];
-                    if (existingResult != null && existingResult.lastNameLong == word) {
-                        word = nextWord;
-                        pos = nextPos;
-                        record(existingResult, number);
-                        continue;
+                        word = scanner.getLong();
+                        pos = findDelimiter(word);
                     }
 
-                    scanner.setPos(nameAddress + pos);
-                }
-                else {
-                    scanner.add(8);
-                    hash = word;
-                    long prevWord = word;
-                    word = scanner.getLong();
-                    pos = findDelimiter(word);
-                    if (pos != 0) {
-                        pos = Long.numberOfTrailingZeros(pos) >>> 3;
-                        scanner.add(pos);
-                        word = mask(word, pos);
-                        hash ^= word;
-
-                        Result existingResult = buckets[hashToIndex(hash, buckets)];
-                        if (existingResult != null && existingResult.lastNameLong == word && existingResult.secondLastNameLong == prevWord) {
-                            int number = scanNumber(scanner);
-                            word = scanner.getLong();
-                            pos = findDelimiter(word);
-                            record(existingResult, number);
-                            continue;
+                    byte nameLength = (byte) (scanner.pos() - nameAddress);
+                    short number = scanNumber(scanner);
+
+                    int tableIndex = hashToIndex(hash);
+                    outer: while (true) {
+                        Result existingResult = buckets[tableIndex];
+                        if (existingResult == null) {
+                            var newResult = new Result(number, nameAddress, nameLength, scanner);
+                            buckets[tableIndex] = newResult;
+                            results[resIdx++] = newResult;
+                            break;
                         }
-                    }
-                    else {
-                        scanner.add(8);
-                        hash ^= word;
-                        while (true) {
-                            word = scanner.getLong();
-                            pos = findDelimiter(word);
-                            if (pos != 0) {
-                                pos = Long.numberOfTrailingZeros(pos) >>> 3;
-                                scanner.add(pos);
-                                word = mask(word, pos);
-                                hash ^= word;
-                                break;
-                            }
-                            else {
-                                scanner.add(8);
-                                hash ^= word;
+                        int i = 0;
+                        int namePos = 0;
+                        for (; i < nameLength + 1 - 8; i += 8) {
+                            if (namePos >= existingResult.name.length || existingResult.name[namePos++] != scanner.getLongAt(nameAddress + i)) {
+                                tableIndex = (tableIndex + 31) & (LAST_BUCKET_ENTRY);
+                                continue outer;
                             }
                         }
-                    }
-                }
 
-                // Save length of name for later.
-                int nameLength = (int) (scanner.pos() - nameAddress);
-                int number = scanNumber(scanner);
-
-                // Final calculation for index into hash table.
-                int tableIndex = hashToIndex(hash, buckets);
-                outer: while (true) {
-                    Result existingResult = buckets[tableIndex];
-                    if (existingResult == null) {
-                        existingResult = newEntry(buckets, nameAddress, tableIndex, nameLength, scanner);
-                        results[resIdx++] = existingResult;
-                    }
-                    // Check for collision.
-                    int i = 0;
-                    int namePos = 0;
-                    for (; i < nameLength + 1 - 8; i += 8) {
-                        if (namePos >= existingResult.name.length || existingResult.name[namePos++] != scanner.getLongAt(nameAddress + i)) {
-                            tableIndex = (tableIndex + 31) & (buckets.length - 1);
-                            continue outer;
+                        int remainingShift = (64 - (nameLength + 1 - i) << 3);
+                        if (((existingResult.lastNameLong ^ (scanner.getLongAt(nameAddress + i) << remainingShift)) == 0)) {
+                            existingResult.add(number);
+                            break;
+                        }
+                        else {
+                            tableIndex = (tableIndex + 31) & (LAST_BUCKET_ENTRY);
                         }
                     }
 
-                    int remainingShift = (64 - (nameLength + 1 - i) << 3);
-                    if (((existingResult.lastNameLong ^ (scanner.getLongAt(nameAddress + i) << remainingShift)) == 0)) {
-                        record(existingResult, number);
-                        break;
-                    }
-                    else {
-                        // Collision error, try next.
-                        tableIndex = (tableIndex + 31) & (buckets.length - 1);
-                    }
+                    word = scanner.getLong();
+                    pos = findDelimiter(word);
                 }
-
-                word = scanner.getLong();
-                pos = findDelimiter(word);
             }
         }
     }
@@ -247,77 +224,49 @@ private static void mergeIntoFinalMap(TreeMap<String, Result> map, Result[] newR
         }
     }
 
-    public static void main(String[] args) throws Exception {
-        boolean runTrick = true;
-        for (var arg : args) {
-            if (arg.equals("--worker")) {
-                runTrick = false;
-                break;
-            }
-        }
-        if (runTrick) {
-            spawnWorker();
-            return;
-        }
-
+    public static void main(String[] args) throws InterruptedException, IOException {
         chunks = getSegments(NUM_CPUS);
         numChunks = chunks.length - 1;
         final SolveChunk[] threads = new SolveChunk[NUM_CPUS];
         chunkIdx.set(NUM_CPUS);
         for (int i = 0; i < NUM_CPUS; i++) {
-            threads[i] = new SolveChunk(chunks[i], chunks[i + 1]);
+            threads[i] = new SolveChunk(i);
             threads[i].start();
         }
 
+        System.out.println(getMap(threads));
+        System.out.close();
+    }
+
+    private static TreeMap<String, Result> getMap(SolveChunk[] threads) throws InterruptedException {
         TreeMap<String, Result> map = new TreeMap<>();
-        for (int i = 0; i < NUM_CPUS; ++i) {
+        threads[0].join();
+        for (var r : threads[0].results) {
+            if (r == null)
+                break;
+            map.put(r.calcName(), r);
+        }
+        for (int i = 1; i < NUM_CPUS; ++i) {
             threads[i].join();
             mergeIntoFinalMap(map, threads[i].results);
         }
 
-        System.out.println(map);
-        System.out.close();
-    }
-
-    private static void spawnWorker() throws IOException {
-        ProcessHandle.Info info = ProcessHandle.current().info();
-        ArrayList<String> workerCommand = new ArrayList<>();
-        info.command().ifPresent(workerCommand::add);
-        info.arguments().ifPresent(args -> workerCommand.addAll(Arrays.asList(args)));
-        workerCommand.add("--worker");
-        new ProcessBuilder()
-                .command(workerCommand)
-                .inheritIO()
-                .redirectOutput(ProcessBuilder.Redirect.PIPE)
-                .start()
-                .getInputStream()
-                .transferTo(System.out);
+        return map;
     }
 
-    private static int scanNumber(Scanner scanPtr) {
+    private static short scanNumber(Scanner scanPtr) {
         scanPtr.add(1);
         long numberWord = scanPtr.getLong();
         int decimalSepPos = Long.numberOfTrailingZeros(~numberWord & 0x10101000);
         int number = convertIntoNumber(decimalSepPos, numberWord);
         scanPtr.add((decimalSepPos >>> 3) + 3);
-        return number;
+        return (short) number;
     }
 
-    private static void record(Result existingResult, int number) {
-        if (number < existingResult.min) {
-            existingResult.min = (short) number;
-        }
-        if (number > existingResult.max) {
-            existingResult.max = (short) number;
-        }
-        existingResult.sum += number;
-        existingResult.count++;
-    }
-
-    private static int hashToIndex(long hash, Result[] results) {
+    private static int hashToIndex(long hash) {
         int hashAsInt = (int) (hash ^ (hash >>> 28));
         int finalHash = (hashAsInt ^ (hashAsInt >>> 17));
-        return (finalHash & (results.length - 1));
+        return (finalHash & LAST_BUCKET_ENTRY);
     }
 
     private static long mask(long word, long pos) {
@@ -346,28 +295,6 @@ private static long findDelimiter(long word) {
         return tmp;
     }
 
-    private static Result newEntry(Result[] results, long nameAddress, int hash, int nameLength, Scanner scanner) {
-        Result r = new Result();
-        results[hash] = r;
-        long[] name = new long[(nameLength / Long.BYTES) + 1];
-        int pos = 0;
-        int i = 0;
-        for (; i < nameLength + 1 - Long.BYTES; i += Long.BYTES) {
-            name[pos++] = scanner.getLongAt(nameAddress + i);
-        }
-
-        if (pos > 0) {
-            r.secondLastNameLong = name[pos - 1];
-        }
-
-        int remainingShift = (64 - (nameLength + 1 - i) << 3);
-        long lastWord = (scanner.getLongAt(nameAddress + i) << remainingShift);
-        r.lastNameLong = lastWord;
-        name[pos] = lastWord >> remainingShift;
-        r.name = name;
-        return r;
-    }
-
     /**
      *  - Split 70% of the file in even chunks for all cpus;
      *  - Create smaller chunks for the remainder of the file.  

From 1e7314d5fb4ec948461ff1e5b49a610efbab25e6 Mon Sep 17 00:00:00 2001
From: gonix <d.giedrius+github@gmail.com>
Date: Thu, 1 Feb 2024 12:53:46 +0200
Subject: [PATCH 250/268] CalculateAverage_gonix update (#706)

Backported some of the optimizations from unsafe solution.

Co-authored-by: Giedrius D <d.giedrius@gmail.com>
---
 calculate_average_gonix.sh                    |   4 +-
 .../onebrc/CalculateAverage_gonix.java        | 509 +++++++++++-------
 2 files changed, 312 insertions(+), 201 deletions(-)

diff --git a/calculate_average_gonix.sh b/calculate_average_gonix.sh
index a6f91655f..c3f00893c 100755
--- a/calculate_average_gonix.sh
+++ b/calculate_average_gonix.sh
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 #
 #  Copyright 2023 The original authors
 #
@@ -17,4 +17,4 @@
 
 
 JAVA_OPTS="--enable-preview"
-java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_gonix
+exec cat < <(exec java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_gonix)
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_gonix.java b/src/main/java/dev/morling/onebrc/CalculateAverage_gonix.java
index 572c272ca..cbc1127ae 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_gonix.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_gonix.java
@@ -46,6 +46,7 @@ public static void main(String[] args) throws IOException {
                         TreeMap::new));
 
         System.out.println(res);
+        System.out.close();
     }
 
     private static List<MappedByteBuffer> buildChunks(RandomAccessFile file) throws IOException {
@@ -75,248 +76,358 @@ private static List<MappedByteBuffer> buildChunks(RandomAccessFile file) throws
         }
         return chunks;
     }
-}
 
-class Aggregator {
-    private static final int MAX_STATIONS = 10_000;
-    private static final int MAX_STATION_SIZE = Math.ceilDiv(100, 8) + 5;
-    private static final int INDEX_SIZE = 1024 * 1024;
-    private static final int INDEX_MASK = INDEX_SIZE - 1;
-    private static final int FLD_COUNT = 0;
-    private static final int FLD_SUM = 1;
-    private static final int FLD_MIN = 2;
-    private static final int FLD_MAX = 3;
-
-    // Poor man's hash map: hash code to offset in `mem`.
-    private final int[] index;
-
-    // Contiguous storage of key (station name) and stats fields of all
-    // unique stations.
-    // The idea here is to improve locality so that stats fields would
-    // possibly be already in the CPU cache after we are done comparing
-    // the key.
-    private final long[] mem;
-    private int memUsed;
-
-    Aggregator() {
-        assert ((INDEX_SIZE & (INDEX_SIZE - 1)) == 0) : "INDEX_SIZE must be power of 2";
-        assert (INDEX_SIZE > MAX_STATIONS) : "INDEX_SIZE must be greater than MAX_STATIONS";
-
-        index = new int[INDEX_SIZE];
-        mem = new long[1 + (MAX_STATIONS * MAX_STATION_SIZE)];
-        memUsed = 1;
-    }
+    private static class Aggregator {
+        private static final int MAX_STATIONS = 10_000;
+        private static final int MAX_STATION_SIZE = Math.ceilDiv(100, 8) + 5;
+        private static final int INDEX_SIZE = 1024 * 1024;
+        private static final int INDEX_MASK = INDEX_SIZE - 1;
+        private static final int FLD_COUNT = 0;
+        private static final int FLD_SUM = 1;
+        private static final int FLD_MIN = 2;
+        private static final int FLD_MAX = 3;
+
+        // Poor man's hash map: hash code to offset in `mem`.
+        private final int[] index;
+
+        // Contiguous storage of key (station name) and stats fields of all
+        // unique stations.
+        // The idea here is to improve locality so that stats fields would
+        // possibly be already in the CPU cache after we are done comparing
+        // the key.
+        private final long[] mem;
+        private int memUsed;
 
-    Aggregator processChunk(MappedByteBuffer buf) {
-        // To avoid checking if it is safe to read a whole long near the
-        // end of a chunk, we copy last couple of lines to a padded buffer
-        // and process that part separately.
-        int limit = buf.limit();
-        int pos = Math.max(limit - 16, -1);
-        while (pos >= 0 && buf.get(pos) != '\n') {
-            pos--;
+        Aggregator() {
+            assert ((INDEX_SIZE & (INDEX_SIZE - 1)) == 0) : "INDEX_SIZE must be power of 2";
+            assert (INDEX_SIZE > MAX_STATIONS) : "INDEX_SIZE must be greater than MAX_STATIONS";
+
+            index = new int[INDEX_SIZE];
+            mem = new long[1 + (MAX_STATIONS * MAX_STATION_SIZE)];
+            memUsed = 1;
         }
-        pos++;
-        if (pos > 0) {
-            processChunkLongs(buf, pos);
+
+        Aggregator processChunk(MappedByteBuffer buf) {
+            // Reading a whole long close to the end of the chunk could run past
+            // its limit. Rather than bounds-check every read, the last line(s)
+            // are copied to a buffer padded with 8 spare bytes and parsed there.
+            int limit = buf.limit();
+            int pos = Math.max(limit - 16, -1);
+            while (pos >= 0 && buf.get(pos) != '\n') {
+                pos--;
+            }
+            pos++; // first byte after that newline (0 when no newline was found)
+            if (pos > 0) {
+                processChunkLongs(buf, pos);
+            }
+            int tailLen = limit - pos;
+            var tailBuf = ByteBuffer.allocate(tailLen + 8).order(ByteOrder.nativeOrder());
+            buf.get(pos, tailBuf.array(), 0, tailLen);
+            processChunkLongs(tailBuf, tailLen);
+            return this;
         }
-        int tailLen = limit - pos;
-        var tailBuf = ByteBuffer.allocate(tailLen + 8).order(ByteOrder.nativeOrder());
-        buf.get(pos, tailBuf.array(), 0, tailLen);
-        processChunkLongs(tailBuf, tailLen);
-        return this;
-    }
 
-    Aggregator processChunkLongs(ByteBuffer buf, int limit) {
-        int pos = 0;
-        while (pos < limit) {
-
-            int start = pos;
-            int hash = 0;
-            long tail = 0;
-            while (true) {
-                // Seen this trick used in multiple other solutions.
-                // Nice breakdown here: https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
-                long tmpLong = buf.getLong(pos);
-                long match = tmpLong ^ 0x3B3B3B3B_3B3B3B3BL; // 3B == ';'
-                match = ((match - 0x01010101_01010101L) & (~match & 0x80808080_80808080L));
-                if (match == 0) {
-                    hash = ((33 * hash) ^ (int) (tmpLong & 0xFFFFFFFF)) + (int) ((tmpLong >>> 33) & 0xFFFFFFFF);
-                    pos += 8;
+        Aggregator processChunkLongs(ByteBuffer buf, int limit) {
+            int pos = 0;
+            while (pos < limit) {
+
+                int start = pos;
+                long keyLong = buf.getLong(pos);
+                long valueSepMark = valueSepMark(keyLong);
+                if (valueSepMark != 0) { // ';' found in the first 8 bytes of the line
+                    int tailBits = tailBits(valueSepMark);
+                    pos += valueOffset(tailBits);
+                    // assert (buf.get(pos - 1) == ';') : "Expected ';' (1), pos=" + pos;
+                    long tailAndLen = tailAndLen(tailBits, keyLong, pos - start - 1);
+
+                    long valueLong = buf.getLong(pos);
+                    int decimalSepMark = decimalSepMark(valueLong);
+                    pos += nextKeyOffset(decimalSepMark);
+                    // assert (buf.get(pos - 1) == '\n') : "Expected '\\n' (1), pos=" + pos;
+                    int measurement = decimalValue(decimalSepMark, valueLong);
+
+                    add1(buf, start, tailAndLen, hash(hash1(tailAndLen)), measurement);
                     continue;
                 }
 
-                int tailBits = Long.numberOfTrailingZeros(match >>> 7);
-                long tailMask = ~(-1L << tailBits);
-                tail = tmpLong & tailMask;
-                hash = ((33 * hash) ^ (int) (tail & 0xFFFFFFFF)) + (int) ((tail >>> 33) & 0xFFFFFFFF);
-                pos += tailBits >> 3;
-                break;
-            }
-            hash = (33 * hash) ^ (hash >>> 15);
-            int lenInLongs = (pos - start) >> 3;
-            long tailAndLen = (tail << 8) | (lenInLongs & 0xFF);
-            // assert (buf.get(pos) == ';') : "Expected ';'";
-            pos++;
+                pos += 8;
+                long keyLong1 = keyLong;
+                keyLong = buf.getLong(pos);
+                valueSepMark = valueSepMark(keyLong);
+                if (valueSepMark != 0) { // ';' found in the second 8 bytes of the line
+                    int tailBits = tailBits(valueSepMark);
+                    pos += valueOffset(tailBits);
+                    // assert (buf.get(pos - 1) == ';') : "Expected ';' (2), pos=" + pos;
+                    long tailAndLen = tailAndLen(tailBits, keyLong, pos - start - 1);
+
+                    long valueLong = buf.getLong(pos);
+                    int decimalSepMark = decimalSepMark(valueLong);
+                    pos += nextKeyOffset(decimalSepMark);
+                    // assert (buf.get(pos - 1) == '\n') : "Expected '\\n' (2), pos=" + pos;
+                    int measurement = decimalValue(decimalSepMark, valueLong);
+
+                    add2(buf, start, keyLong1, tailAndLen, hash(hash(hash1(keyLong1), tailAndLen)), measurement);
+                    continue;
+                }
 
-            int measurement;
-            {
-                // Seen this trick used in multiple other solutions.
-                // Looks like the original author is @merykitty.
-                long tmpLong = buf.getLong(pos);
-
-                // The 4th binary digit of the ascii of a digit is 1 while
-                // that of the '.' is 0. This finds the decimal separator
-                // The value can be 12, 20, 28
-                int decimalSepPos = Long.numberOfTrailingZeros(~tmpLong & 0x10101000);
-                int shift = 28 - decimalSepPos;
-                // signed is -1 if negative, 0 otherwise
-                long signed = (~tmpLong << 59) >> 63;
-                long designMask = ~(signed & 0xFF);
-                // Align the number to a specific position and transform the ascii code
-                // to actual digit value in each byte
-                long digits = ((tmpLong & designMask) << shift) & 0x0F000F0F00L;
-
-                // Now digits is in the form 0xUU00TTHH00 (UU: units digit, TT: tens digit, HH: hundreds digit)
-                // 0xUU00TTHH00 * (100 * 0x1000000 + 10 * 0x10000 + 1) =
-                // 0x000000UU00TTHH00 +
-                // 0x00UU00TTHH000000 * 10 +
-                // 0xUU00TTHH00000000 * 100
-                // Now TT * 100 has 2 trailing zeroes and HH * 100 + TT * 10 + UU < 0x400
-                // This results in our value lies in the bit 32 to 41 of this product
-                // That was close :)
-                long absValue = ((digits * 0x640a0001) >>> 32) & 0x3FF;
-                measurement = (int) ((absValue ^ signed) - signed);
-                pos += (decimalSepPos >>> 3) + 3;
+                long hash = hash1(keyLong1); // longer names: fold each full 8-byte word into the hash until ';' appears
+                do {
+                    pos += 8;
+                    hash = hash(hash, keyLong);
+                    keyLong = buf.getLong(pos);
+                    valueSepMark = valueSepMark(keyLong);
+                } while (valueSepMark == 0);
+                int tailBits = tailBits(valueSepMark);
+                pos += valueOffset(tailBits);
+                // assert (buf.get(pos - 1) == ';') : "Expected ';' (N), pos=" + pos;
+                long tailAndLen = tailAndLen(tailBits, keyLong, pos - start - 1);
+                hash = hash(hash, tailAndLen);
+
+                long valueLong = buf.getLong(pos);
+                int decimalSepMark = decimalSepMark(valueLong);
+                pos += nextKeyOffset(decimalSepMark);
+                // assert (buf.get(pos - 1) == '\n') : "Expected '\\n' (N), pos=" + pos;
+                int measurement = decimalValue(decimalSepMark, valueLong);
+
+                addN(buf, start, tailAndLen, hash(hash), measurement);
             }
-            // assert (buf.get(pos - 1) == '\n') : "Expected '\\n'";
 
-            add(buf, start, tailAndLen, hash, measurement);
+            return this;
         }
 
-        return this;
-    }
+        public Stream<Entry> stream() {
+            return Arrays.stream(index)
+                    .filter(offset -> offset != 0)
+                    .mapToObj(offset -> new Entry(mem, offset));
+        }
 
-    public Stream<Entry> stream() {
-        return Arrays.stream(index)
-                .filter(offset -> offset != 0)
-                .mapToObj(offset -> new Entry(mem, offset));
-    }
+        private static long hash1(long value) {
+            return value;
+        }
 
-    private void add(ByteBuffer buf, int start, long tailAndLen, int hash, int measurement) {
-        int idx = hash & INDEX_MASK;
-        for (; index[idx] != 0; idx = (idx + 1) & INDEX_MASK) {
-            if (update(index[idx], buf, start, tailAndLen, measurement)) {
-                return;
-            }
+        private static long hash(long hash, long value) {
+            return hash ^ value;
+        }
+
+        private static int hash(long hash) {
+            hash *= 0x9E3779B97F4A7C15L; // Fibonacci hashing multiplier
+            return (int) (hash >>> 39);
         }
-        index[idx] = create(buf, start, tailAndLen, measurement);
-    }
 
-    private int create(ByteBuffer buf, int start, long tailAndLen, int measurement) {
-        int offset = memUsed;
+        private static long valueSepMark(long keyLong) {
+            // Seen this trick used in multiple other solutions.
+            // Nice breakdown here: https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
+            long match = keyLong ^ 0x3B3B3B3B_3B3B3B3BL; // 3B == ';'
+            match = (match - 0x01010101_01010101L) & (~match & 0x80808080_80808080L);
+            return match;
+        }
 
-        mem[offset] = tailAndLen;
+        private static int tailBits(long valueSepMark) {
+            return Long.numberOfTrailingZeros(valueSepMark >>> 7);
+        }
 
-        int memPos = offset + 1;
-        int memEnd = memPos + (int) (tailAndLen & 0xFF);
-        int bufPos = start;
-        while (memPos < memEnd) {
-            mem[memPos] = buf.getLong(bufPos);
-            memPos += 1;
-            bufPos += 8;
+        private static int valueOffset(int tailBits) {
+            return (int) (tailBits >>> 3) + 1;
         }
 
-        mem[memPos + FLD_MIN] = measurement;
-        mem[memPos + FLD_MAX] = measurement;
-        mem[memPos + FLD_SUM] = measurement;
-        mem[memPos + FLD_COUNT] = 1;
-        memUsed = memPos + 4;
+        private static long tailAndLen(int tailBits, long keyLong, long keyLen) {
+            long tailMask = ~(-1L << tailBits);
+            long tail = keyLong & tailMask;
+            return (tail << 8) | ((keyLen >> 3) & 0xFF);
+        }
 
-        return offset;
-    }
+        private static int decimalSepMark(long value) {
+            // Seen this trick used in multiple other solutions.
+            // Looks like the original author is @merykitty.
 
-    private boolean update(int offset, ByteBuffer buf, int start, long tailAndLen, int measurement) {
-        var mem = this.mem;
-        if (mem[offset] != tailAndLen) {
-            return false;
+            // The 4th binary digit of the ascii of a digit is 1 while
+            // that of the '.' is 0. This finds the decimal separator
+            // The value can be 12, 20, 28
+            return Long.numberOfTrailingZeros(~value & 0x10101000);
         }
-        int memPos = offset + 1;
-        int memEnd = memPos + (int) (tailAndLen & 0xFF);
-        int bufPos = start;
-        while (memPos < memEnd) {
-            if (mem[memPos] != buf.getLong(bufPos)) {
-                return false;
+
+        private static int decimalValue(int decimalSepMark, long value) {
+            // Seen this trick used in multiple other solutions.
+            // Looks like the original author is @merykitty.
+
+            int shift = 28 - decimalSepMark;
+            // signed is -1 if negative, 0 otherwise
+            long signed = (~value << 59) >> 63;
+            long designMask = ~(signed & 0xFF);
+            // Align the number to a specific position and transform the ascii code
+            // to actual digit value in each byte
+            long digits = ((value & designMask) << shift) & 0x0F000F0F00L;
+
+            // Now digits is in the form 0xUU00TTHH00 (UU: units digit, TT: tens digit, HH: hundreds digit)
+            // 0xUU00TTHH00 * (100 * 0x1000000 + 10 * 0x10000 + 1) =
+            // 0x000000UU00TTHH00 +
+            // 0x00UU00TTHH000000 * 10 +
+            // 0xUU00TTHH00000000 * 100
+            // Now TT * 100 has 2 trailing zeroes and HH * 100 + TT * 10 + UU < 0x400
+            // This results in our value lies in the bit 32 to 41 of this product
+            // That was close :)
+            long absValue = ((digits * 0x640a0001) >>> 32) & 0x3FF;
+            return (int) ((absValue ^ signed) - signed);
+        }
+
+        private static int nextKeyOffset(int decimalSepMark) {
+            return (decimalSepMark >>> 3) + 3;
+        }
+
+        private void add1(ByteBuffer buf, int start, long tailAndLen, int hash, int measurement) {
+            int idx = hash & INDEX_MASK;
+            for (; index[idx] != 0; idx = (idx + 1) & INDEX_MASK) {
+                if (update1(index[idx], tailAndLen, measurement)) {
+                    return;
+                }
             }
-            memPos += 1;
-            bufPos += 8;
+            index[idx] = create(buf, start, tailAndLen, measurement);
         }
 
-        mem[memPos + FLD_COUNT] += 1;
-        mem[memPos + FLD_SUM] += measurement;
-        if (measurement < mem[memPos + FLD_MIN]) {
-            mem[memPos + FLD_MIN] = measurement;
+        private void add2(ByteBuffer buf, int start, long keyLong, long tailAndLen, int hash, int measurement) {
+            int idx = hash & INDEX_MASK;
+            for (; index[idx] != 0; idx = (idx + 1) & INDEX_MASK) {
+                if (update2(index[idx], keyLong, tailAndLen, measurement)) {
+                    return;
+                }
+            }
+            index[idx] = create(buf, start, tailAndLen, measurement);
         }
-        if (measurement > mem[memPos + FLD_MAX]) {
-            mem[memPos + FLD_MAX] = measurement;
+
+        private void addN(ByteBuffer buf, int start, long tailAndLen, int hash, int measurement) {
+            int idx = hash & INDEX_MASK;
+            for (; index[idx] != 0; idx = (idx + 1) & INDEX_MASK) {
+                if (updateN(index[idx], buf, start, tailAndLen, measurement)) {
+                    return;
+                }
+            }
+            index[idx] = create(buf, start, tailAndLen, measurement);
         }
 
-        return true;
-    }
+        private int create(ByteBuffer buf, int start, long tailAndLen, int measurement) {
+            int offset = memUsed;
 
-    public static class Entry {
-        private final long[] mem;
-        private final int offset;
-        private String key;
+            mem[offset] = tailAndLen;
 
-        Entry(long[] mem, int offset) {
-            this.mem = mem;
-            this.offset = offset;
+            int memPos = offset + 1;
+            int memEnd = memPos + (int) (tailAndLen & 0xFF);
+            int bufPos = start;
+            while (memPos < memEnd) {
+                mem[memPos] = buf.getLong(bufPos);
+                memPos += 1;
+                bufPos += 8;
+            }
+
+            mem[memPos + FLD_MIN] = measurement;
+            mem[memPos + FLD_MAX] = measurement;
+            mem[memPos + FLD_SUM] = measurement;
+            mem[memPos + FLD_COUNT] = 1;
+            memUsed = memPos + 4;
+
+            return offset;
         }
 
-        public String getKey() {
-            if (key == null) {
-                int pos = this.offset;
-                long tailAndLen = mem[pos++];
-                int keyLen = (int) (tailAndLen & 0xFF);
-                var tmpBuf = ByteBuffer.allocate((keyLen << 3) + 8).order(ByteOrder.nativeOrder());
-                for (int i = 0; i < keyLen; i++) {
-                    tmpBuf.putLong(mem[pos++]);
-                }
-                long tail = tailAndLen >>> 8;
-                tmpBuf.putLong(tail);
-                int keyLenBytes = (keyLen << 3) + 8 - (Long.numberOfLeadingZeros(tail) >> 3);
-                key = new String(tmpBuf.array(), 0, keyLenBytes, StandardCharsets.UTF_8);
+        private boolean update1(int offset, long tailAndLen, int measurement) {
+            if (mem[offset] != tailAndLen) {
+                return false;
             }
-            return key;
+            updateStats(offset + 1, measurement);
+            return true;
         }
 
-        public Entry add(Entry other) {
-            int fldOffset = (int) (mem[offset] & 0xFF) + 1;
-            int pos = offset + fldOffset;
-            int otherPos = other.offset + fldOffset;
-            long[] otherMem = other.mem;
-            mem[pos + FLD_MIN] = Math.min((int) mem[pos + FLD_MIN], (int) otherMem[otherPos + FLD_MIN]);
-            mem[pos + FLD_MAX] = Math.max((int) mem[pos + FLD_MAX], (int) otherMem[otherPos + FLD_MAX]);
-            mem[pos + FLD_SUM] += otherMem[otherPos + FLD_SUM];
-            mem[pos + FLD_COUNT] += otherMem[otherPos + FLD_COUNT];
-            return this;
+        private boolean update2(int offset, long keyLong, long tailAndLen, int measurement) {
+            if (mem[offset] != tailAndLen || mem[offset + 1] != keyLong) {
+                return false;
+            }
+            updateStats(offset + 2, measurement);
+            return true;
         }
 
-        public Entry getValue() {
-            return this;
+        private boolean updateN(int offset, ByteBuffer buf, int start, long tailAndLen, int measurement) {
+            var mem = this.mem;
+            if (mem[offset] != tailAndLen) {
+                return false;
+            }
+            int memPos = offset + 1;
+            int memEnd = memPos + (int) (tailAndLen & 0xFF);
+            int bufPos = start;
+            while (memPos < memEnd) {
+                if (mem[memPos] != buf.getLong(bufPos)) {
+                    return false;
+                }
+                memPos += 1;
+                bufPos += 8;
+            }
+            updateStats(memPos, measurement);
+            return true;
         }
 
-        @Override
-        public String toString() {
-            int pos = offset + (int) (mem[offset] & 0xFF) + 1;
-            return round(mem[pos + FLD_MIN])
-                    + "/" + round(((double) mem[pos + FLD_SUM]) / mem[pos + FLD_COUNT])
-                    + "/" + round(mem[pos + FLD_MAX]);
+        private void updateStats(int memPos, int measurement) {
+            mem[memPos + FLD_COUNT] += 1;
+            mem[memPos + FLD_SUM] += measurement;
+            if (measurement < mem[memPos + FLD_MIN]) {
+                mem[memPos + FLD_MIN] = measurement;
+            }
+            if (measurement > mem[memPos + FLD_MAX]) {
+                mem[memPos + FLD_MAX] = measurement;
+            }
         }
 
-        private static double round(double value) {
-            return Math.round(value) / 10.0;
+        public static class Entry {
+            private final long[] mem;
+            private final int offset;
+            private String key;
+
+            Entry(long[] mem, int offset) {
+                this.mem = mem;
+                this.offset = offset;
+            }
+
+            public String getKey() {
+                if (key == null) {
+                    int pos = this.offset;
+                    long tailAndLen = mem[pos++];
+                    int keyLen = (int) (tailAndLen & 0xFF);
+                    var tmpBuf = ByteBuffer.allocate((keyLen << 3) + 8).order(ByteOrder.nativeOrder());
+                    for (int i = 0; i < keyLen; i++) {
+                        tmpBuf.putLong(mem[pos++]);
+                    }
+                    long tail = tailAndLen >>> 8;
+                    tmpBuf.putLong(tail);
+                    int keyLenBytes = (keyLen << 3) + 8 - (Long.numberOfLeadingZeros(tail) >> 3);
+                    key = new String(tmpBuf.array(), 0, keyLenBytes, StandardCharsets.UTF_8);
+                }
+                return key;
+            }
+
+            public Entry add(Entry other) {
+                int fldOffset = (int) (mem[offset] & 0xFF) + 1;
+                int pos = offset + fldOffset;
+                int otherPos = other.offset + fldOffset;
+                long[] otherMem = other.mem;
+                mem[pos + FLD_MIN] = Math.min((int) mem[pos + FLD_MIN], (int) otherMem[otherPos + FLD_MIN]);
+                mem[pos + FLD_MAX] = Math.max((int) mem[pos + FLD_MAX], (int) otherMem[otherPos + FLD_MAX]);
+                mem[pos + FLD_SUM] += otherMem[otherPos + FLD_SUM];
+                mem[pos + FLD_COUNT] += otherMem[otherPos + FLD_COUNT];
+                return this;
+            }
+
+            public Entry getValue() {
+                return this;
+            }
+
+            @Override
+            public String toString() {
+                int pos = offset + (int) (mem[offset] & 0xFF) + 1;
+                return round(mem[pos + FLD_MIN])
+                        + "/" + round(((double) mem[pos + FLD_SUM]) / mem[pos + FLD_COUNT])
+                        + "/" + round(mem[pos + FLD_MAX]);
+            }
+
+            private static double round(double value) {
+                return Math.round(value) / 10.0;
+            }
         }
     }
+
 }

From 2aed039f1700bb0b8ef4543ee55c10b8235a2904 Mon Sep 17 00:00:00 2001
From: Panagiotis Drakatos <PanagiotisDrakatos@users.noreply.github.com>
Date: Thu, 1 Feb 2024 13:02:45 +0200
Subject: [PATCH 251/268] My Probably last attempt to optimize performance
 (#693)

* CalculateAverage_pdrakatos

* Rename to be valid with rules

* CalculateAverage_pdrakatos

* Rename to be valid with rules

* Changes on scripts execution

* Fixing bugs causing scripts not to be executed

* Changes on prepare make it compatible

* Fixing passing all tests

* Increase direct memory allocation buffer

* Fixing memory problem causes heap space exception

* Fresh solution to optimize performance of the execution

* New Fresh solution with optimized performance with Custom Hashtable

* Increase maxperm size and xmx to avoid heap spaces error
---
 calculate_average_PanagiotisDrakatos.sh       |   2 +-
 prepare_PanagiotisDrakatos.sh                 |   2 +-
 .../CalculateAverage_PanagiotisDrakatos.java  | 284 ++++++++++++++----
 3 files changed, 233 insertions(+), 55 deletions(-)

diff --git a/calculate_average_PanagiotisDrakatos.sh b/calculate_average_PanagiotisDrakatos.sh
index e6c936578..699ebdb28 100755
--- a/calculate_average_PanagiotisDrakatos.sh
+++ b/calculate_average_PanagiotisDrakatos.sh
@@ -32,5 +32,5 @@
 #
 source "$HOME/.sdkman/bin/sdkman-init.sh"
 sdk use java 21.0.1-graal 1>&2
-JAVA_OPTS="--enable-preview -Xmx128m -XX:+UseSerialGC -XX:-TieredCompilation -XX:+UnlockExperimentalVMOptions -XX:+TrustFinalNonStaticFields -dsa -XX:+UseNUMA"
+JAVA_OPTS="--enable-preview -Xms1536m  -Xmx10536m -XX:NewSize=256m -XX:MaxNewSize=512m -XX:MaxMetaspaceSize=512m -XX:+DisableExplicitGC -XX:+UseSerialGC -XX:-TieredCompilation -XX:+UnlockExperimentalVMOptions -XX:+TrustFinalNonStaticFields -dsa -XX:+UseNUMA"
 java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_PanagiotisDrakatos
diff --git a/prepare_PanagiotisDrakatos.sh b/prepare_PanagiotisDrakatos.sh
index c322486c9..35fadfcb5 100755
--- a/prepare_PanagiotisDrakatos.sh
+++ b/prepare_PanagiotisDrakatos.sh
@@ -18,6 +18,6 @@ source "$HOME/.sdkman/bin/sdkman-init.sh"
 sdk use java 21.0.1-graal 1>&2
 
 if [ ! -f target/CalculateAverage_PanagiotisDrakatos_image ]; then
-    NATIVE_IMAGE_OPTS="--gc=epsilon -O3 -R:MaxHeapSize=64m --initialize-at-build-time=dev.morling.onebrc.CalculateAverage_PanagiotisDrakatos"
+    NATIVE_IMAGE_OPTS="--gc=epsilon -O3 -R:MaxHeapSize=10536m --initialize-at-build-time=dev.morling.onebrc.CalculateAverage_PanagiotisDrakatos"
     native-image $NATIVE_IMAGE_OPTS -cp target/average-1.0.0-SNAPSHOT.jar -o target/CalculateAverage_PanagiotisDrakatos_image dev.morling.onebrc.CalculateAverage_PanagiotisDrakatos
 fi
\ No newline at end of file
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_PanagiotisDrakatos.java b/src/main/java/dev/morling/onebrc/CalculateAverage_PanagiotisDrakatos.java
index 9ab7a2264..04633948f 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_PanagiotisDrakatos.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_PanagiotisDrakatos.java
@@ -20,41 +20,38 @@
 import java.io.IOException;
 import java.io.RandomAccessFile;
 import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
 import java.nio.MappedByteBuffer;
 import java.nio.channels.FileChannel;
-import java.nio.charset.StandardCharsets;
 import java.util.*;
+import java.util.stream.Collectors;
 import java.util.stream.Stream;
-import java.util.stream.StreamSupport;
 
 public class CalculateAverage_PanagiotisDrakatos {
-
     private static final String FILE = "./measurements.txt";
-    private static final long SEGMENT_SIZE = 4 * 1024 * 1024;
-    private static final long COMMA_PATTERN = 0x3B3B3B3B3B3B3B3BL;
-    private static final long DOT_BITS = 0x10101000;
-    private static final long MAGIC_MULTIPLIER = (100 * 0x1000000 + 10 * 0x10000 + 1);
-
+    private static final long MAP_SIZE = 1024 * 1024 * 12L;
     private static TreeMap<String, MeasurementObject> sortedCities;
 
     public static void main(String[] args) throws IOException {
         SeekableByteRead(FILE);
-        System.out.println(sortedCities);
+        System.out.println(sortedCities.toString());
         boolean DEBUG = true;
     }
 
     private static void SeekableByteRead(String path) throws IOException {
         FileInputStream fileInputStream = new FileInputStream(new File(FILE));
         FileChannel fileChannel = fileInputStream.getChannel();
-        Optional<Map<String, MeasurementObject>> optimistic = getFileSegments(new File(FILE), fileChannel)
-                .stream()
-                .map(CalculateAverage_PanagiotisDrakatos::SplitSeekableByteChannel)
-                .parallel()
-                .map(CalculateAverage_PanagiotisDrakatos::MappingByteBufferToData)
-                .reduce(CalculateAverage_PanagiotisDrakatos::combineMaps);
+        try { // map every segment, parse it, and merge all per-city entries into one TreeMap
+            sortedCities = getFileSegments(new File(FILE), fileChannel).stream()
+                    .map(CalculateAverage_PanagiotisDrakatos::SplitSeekableByteChannel)
+                    .parallel()
+                    .map(CalculateAverage_PanagiotisDrakatos::MappingByteBufferToData)
+                    .flatMap(MeasurementRepository::get)
+                    .collect(Collectors.toMap(e -> e.cityName, MeasurementRepository.Entry::measurement, MeasurementObject::updateWith, TreeMap::new));
+        }
+        catch (NullPointerException e) { // NOTE(review): presumably ignores the NPE caused when SplitSeekableByteChannel's fallback returns null after setting sortedCities itself; it also hides any unrelated NPE - confirm
+        }
         fileChannel.close();
-        sortedCities = new TreeMap<>(optimistic.orElseThrow());
-
     }
 
     record FileSegment(long start, long end, FileChannel fileChannel) {
@@ -95,14 +92,40 @@ private static long findSegment(RandomAccessFile raf, long location, final long
     private static ByteBuffer SplitSeekableByteChannel(FileSegment segment) {
         try {
             MappedByteBuffer buffer = segment.fileChannel.map(FileChannel.MapMode.READ_ONLY, segment.start(), segment.end - segment.start());
-            int end = buffer.limit() - 1;
-            while (buffer.get(end) != '\n') {
-                end--;
-            }
-            return buffer.slice(0, end);
+            return buffer;
         }
         catch (Exception ex) {
-            throw new RuntimeException(ex);
+            long start = segment.start; // fallback when the single map() above fails: re-map in windows of at most MAP_SIZE bytes
+            long end = 0;
+            try {
+                end = segment.fileChannel.size(); // NOTE(review): scans to the end of the file, not to segment.end - confirm this is intended
+            }
+            catch (IOException e) {
+                throw new RuntimeException(e);
+            }
+            MappedByteBuffer buffer = null;
+            ArrayList<ByteBuffer> list = new ArrayList<>();
+            while (start < end) {
+                try {
+                    buffer = segment.fileChannel.map(FileChannel.MapMode.READ_ONLY, start, Math.min(MAP_SIZE, end - start));
+                    // don't split the data in the middle of lines
+                    // find the closest previous newline
+                    int realEnd = buffer.limit() - 1;
+                    while (buffer.get(realEnd) != '\n')
+                        realEnd--;
+
+                    realEnd++;
+                    buffer.limit(realEnd);
+                    start += realEnd;
+                    list.add(buffer.slice(0, realEnd - 1));
+                }
+                catch (Exception e) {
+                    throw new RuntimeException(e); // fail fast: a failure before start is advanced would otherwise repeat forever
+                }
+            }
+            sortedCities = list.stream().parallel().map(CalculateAverage_PanagiotisDrakatos::MappingByteBufferToData).flatMap(MeasurementRepository::get)
+                    .collect(Collectors.toMap(e -> e.cityName, MeasurementRepository.Entry::measurement, MeasurementObject::updateWith, TreeMap::new));
+            return null; // the aggregate was already stored in sortedCities above; the caller receives no buffer on this path
         }
     }
 
@@ -121,38 +144,61 @@ public static ByteBuffer concat(ByteBuffer[] buffers) {
         return all;
     }
 
-    private static Map<String, MeasurementObject> combineMaps(Map<String, MeasurementObject> map1, Map<String, MeasurementObject> map2) {
-        for (var entry : map2.entrySet()) {
-            map1.merge(entry.getKey(), entry.getValue(), MeasurementObject::combine);
-        }
+    private static TreeMap<String, MeasurementObject> combineMaps(Stream<MeasurementRepository.Entry> stream1, Stream<MeasurementRepository.Entry> stream2) {
+        Stream<MeasurementRepository.Entry> resultingStream = Stream.concat(stream1, stream2);
+        return resultingStream.collect(Collectors.toMap(e -> e.cityName, MeasurementRepository.Entry::measurement, MeasurementObject::updateWith, TreeMap::new));
+    }
+
+    private static int longHashStep(final int hash, final long word) {
+        return 31 * hash + (int) (word ^ (word >>> 32));
+    }
+
+    private static final long SEPARATOR_PATTERN = compilePattern((byte) ';');
 
-        return map1;
+    private static long compilePattern(final byte value) {
+        return ((long) value << 56) | ((long) value << 48) | ((long) value << 40) | ((long) value << 32) | ((long) value << 24) | ((long) value << 16)
+                | ((long) value << 8) | (long) value;
     }
 
-    private static Map<String, MeasurementObject> MappingByteBufferToData(ByteBuffer byteBuffer) {
-        Map<String, MeasurementObject> cities = new HashMap<>();
+    private static MeasurementRepository MappingByteBufferToData(ByteBuffer byteBuffer) {
+        MeasurementRepository measurements = new MeasurementRepository();
         ByteBuffer bb = byteBuffer.duplicate();
+
         int start = 0;
-        int end = 0;
-        while (start < bb.limit()) {
-            while (bb.get(end) != ';') {
-                end++;
+        int limit = bb.limit();
+
+        long[] cityNameAsLongArray = new long[16];
+        int[] delimiterPointerAndHash = new int[2];
+
+        bb.order(ByteOrder.nativeOrder());
+        final boolean bufferIsBigEndian = bb.order().equals(ByteOrder.BIG_ENDIAN);
+
+        while ((start = bb.position()) < limit + 1) {
+
+            int delimiterPointer;
+
+            findNextDelimiterAndCalculateHash(bb, SEPARATOR_PATTERN, start, limit, delimiterPointerAndHash, cityNameAsLongArray, bufferIsBigEndian);
+            delimiterPointer = delimiterPointerAndHash[0];
+            // Simple lookup is faster for '\n' (just three options)
+            if (delimiterPointer >= limit) {
+                return measurements;
             }
+            final int cityNameLength = delimiterPointer - start;
+
             int temp_counter = 0;
-            int temp_end = end;
+            int temp_end = delimiterPointer + 1;
             try {
-                bb.position(end);
+                // bb.position(delimiterPointer++);
                 while (bb.get(temp_end) != '\n') {
                     temp_counter++;
                     temp_end++;
                 }
             }
             catch (IndexOutOfBoundsException e) {
-                temp_counter--;
-                temp_end--;
+                // temp_counter--;
+                // temp_end--;
             }
-            ByteBuffer city = bb.slice(start, end - start);
-            ByteBuffer temp = bb.slice(end + 1, temp_counter);
+            ByteBuffer temp = bb.duplicate().slice(delimiterPointer + 1, temp_counter);
             int tempPointer = 0;
             int abs = 1;
             if (temp.get(0) == '-') {
@@ -167,22 +213,141 @@ private static Map<String, MeasurementObject> MappingByteBufferToData(ByteBuffer
                 measuredValue = abs * (temp.get(tempPointer) * 100 + temp.get(tempPointer + 1) * 10 + temp.get(tempPointer + 3) - 5328);
             }
 
-            byte[] citybytes = new byte[city.limit()];
-            city.get(citybytes);
-            String cityName = new String(citybytes, StandardCharsets.UTF_8);
+            measurements.update(cityNameAsLongArray, bb, cityNameLength, delimiterPointerAndHash[1]).updateWith(measuredValue);
+
+            if (temp_end + 1 > limit)
+                return measurements;
+            bb.position(temp_end + 1);
+        }
+        return measurements;
+    }
+
+    private static void findNextDelimiterAndCalculateHash(final ByteBuffer bb, final long pattern, final int start, final int limit, final int[] output,
+                                                          final long[] asLong, final boolean bufferBigEndian) {
+        int hash = 1;
+        int i;
+        int lCnt = 0;
+        for (i = start; i <= limit - 8; i += 8) {
+            long word = bb.getLong(i);
+            if (bufferBigEndian) {
+                word = Long.reverseBytes(word); // Reversing the bytes is the cheapest way to do this
+            }
+            final long match = word ^ pattern;
+            long mask = ((match - 0x0101010101010101L) & ~match) & 0x8080808080808080L;
+
+            if (mask != 0) {
+                final int index = Long.numberOfTrailingZeros(mask) >> 3;
+                output[0] = (i + index);
 
-            // update the map with the new measurement
-            MeasurementObject agg = cities.get(cityName);
-            if (agg == null) {
-                cities.put(cityName, new MeasurementObject(measuredValue, measuredValue, 0, 0).updateWith(measuredValue));
+                final long partialHash = word & ((mask >> 7) - 1);
+                asLong[lCnt] = partialHash;
+                output[1] = longHashStep(hash, partialHash);
+                return;
             }
-            else {
-                cities.put(cityName, agg.updateWith(measuredValue));
+            asLong[lCnt++] = word;
+            hash = longHashStep(hash, word);
+        }
+        // Handle remaining bytes near the limit of the buffer:
+        long partialHash = 0;
+        int len = 0;
+        for (; i < limit; i++) {
+            byte read;
+            if ((read = bb.get(i)) == (byte) pattern) {
+                asLong[lCnt] = partialHash;
+                output[0] = i;
+                output[1] = longHashStep(hash, partialHash);
+                return;
             }
-            start = temp_end + 1;
-            end = temp_end;
+            partialHash = partialHash | ((long) read << (len << 3));
+            len++;
         }
-        return cities;
+        output[0] = limit; // delimiter not found
+    }
+
+    static class MeasurementRepository {
+        private int tableSize = 1 << 20; // can grow in theory, made large enough not to (this is faster)
+        private int tableMask = (tableSize - 1);
+        private int tableLimit = (int) (tableSize * LOAD_FACTOR);
+        private int tableFilled = 0;
+        private static final float LOAD_FACTOR = 0.8f;
+
+        private Entry[] table = new Entry[tableSize];
+
+        record Entry(int hash, long[] nameBytesInLong, String cityName, MeasurementObject measurement) {
+            @Override
+            public String toString() {
+                return cityName + "=" + measurement;
+            }
+        }
+
+        public MeasurementObject update(long[] nameBytesInLong, ByteBuffer bb, int length, int calculatedHash) {
+
+            final int nameBytesInLongLength = 1 + (length >>> 3);
+
+            int index = calculatedHash & tableMask;
+            Entry tableEntry;
+            while ((tableEntry = table[index]) != null
+                    && (tableEntry.hash != calculatedHash || !arrayEquals(tableEntry.nameBytesInLong, nameBytesInLong, nameBytesInLongLength))) { // search for the right spot
+                index = (index + 1) & tableMask;
+            }
+
+            if (tableEntry != null) {
+                return tableEntry.measurement;
+            }
+
+            // --- This is a brand new entry, insert into the hashtable and do the extra calculations (once!) do slower calculations here.
+            MeasurementObject measurement = new MeasurementObject();
+
+            // Now create a string:
+            byte[] buffer = new byte[length];
+            bb.get(buffer, 0, length);
+            String cityName = new String(buffer, 0, length);
+
+            // Store the long[] for faster equals:
+            long[] nameBytesInLongCopy = new long[nameBytesInLongLength];
+            System.arraycopy(nameBytesInLong, 0, nameBytesInLongCopy, 0, nameBytesInLongLength);
+
+            // And add entry:
+            Entry toAdd = new Entry(calculatedHash, nameBytesInLongCopy, cityName, measurement);
+            table[index] = toAdd;
+
+            // Resize the table if filled too much:
+            if (++tableFilled > tableLimit) {
+                resizeTable();
+            }
+
+            return toAdd.measurement;
+        }
+
+        private void resizeTable() {
+            // Grow the table, then re-insert every existing entry by linear probing:
+            Entry[] oldEntries = table;
+            table = new Entry[tableSize <<= 2]; // x4: the size is shifted left by two bits
+            tableMask = (tableSize - 1);
+            tableLimit = (int) (tableSize * LOAD_FACTOR);
+
+            for (Entry entry : oldEntries) {
+                if (entry != null) {
+                    int updatedTableIndex = entry.hash & tableMask;
+                    while (table[updatedTableIndex] != null) {
+                        updatedTableIndex = (updatedTableIndex + 1) & tableMask;
+                    }
+                    table[updatedTableIndex] = entry;
+                }
+            }
+        }
+
+        public Stream<Entry> get() {
+            return Arrays.stream(table).filter(Objects::nonNull);
+        }
+    }
+
+    private static boolean arrayEquals(final long[] a, final long[] b, final int length) {
+        // Compares only the first 'length' words; anything stored beyond that is ignored.
+        int matched = 0;
+        while (matched < length && a[matched] == b[matched])
+            matched++;
+        return matched >= length;
     }
 
     private static final class MeasurementObject {
@@ -202,6 +367,10 @@ public MeasurementObject(int MAX, int MIN, long SUM, int REPEAT) {
         }
 
         public MeasurementObject() {
+            this.MAX = -999;
+            this.MIN = 9999;
+            this.SUM = 0;
+            this.REPEAT = 0;
         }
 
         public MeasurementObject(int MAX, int MIN, long SUM) {
@@ -224,6 +393,15 @@ public static MeasurementObject combine(MeasurementObject m1, MeasurementObject
             return mres;
         }
 
+        public static MeasurementObject updateWith(MeasurementObject m1, MeasurementObject m2) {
+            var mres = new MeasurementObject();
+            mres.MIN = MeasurementObject.min(m1.MIN, m2.MIN);
+            mres.MAX = MeasurementObject.max(m1.MAX, m2.MAX);
+            mres.SUM = m1.SUM + m2.SUM;
+            mres.REPEAT = m1.REPEAT + m2.REPEAT;
+            return mres;
+        }
+
         public MeasurementObject updateWith(int measurement) {
             MIN = MeasurementObject.min(MIN, measurement);
             MAX = MeasurementObject.max(MAX, measurement);
@@ -268,4 +446,4 @@ public String toString() {
             return round(MIN) + "/" + round((1.0 * SUM) / REPEAT) + "/" + round(MAX);
         }
     }
-}
+}
\ No newline at end of file

From bec0cef2d3cd0c0d5d30b66bc58f351dcc912681 Mon Sep 17 00:00:00 2001
From: Diego Parra <diegolparra@gmail.com>
Date: Thu, 1 Feb 2024 08:06:28 -0300
Subject: [PATCH 252/268] dpsoft: first submission (#572)

* dpsoft: first submission

* minor clean up

* map with linear probing

* clean up

* update prepare

* clean up

* remove string format

* add credits

* fix format

* use prepare.sh

* graal 21.0.2

* fix differences

* clean up

* underflow protection

* improve segments generation logic

* clean up

* remove unnecessary alignment in findsegment

* new try

* fix number of segments
---
 calculate_average_dpsoft.sh                   |  20 ++
 prepare_dpsoft.sh                             |  20 ++
 .../onebrc/CalculateAverage_dpsoft.java       | 324 ++++++++++++++++++
 3 files changed, 364 insertions(+)
 create mode 100755 calculate_average_dpsoft.sh
 create mode 100755 prepare_dpsoft.sh
 create mode 100644 src/main/java/dev/morling/onebrc/CalculateAverage_dpsoft.java

diff --git a/calculate_average_dpsoft.sh b/calculate_average_dpsoft.sh
new file mode 100755
index 000000000..fd4d4634b
--- /dev/null
+++ b/calculate_average_dpsoft.sh
@@ -0,0 +1,20 @@
+#!/bin/sh
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+JAVA_OPTS="--enable-preview -XX:+UnlockExperimentalVMOptions -XX:-EnableJVMCI -XX:+UseEpsilonGC -Xms128m -Xmx128m -XX:+AlwaysPreTouch -XX:+UseTransparentHugePages -XX:-TieredCompilation -XX:+TrustFinalNonStaticFields"
+
+java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_dpsoft
\ No newline at end of file
diff --git a/prepare_dpsoft.sh b/prepare_dpsoft.sh
new file mode 100755
index 000000000..5e6393e94
--- /dev/null
+++ b/prepare_dpsoft.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+# Use SDKMAN to select the GraalVM JDK for this entry
+source "$HOME/.sdkman/bin/sdkman-init.sh"
+sdk use java 21.0.2-graal 1>&2
\ No newline at end of file
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_dpsoft.java b/src/main/java/dev/morling/onebrc/CalculateAverage_dpsoft.java
new file mode 100644
index 000000000..671d8bab7
--- /dev/null
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_dpsoft.java
@@ -0,0 +1,324 @@
+/*
+ *  Copyright 2023 The original authors
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package dev.morling.onebrc;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.RandomAccessFile;
+import java.nio.ByteOrder;
+import java.nio.MappedByteBuffer;
+import java.nio.channels.FileChannel;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Path;
+import java.nio.file.StandardOpenOption;
+import java.util.*;
+import java.util.concurrent.Phaser;
+
+public class CalculateAverage_dpsoft {
+    private static final String FILE = "./measurements.txt";
+    private static final int MAX_ROWS = 1 << 15;
+    private static final int ROWS_MASK = MAX_ROWS - 1;
+
+    public static void main(String[] args) throws IOException {
+        final var cpus = Runtime.getRuntime().availableProcessors();
+        final var segments = getMemorySegments(cpus);
+        final var tasks = new MeasurementExtractor[segments.size()];
+        final var phaser = new Phaser(segments.size()); // one party per worker
+
+        for (int i = 0; i < segments.size(); i++) {
+            tasks[i] = new MeasurementExtractor(segments.get(i), phaser); // the constructor starts the worker thread
+        }
+
+        phaser.awaitAdvance(0); // wait on the fixed initial phase: getPhase() may already be 1 if every worker has arrived
+
+        final var allMeasurements = Arrays.stream(tasks)
+                .parallel()
+                .map(MeasurementExtractor::getMeasurements)
+                .reduce(MeasurementMap::merge)
+                .orElseThrow();
+
+        System.out.println(sortSequentially(allMeasurements));
+
+        System.exit(0);
+    }
+
+    private static Map<String, Measurement> sortSequentially(MeasurementMap allMeasurements) {
+        final Map<String, Measurement> sorted = new TreeMap<>();
+        for (Measurement m : allMeasurements.measurements) {
+            if (m != null) {
+                sorted.put(new String(m.name, StandardCharsets.UTF_8), m);
+            }
+        }
+        return sorted;
+    }
+
+    // Inspired by @spullara
+    private static List<FileSegment> getMemorySegments(int numberOfSegments) throws IOException {
+        var file = new File(FILE);
+        long fileSize = file.length();
+        long segmentSize = fileSize / numberOfSegments;
+        List<FileSegment> segments = new ArrayList<>(numberOfSegments);
+
+        if (fileSize < 1_000_000) {
+            segments.add(new FileSegment(0, fileSize));
+            return segments;
+        }
+
+        while (segmentSize >= Integer.MAX_VALUE) {
+            numberOfSegments += 1;
+            segmentSize = fileSize / numberOfSegments;
+        }
+
+        try (RandomAccessFile randomAccessFile = new RandomAccessFile(file, "r")) {
+            for (int i = 0; i < numberOfSegments; i++) {
+                long segStart = i * segmentSize;
+                long segEnd = (i == numberOfSegments - 1) ? fileSize : segStart + segmentSize;
+                segStart = findSegment(i, 0, randomAccessFile, segStart, segEnd);
+                segEnd = findSegment(i, numberOfSegments - 1, randomAccessFile, segEnd, fileSize);
+
+                segments.add(new FileSegment(segStart, segEnd));
+            }
+        }
+        return segments;
+    }
+
+    private static long findSegment(int i, int skipSegment, RandomAccessFile raf, long location, long fileSize) throws IOException {
+        if (i != skipSegment) {
+            raf.seek(location);
+            while (location < fileSize) {
+                location++;
+                if (raf.read() == '\n')
+                    break;
+            }
+        }
+        return location;
+    }
+
+    record FileSegment(long start, long end) {
+    }
+
+    static final class MeasurementExtractor implements Runnable {
+        private final FileSegment segment;
+        private final Phaser phaser;
+        private final MeasurementMap measurements = new MeasurementMap();
+
+        MeasurementExtractor(FileSegment memorySegment, Phaser phaser) {
+            this.segment = memorySegment;
+            this.phaser = phaser;
+            (new Thread(this)).start();
+        }
+
+        @Override
+        public void run() {
+            long segmentEnd = segment.end();
+            try (var fileChannel = FileChannel.open(Path.of(FILE), StandardOpenOption.READ)) {
+                var mbb = fileChannel.map(FileChannel.MapMode.READ_ONLY, segment.start(), segmentEnd - segment.start());
+                mbb.order(ByteOrder.nativeOrder());
+
+                // Segment boundaries are already moved to just after a '\n' by findSegment, so the
+                // mapping starts on a complete row. Skipping a "partial" first line here would
+                // silently drop one real measurement from every segment except the first.
+
+                while (mbb.remaining() > 0 && mbb.position() <= segmentEnd) {
+                    int pos = mbb.position();
+                    int nameHash = hashAndRewind(mbb);
+                    var m = measurements.getOrCompute(nameHash, mbb, pos);
+                    int temp = readTemperatureFromBuffer(mbb);
+
+                    m.sample(temp);
+                }
+            }
+            catch (IOException e) {
+                throw new RuntimeException("Error reading file", e);
+            }
+            finally {
+                phaser.arriveAndAwaitAdvance();
+            }
+        }
+
+        // inspired by @lawrey
+        private static int hashAndRewind(MappedByteBuffer mbb) {
+            // Reads the name four bytes per getInt() and XOR-folds every byte that precedes ';' into the hash.
+            int nameHash = 0;
+            int cursor = mbb.position();
+            scan: for (;;) {
+                final int chunk = mbb.getInt();
+                for (int shift = 0; shift < 32; shift += 8) {
+                    final int octet = (chunk >>> shift) & 0xFF;
+                    if (octet == ';') {
+                        cursor += (shift >>> 3) + 1;
+                        break scan;
+                    }
+                    nameHash ^= octet * 82805;
+                }
+                cursor += 4;
+            }
+            // getInt() may have consumed bytes beyond the separator; park the buffer on the byte right after ';'.
+            mbb.position(cursor);
+            return nameHash;
+        }
+
+        private static int readTemperatureFromBuffer(MappedByteBuffer mbb) {
+            int temp = 0; // digits accumulated so far; the fractional digit is appended too, so no decimal point is kept
+            boolean negative = false;
+
+            outer: while (mbb.remaining() > 0) {
+                int b = mbb.get();
+                switch (b) {
+                    case '-':
+                        negative = true;
+                        break;
+                    default: // every other byte is treated as a decimal digit
+                        temp = 10 * temp + (b - '0');
+                        break;
+                    case '.':
+                        b = mbb.get(); // the single digit that follows the decimal point
+                        temp = 10 * temp + (b - '0'); // no break: intentionally falls through
+                    case '\r':
+                        mbb.get(); // consumes one more byte (presumably the line terminator); no break: falls through
+                    case '\n':
+                        break outer;
+                }
+            }
+            if (negative)
+                temp = -temp;
+            return temp;
+        }
+
+        public MeasurementMap getMeasurements() {
+            return measurements;
+        }
+
+        // Skips to the first line in the buffer, used for chunk processing.
+        private static void skipToFirstLine(MappedByteBuffer mbb) {
+            while ((mbb.get() & 0xFF) >= ' ') {
+                // Skip bytes until reaching the start of a line.
+            }
+        }
+    }
+
+    // credits to @shipilev
+    static class MeasurementMap {
+        private final Measurement[] measurements = new Measurement[MAX_ROWS];
+
+        public Measurement getOrCompute(int hash, MappedByteBuffer mbb, int position) {
+            int index = hash & ROWS_MASK;
+            var measurement = measurements[index];
+            if (measurement != null && hash == measurement.nameHash && Measurement.equalsTo(measurement.name, mbb, position)) {
+                return measurement;
+            }
+            else {
+                return compute(hash, mbb, position);
+            }
+        }
+
+        private Measurement compute(int hash, MappedByteBuffer mbb, int position) {
+            var index = hash & ROWS_MASK;
+            Measurement m;
+
+            while (true) {
+                m = measurements[index];
+                if (m == null || (hash == m.nameHash && Measurement.equalsTo(m.name, mbb, position))) {
+                    break;
+                }
+                index = (index + 1) & ROWS_MASK;
+            }
+
+            if (m == null) {
+                int len = mbb.position() - position - 1;
+                byte[] bytes = new byte[len];
+                mbb.position(position);
+                mbb.get(bytes, 0, len);
+                mbb.get();
+                measurements[index] = m = new Measurement(bytes, hash);
+            }
+
+            return m;
+        }
+
+        public MeasurementMap merge(MeasurementMap otherMap) {
+            for (Measurement other : otherMap.measurements) {
+                if (other == null)
+                    continue;
+                int index = other.nameHash & ROWS_MASK;
+                while (true) {
+                    Measurement m = measurements[index];
+                    if (m == null) {
+                        measurements[index] = other;
+                        break;
+                    }
+                    else if (Arrays.equals(m.name, other.name)) {
+                        m.merge(other);
+                        break;
+                    }
+                    else {
+                        index = (index + 1) & ROWS_MASK;
+                    }
+                }
+            }
+            return this;
+        }
+    }
+
+    static final class Measurement {
+        public final int nameHash;
+        public final byte[] name;
+
+        public long sum;
+        public int count = 0;
+        public int min = Integer.MAX_VALUE;
+        public int max = Integer.MIN_VALUE;
+
+        public Measurement(byte[] name, int nameHash) {
+            this.name = name;
+            this.nameHash = nameHash;
+        }
+
+        public static boolean equalsTo(byte[] name, MappedByteBuffer mbb, int position) {
+            int len = mbb.position() - position - 1;
+            if (len != name.length)
+                return false;
+            for (int i = 0; i < len; i++) {
+                if (name[i] != mbb.get(position + i))
+                    return false;
+            }
+            return true;
+        }
+
+        public void sample(int temp) {
+            min = Math.min(min, temp);
+            max = Math.max(max, temp);
+            sum += temp;
+            count++;
+        }
+
+        public Measurement merge(Measurement m2) {
+            min = Math.min(min, m2.min);
+            max = Math.max(max, m2.max);
+            sum += m2.sum;
+            count += m2.count;
+            return this;
+        }
+
+        public String toString() {
+            return round(((double) min) / 10.0) + "/" + round((((double) sum) / 10.0) / count) + "/" + round(((double) max) / 10.0);
+        }
+
+        private static double round(double value) {
+            return Math.round(value * 10.0) / 10.0;
+        }
+    }
+}
\ No newline at end of file

From 101993f06d1e63e3d56ab57483ff11a3349c47aa Mon Sep 17 00:00:00 2001
From: Anita SV <anitasvasu@gmail.com>
Date: Thu, 1 Feb 2024 03:15:23 -0800
Subject: [PATCH 253/268] CA_vaidhy final changes. (#708)

---
 .../onebrc/CalculateAverage_vaidhy.java       | 367 +++++++++++++-----
 1 file changed, 272 insertions(+), 95 deletions(-)

diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_vaidhy.java b/src/main/java/dev/morling/onebrc/CalculateAverage_vaidhy.java
index 5795077b3..f63374a10 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_vaidhy.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_vaidhy.java
@@ -21,6 +21,7 @@
 import java.lang.foreign.Arena;
 import java.lang.reflect.Field;
 import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
 import java.nio.channels.FileChannel;
 import java.nio.charset.StandardCharsets;
 import java.nio.file.Path;
@@ -37,69 +38,149 @@ public class CalculateAverage_vaidhy<I, T> {
 
     private static final class HashEntry {
         private long startAddress;
-        private long endAddress;
+        private long keyLength;
         private long suffix;
-        private int hash;
-
+        private int next;
         IntSummaryStatistics value;
     }
 
     private static class PrimitiveHashMap {
         private final HashEntry[] entries;
+        private final long[] hashes;
+
         private final int twoPow;
+        private int next = -1;
 
         PrimitiveHashMap(int twoPow) {
             this.twoPow = twoPow;
             this.entries = new HashEntry[1 << twoPow];
+            this.hashes = new long[1 << twoPow];
             for (int i = 0; i < entries.length; i++) {
                 this.entries[i] = new HashEntry();
             }
         }
 
-        public HashEntry find(long startAddress, long endAddress, long suffix, int hash) {
+        public IntSummaryStatistics find(long startAddress, long endAddress, long hash, long suffix) {
             int len = entries.length;
-            int i = (hash ^ (hash >> twoPow)) & (len - 1);
+            int h = Long.hashCode(hash);
+            int initialIndex = (h ^ (h >> twoPow)) & (len - 1);
+            int i = initialIndex;
+            long lookupLength = endAddress - startAddress;
 
-            do {
+            long hashEntry = hashes[i];
+
+            if (hashEntry == hash) {
                 HashEntry entry = entries[i];
-                if (entry.value == null) {
-                    return entry;
+                if (lookupLength <= 7) {
+                    // This works because
+                    // hash = suffix , when simpleHash is just xor.
+                    // Since length is not 8, suffix will have a 0 at the end.
+                    // Since utf-8 strings can't have 0 in middle of a string this means
+                    // we can stop here.
+                    return entry.value;
                 }
-                if (entry.hash == hash) {
-                    long entryLength = entry.endAddress - entry.startAddress;
-                    long lookupLength = endAddress - startAddress;
-                    if ((entryLength == lookupLength) && (entry.suffix == suffix)) {
-                        boolean found = compareEntryKeys(startAddress, endAddress, entry);
-
-                        if (found) {
-                            return entry;
-                        }
+                boolean found = (entry.suffix == suffix &&
+                        compareEntryKeys(startAddress, endAddress, entry.startAddress));
+                if (found) {
+                    return entry.value;
+                }
+            }
+
+            if (hashEntry == 0) {
+                HashEntry entry = entries[i];
+                entry.startAddress = startAddress;
+                entry.keyLength = lookupLength;
+                hashes[i] = hash;
+                entry.suffix = suffix;
+                entry.next = next;
+                this.next = i;
+                entry.value = new IntSummaryStatistics();
+                return entry.value;
+            }
+
+            i++;
+            if (i == len) {
+                i = 0;
+            }
+
+            if (i == initialIndex) {
+                return null;
+            }
+
+            do {
+                hashEntry = hashes[i];
+                if (hashEntry == hash) {
+                    HashEntry entry = entries[i];
+                    if (lookupLength <= 7) {
+                        return entry.value;
+                    }
+                    boolean found = (entry.suffix == suffix &&
+                            compareEntryKeys(startAddress, endAddress, entry.startAddress));
+                    if (found) {
+                        return entry.value;
                     }
                 }
+                if (hashEntry == 0) {
+                    HashEntry entry = entries[i];
+                    entry.startAddress = startAddress;
+                    entry.keyLength = lookupLength;
+                    hashes[i] = hash;
+                    entry.suffix = suffix;
+                    entry.next = next;
+                    this.next = i;
+                    entry.value = new IntSummaryStatistics();
+                    return entry.value;
+                }
+
                 i++;
                 if (i == len) {
                     i = 0;
                 }
-            } while (i != hash);
+            } while (i != initialIndex);
             return null;
         }
 
-        private static boolean compareEntryKeys(long startAddress, long endAddress, HashEntry entry) {
-            long entryIndex = entry.startAddress;
+        private static boolean compareEntryKeys(long startAddress, long endAddress, long entryStartAddress) {
+            long entryIndex = entryStartAddress;
             long lookupIndex = startAddress;
+            long endAddressStop = endAddress - 7;
 
-            for (; (lookupIndex + 7) < endAddress; lookupIndex += 8) {
+            for (; lookupIndex < endAddressStop; lookupIndex += 8) {
                 if (UNSAFE.getLong(entryIndex) != UNSAFE.getLong(lookupIndex)) {
                     return false;
                 }
                 entryIndex += 8;
             }
+
             return true;
         }
+
+        public Iterable<HashEntry> entrySet() {
+            return () -> new Iterator<>() {
+                int scan = next;
+
+                @Override
+                public boolean hasNext() {
+                    return scan != -1;
+                }
+
+                @Override
+                public HashEntry next() {
+                    HashEntry entry = entries[scan];
+                    scan = entry.next;
+                    return entry;
+                }
+            };
+        }
     }
 
     private static final String FILE = "./measurements.txt";
 
+    private static long simpleHash(long hash, long nextData) {
+        return hash ^ nextData;
+        // return (hash ^ Long.rotateLeft((nextData * C1), R1)) * C2;
+    }
+
     private static Unsafe initUnsafe() {
         try {
             Field theUnsafe = Unsafe.class.getDeclaredField("theUnsafe");
@@ -145,7 +226,7 @@ private static int parseDouble(long startAddress, long endAddress) {
 
     interface MapReduce<I> {
 
-        void process(long keyStartAddress, long keyEndAddress, int hash, int temperature, long suffix);
+        void process(long keyStartAddress, long keyEndAddress, long hash, long suffix, int temperature);
 
         I result();
     }
@@ -173,9 +254,13 @@ static class LineStream {
         private final long chunkEnd;
 
         private long position;
-        private int hash;
+        private long hash;
+
         private long suffix;
-        byte[] b = new byte[4];
+
+        private final ByteBuffer buf = ByteBuffer
+                .allocate(8)
+                .order(ByteOrder.LITTLE_ENDIAN);
 
         public LineStream(FileService fileService, long offset, long chunkSize) {
             long fileStart = fileService.address();
@@ -186,50 +271,38 @@ public LineStream(FileService fileService, long offset, long chunkSize) {
         }
 
         public boolean hasNext() {
-            return position <= chunkEnd && position < fileEnd;
+            return position <= chunkEnd;
         }
 
         public long findSemi() {
-            int h = 0;
-            long s = 0;
-            long i = position;
-            while ((i + 3) < fileEnd) {
-                // Adding 16 as it is the offset for primitive arrays
-                ByteBuffer.wrap(b).putInt(UNSAFE.getInt(i));
-
-                if (b[3] == 0x3B) {
-                    break;
-                }
-                i++;
-                h = ((h << 5) - h) ^ b[3];
-                s = (s << 8) ^ b[3];
+            long h = 0;
+            buf.rewind();
 
-                if (b[2] == 0x3B) {
-                    break;
+            for (long i = position; i < fileEnd; i++) {
+                byte ch = UNSAFE.getByte(i);
+                if (ch == ';') {
+                    int discard = buf.remaining();
+                    buf.rewind();
+                    long nextData = (buf.getLong() << discard) >>> discard;
+                    this.suffix = nextData;
+                    this.hash = simpleHash(h, nextData);
+                    position = i + 1;
+                    return i;
                 }
-                i++;
-                h = ((h << 5) - h) ^ b[2];
-                s = (s << 8) ^ b[2];
-
-                if (b[1] == 0x3B) {
-                    break;
+                if (buf.hasRemaining()) {
+                    buf.put(ch);
                 }
-                i++;
-                h = ((h << 5) - h) ^ b[1];
-                s = (s << 8) ^ b[1];
-
-                if (b[0] == 0x3B) {
-                    break;
+                else {
+                    buf.flip();
+                    long nextData = buf.getLong();
+                    h = simpleHash(h, nextData);
+                    buf.rewind();
                 }
-                i++;
-                h = ((h << 5) - h) ^ b[0];
-                s = (s << 8) ^ b[0];
             }
-
             this.hash = h;
-            this.suffix = s;
-            position = i + 1;
-            return i;
+            this.suffix = buf.getLong();
+            position = fileEnd;
+            return fileEnd;
         }
 
         public long skipLine() {
@@ -258,7 +331,94 @@ public long findTemperature() {
         }
     }
 
-    private void worker(long offset, long chunkSize, MapReduce<I> lineConsumer) {
+    private static final long START_BYTE_INDICATOR = 0x0101_0101_0101_0101L;
+    private static final long END_BYTE_INDICATOR = START_BYTE_INDICATOR << 7;
+
+    private static final long NEW_LINE_DETECTION = START_BYTE_INDICATOR * '\n';
+
+    private static final long SEMI_DETECTION = START_BYTE_INDICATOR * ';';
+
+    private static final long ALL_ONES = 0xffff_ffff_ffff_ffffL;
+
+    private long findByteOctet(long data, long pattern) {
+        long match = data ^ pattern;
+        return (match - START_BYTE_INDICATOR) & ((~match) & END_BYTE_INDICATOR);
+    }
+
+    private void bigWorker(long offset, long chunkSize, MapReduce<I> lineConsumer) {
+        long chunkStart = offset + fileService.address();
+        long chunkEnd = chunkStart + chunkSize;
+        long fileEnd = fileService.address() + fileService.length();
+        long stopPoint = Math.min(chunkEnd + 1, fileEnd);
+
+        boolean skip = offset != 0;
+        for (long position = chunkStart; position < stopPoint;) {
+            if (skip) {
+                long data = UNSAFE.getLong(position);
+                long newLineMask = findByteOctet(data, NEW_LINE_DETECTION);
+                if (newLineMask != 0) {
+                    int newLinePosition = Long.numberOfTrailingZeros(newLineMask) >>> 3;
+                    skip = false;
+                    position = position + newLinePosition + 1;
+                }
+                else {
+                    position = position + 8;
+                }
+                continue;
+            }
+
+            long stationStart = position;
+            long stationEnd = -1;
+            long hash = 0;
+            long suffix = 0;
+            do {
+                long data = UNSAFE.getLong(position);
+                long semiMask = findByteOctet(data, SEMI_DETECTION);
+                if (semiMask != 0) {
+                    int semiPosition = Long.numberOfTrailingZeros(semiMask) >>> 3;
+                    stationEnd = position + semiPosition;
+                    position = stationEnd + 1;
+
+                    if (semiPosition != 0) {
+                        suffix = data & (ALL_ONES >>> (64 - (semiPosition << 3)));
+                    }
+                    else {
+                        suffix = UNSAFE.getLong(position - 8);
+                    }
+                    hash = simpleHash(hash, suffix);
+                    break;
+                }
+                else {
+                    hash = simpleHash(hash, data);
+                    position = position + 8;
+                }
+            } while (true);
+
+            int temperature = 0;
+            {
+                byte ch = UNSAFE.getByte(position++);
+                boolean negative = false;
+                if (ch == '-') {
+                    negative = true;
+                    ch = UNSAFE.getByte(position++);
+                }
+                do {
+                    if (ch != '.') {
+                        temperature *= 10;
+                        temperature += (ch ^ '0');
+                    }
+                    ch = UNSAFE.getByte(position++);
+                } while (ch != '\n');
+                if (negative) {
+                    temperature = -temperature;
+                }
+            }
+
+            lineConsumer.process(stationStart, stationEnd, hash, suffix, temperature);
+        }
+    }
+
+    private void smallWorker(long offset, long chunkSize, MapReduce<I> lineConsumer) {
         LineStream lineStream = new LineStream(fileService, offset, chunkSize);
 
         if (offset != 0) {
@@ -274,29 +434,58 @@ private void worker(long offset, long chunkSize, MapReduce<I> lineConsumer) {
         while (lineStream.hasNext()) {
             long keyStartAddress = lineStream.position;
             long keyEndAddress = lineStream.findSemi();
-            long keySuffix = lineStream.suffix;
-            int keyHash = lineStream.hash;
+            long keyHash = lineStream.hash;
+            long suffix = lineStream.suffix;
             long valueStartAddress = lineStream.position;
             long valueEndAddress = lineStream.findTemperature();
             int temperature = parseDouble(valueStartAddress, valueEndAddress);
-            lineConsumer.process(keyStartAddress, keyEndAddress, keyHash, temperature, keySuffix);
+            // System.out.println("Small worker!");
+            lineConsumer.process(keyStartAddress, keyEndAddress, keyHash, suffix, temperature);
         }
     }
 
-    public T master(long chunkSize, ExecutorService executor) {
-        long len = fileService.length();
+    // file size = 7
+    // (0,0) (0,0) small chunk= (0,7)
+    // a;0.1\n
+
+    public T master(int shards, ExecutorService executor) {
         List<Future<I>> summaries = new ArrayList<>();
+        long len = fileService.length();
+
+        if (len > 128) {
+            long bigChunk = Math.floorDiv(len, shards);
+            long bigChunkReAlign = bigChunk & 0xffff_ffff_ffff_fff8L;
+
+            long smallChunkStart = bigChunkReAlign * shards;
+            long smallChunkSize = len - smallChunkStart;
+
+            for (long offset = 0; offset < smallChunkStart; offset += bigChunkReAlign) {
+                MapReduce<I> mr = chunkProcessCreator.get();
+                final long transferOffset = offset;
+                Future<I> task = executor.submit(() -> {
+                    bigWorker(transferOffset, bigChunkReAlign, mr);
+                    return mr.result();
+                });
+                summaries.add(task);
+            }
+
+            MapReduce<I> mrLast = chunkProcessCreator.get();
+            Future<I> lastTask = executor.submit(() -> {
+                smallWorker(smallChunkStart, smallChunkSize - 1, mrLast);
+                return mrLast.result();
+            });
+            summaries.add(lastTask);
+        }
+        else {
 
-        for (long offset = 0; offset < len; offset += chunkSize) {
-            long workerLength = Math.min(len, offset + chunkSize) - offset;
-            MapReduce<I> mr = chunkProcessCreator.get();
-            final long transferOffset = offset;
-            Future<I> task = executor.submit(() -> {
-                worker(transferOffset, workerLength, mr);
-                return mr.result();
+            MapReduce<I> mrLast = chunkProcessCreator.get();
+            Future<I> lastTask = executor.submit(() -> {
+                smallWorker(0, len - 1, mrLast);
+                return mrLast.result();
             });
-            summaries.add(task);
+            summaries.add(lastTask);
         }
+
         List<I> summariesDone = summaries.stream()
                 .map(task -> {
                     try {
@@ -336,22 +525,12 @@ public long address() {
     private static class ChunkProcessorImpl implements MapReduce<PrimitiveHashMap> {
 
         // 1 << 14 > 10,000 so it works
-        private final PrimitiveHashMap statistics = new PrimitiveHashMap(14);
+        private final PrimitiveHashMap statistics = new PrimitiveHashMap(15);
 
         @Override
-        public void process(long keyStartAddress, long keyEndAddress, int hash, int temperature, long suffix) {
-            HashEntry entry = statistics.find(keyStartAddress, keyEndAddress, suffix, hash);
-            if (entry == null) {
-                throw new IllegalStateException("Hash table too small :(");
-            }
-            if (entry.value == null) {
-                entry.startAddress = keyStartAddress;
-                entry.endAddress = keyEndAddress;
-                entry.suffix = suffix;
-                entry.hash = hash;
-                entry.value = new IntSummaryStatistics();
-            }
-            entry.value.accept(temperature);
+        public void process(long keyStartAddress, long keyEndAddress, long hash, long suffix, int temperature) {
+            IntSummaryStatistics stats = statistics.find(keyStartAddress, keyEndAddress, hash, suffix);
+            stats.accept(temperature);
         }
 
         @Override
@@ -368,13 +547,10 @@ public static void main(String[] args) throws IOException {
                 ChunkProcessorImpl::new,
                 CalculateAverage_vaidhy::combineOutputs);
 
-        int proc = 2 * Runtime.getRuntime().availableProcessors();
-
-        long fileSize = diskFileService.length();
-        long chunkSize = Math.ceilDiv(fileSize, proc);
+        int proc = Runtime.getRuntime().availableProcessors();
 
         ExecutorService executor = Executors.newFixedThreadPool(proc);
-        Map<String, IntSummaryStatistics> output = calculateAverageVaidhy.master(chunkSize, executor);
+        Map<String, IntSummaryStatistics> output = calculateAverageVaidhy.master(2 * proc, executor);
         executor.shutdown();
 
         Map<String, String> outputStr = toPrintMap(output);
@@ -395,11 +571,12 @@ private static Map<String, String> toPrintMap(Map<String, IntSummaryStatistics>
     private static Map<String, IntSummaryStatistics> combineOutputs(
                                                                     List<PrimitiveHashMap> list) {
 
-        Map<String, IntSummaryStatistics> output = new HashMap<>(10000);
+        Map<String, IntSummaryStatistics> output = HashMap.newHashMap(10000);
         for (PrimitiveHashMap map : list) {
-            for (HashEntry entry : map.entries) {
+            for (HashEntry entry : map.entrySet()) {
                 if (entry.value != null) {
-                    String keyStr = unsafeToString(entry.startAddress, entry.endAddress);
+                    String keyStr = unsafeToString(entry.startAddress,
+                            entry.startAddress + entry.keyLength);
 
                     output.compute(keyStr, (ignore, val) -> {
                         if (val == null) {

From 75bece5364990bc1464a91ab18d3ea27e9d9cedc Mon Sep 17 00:00:00 2001
From: yourwass <157275797+yourwass@users.noreply.github.com>
Date: Thu, 1 Feb 2024 13:25:58 +0200
Subject: [PATCH 254/268] improved 2nd and final submission (#685)

---
 calculate_average_yourwass.sh                 |   9 +-
 .../onebrc/CalculateAverage_yourwass.java     | 151 +++++++++---------
 2 files changed, 83 insertions(+), 77 deletions(-)

diff --git a/calculate_average_yourwass.sh b/calculate_average_yourwass.sh
index 07284ba76..50e31fb0b 100755
--- a/calculate_average_yourwass.sh
+++ b/calculate_average_yourwass.sh
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 #
 #  Copyright 2023 The original authors
 #
@@ -19,5 +19,8 @@
 # source "$HOME/.sdkman/bin/sdkman-init.sh"
 # sdk use java 21.0.1-graal 1>&2
 
-JAVA_OPTS="--enable-preview --enable-native-access=ALL-UNNAMED --add-modules jdk.incubator.vector"
-java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_yourwass
+JAVA_OPTS="-Xlog:all=off -Djdk.incubator.vector.VECTOR_ACCESS_OOB_CHECK=0 --enable-preview --enable-native-access=ALL-UNNAMED --add-modules jdk.incubator.vector"
+
+eval "exec 3< <({ java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_yourwass; })"
+read <&3 result
+echo -e "$result"
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_yourwass.java b/src/main/java/dev/morling/onebrc/CalculateAverage_yourwass.java
index 0a24b0a7e..ad57b5004 100644
--- a/src/main/java/dev/morling/onebrc/CalculateAverage_yourwass.java
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_yourwass.java
@@ -16,6 +16,8 @@
 package dev.morling.onebrc;
 
 import java.util.TreeMap;
+import java.util.concurrent.locks.Lock;
+import java.util.concurrent.locks.ReentrantLock;
 import java.io.IOException;
 import java.lang.foreign.Arena;
 import java.lang.foreign.MemorySegment;
@@ -31,18 +33,15 @@
 import sun.misc.Unsafe;
 
 public class CalculateAverage_yourwass {
-
     static final class Record {
-        public String city;
-        public long cityAddr;
-        public long cityLength;
-        public int min;
-        public int max;
-        public int count;
-        public long sum;
+        private long cityAddr;
+        private long cityLength;
+        private int min;
+        private int max;
+        private int count;
+        private long sum;
 
         Record(final long cityAddr, final long cityLength) {
-            this.city = null;
             this.cityAddr = cityAddr;
             this.cityLength = cityLength;
             this.min = 1000;
@@ -62,6 +61,8 @@ private Record merge(Record r) {
         }
     }
 
+    private final static Lock _mutex = new ReentrantLock(true);
+    private final static TreeMap<String, Record> aggregateResults = new TreeMap<>();
     private static short lookupDecimal[];
     private static byte lookupFraction[];
     private static byte lookupDotPositive[];
@@ -70,6 +71,8 @@ private Record merge(Record r) {
     private static final VectorSpecies<Byte> SPECIES = ByteVector.SPECIES_PREFERRED;
     private static final int MAXINDEX = (1 << 16) + 10000; // short hash + max allowed cities for collisions at the end :p
     private static final String FILE = "measurements.txt";
+    private static long unsafeResults;
+    private static int RECORDSIZE = 36;
     private static final Unsafe UNSAFE = getUnsafe();
 
     private static Unsafe getUnsafe() {
@@ -113,11 +116,9 @@ public static void main(String[] args) throws IOException, Throwable {
         }
 
         // open file
-        final long fileSize, mmapAddr;
-        try (var fileChannel = FileChannel.open(Path.of(FILE), StandardOpenOption.READ)) {
-            fileSize = fileChannel.size();
-            mmapAddr = fileChannel.map(FileChannel.MapMode.READ_ONLY, 0, fileSize, Arena.global()).address();
-        }
+        final FileChannel fileChannel = FileChannel.open(Path.of(FILE), StandardOpenOption.READ);
+        final long fileSize = fileChannel.size();
+        final long mmapAddr = fileChannel.map(FileChannel.MapMode.READ_ONLY, 0, fileSize, Arena.global()).address();
         // VAS: Virtual Address Space, as a MemorySegment upto and including the mmaped file.
         // If the mmaped MemorySegment is used for Vector creation as is, then there are two problems:
         // 1) fromMemorySegment takes an offset and not an address, so we have to do arithmetic
@@ -127,36 +128,24 @@ public static void main(String[] args) throws IOException, Throwable {
         // XXX there lies the possibility for an out of bounds read at the end of file, which is not handled here.
         VAS = MemorySegment.ofAddress(0).reinterpret(mmapAddr + fileSize + SPECIES.length());
 
-        // start and wait for threads to finish
+        // allocate zero-filled native memory for the per-thread result tables (MAXINDEX slots of RECORDSIZE bytes per thread)
         final int nThreads = Runtime.getRuntime().availableProcessors();
+        unsafeResults = UNSAFE.allocateMemory(RECORDSIZE * MAXINDEX * nThreads);
+        UNSAFE.setMemory(unsafeResults, RECORDSIZE * MAXINDEX * nThreads, (byte) 0);
+
+        // start and wait for threads to finish
         Thread[] threadList = new Thread[nThreads];
-        final Record[][] results = new Record[nThreads][];
         final long chunkSize = fileSize / nThreads;
         for (int i = 0; i < nThreads; i++) {
             final int threadIndex = i;
             final long startAddr = mmapAddr + i * chunkSize;
             final long endAddr = (i == nThreads - 1) ? mmapAddr + fileSize : mmapAddr + (i + 1) * chunkSize;
-            threadList[i] = new Thread(() -> results[threadIndex] = threadMain(threadIndex, startAddr, endAddr, nThreads));
+            threadList[i] = new Thread(() -> threadMain(threadIndex, startAddr, endAddr, nThreads));
             threadList[i].start();
         }
         for (int i = 0; i < nThreads; i++)
             threadList[i].join();
 
-        // aggregate results and sort
-        // TODO have to compare with concurrent-parallel stream structures:
-        // * concurrent hashtable that have to sort afterwards
-        // * concurrent skiplist that is sorted but has O(n) insert
-        // * ..other?
-        final TreeMap<String, Record> aggregateResults = new TreeMap<>();
-        for (int thread = 0; thread < nThreads; thread++) {
-            for (int index = 0; index < MAXINDEX; index++) {
-                Record record = results[thread][index];
-                if (record == null)
-                    continue;
-                aggregateResults.compute(record.city, (k, v) -> (v == null) ? record : v.merge(record));
-            }
-        }
-
         // prepare string and print
         StringBuilder sb = new StringBuilder();
         sb.append("{");
@@ -167,12 +156,13 @@ public static void main(String[] args) throws IOException, Throwable {
             float max = record.max;
             max /= 10.f;
             double avg = Math.round((record.sum * 1.0) / record.count) / 10.;
-            sb.append(record.city).append("=").append(min).append("/").append(avg).append("/").append(max).append(", ");
+            sb.append(entry.getKey()).append("=").append(min).append("/").append(avg).append("/").append(max).append(", ");
         }
         int stringLength = sb.length();
         sb.setCharAt(stringLength - 2, '}');
         sb.setCharAt(stringLength - 1, '\n');
         System.out.print(sb.toString());
+        System.out.close();
     }
 
     private static final boolean citiesDiffer(final long a, final long b, final long len) {
@@ -185,7 +175,7 @@ private static final boolean citiesDiffer(final long a, final long b, final long
         return false;
     }
 
-    private static Record[] threadMain(int id, long startAddr, long endAddr, long nThreads) {
+    private static void threadMain(int id, long startAddr, long endAddr, long nThreads) {
         // snap to newlines
         if (id != 0)
             while (UNSAFE.getByte(startAddr++) != '\n')
@@ -194,23 +184,24 @@ private static Record[] threadMain(int id, long startAddr, long endAddr, long nT
             while (UNSAFE.getByte(endAddr++) != '\n')
                 ;
 
+        final long threadResults = unsafeResults + id * MAXINDEX * RECORDSIZE;
         final Record[] results = new Record[MAXINDEX];
         final long VECTORBYTESIZE = SPECIES.length();
         final ByteOrder BYTEORDER = ByteOrder.nativeOrder();
         final ByteVector delim = ByteVector.broadcast(SPECIES, ';');
-        long nextCityAddr = startAddr; // XXX from these three variables,
-        long cityAddr = nextCityAddr; // only two are necessary, but if one
-        long ptr = 0; // is eliminated, on my pc the benchmark gets worse..
-        while (nextCityAddr < endAddr) {
+        long cityAddr = startAddr;
+        long ptr = 0;
+        while (cityAddr < endAddr) {
             // parse city
-            long mask = ByteVector.fromMemorySegment(SPECIES, VAS, nextCityAddr + ptr, BYTEORDER)
-                    .compare(VectorOperators.EQ, delim).toLong();
-            if (mask == 0) {
+            ByteVector parsed = ByteVector.fromMemorySegment(SPECIES, VAS, cityAddr, BYTEORDER);
+            long mask = parsed.compare(VectorOperators.EQ, delim).toLong();
+            while (mask == 0) {
                 ptr += VECTORBYTESIZE;
-                continue;
+                mask = ByteVector.fromMemorySegment(SPECIES, VAS, cityAddr + ptr, BYTEORDER).compare(VectorOperators.EQ, delim).toLong();
             }
             final long cityLength = ptr + Long.numberOfTrailingZeros(mask);
             final long tempAddr = cityAddr + cityLength + 1;
+            ptr = 0;
 
             // compute hash table index
             int index;
@@ -222,67 +213,79 @@ private static Record[] threadMain(int id, long startAddr, long endAddr, long nT
                         & 0xFFFF;
             else
                 index = (UNSAFE.getByte(cityAddr) << 8) & 0xFF00;
-
             // resolve collisions with linear probing
             // use vector api here also, but only if city name fits in one vector length, for faster default case
-            Record record = results[index];
+            long record = threadResults + index * RECORDSIZE;
+            long recordCityLength = UNSAFE.getLong(record);
             if (cityLength <= VECTORBYTESIZE) {
-                ByteVector parsed = ByteVector.fromMemorySegment(SPECIES, VAS, cityAddr, BYTEORDER);
-                while (record != null) {
-                    if (cityLength == record.cityLength) {
-                        long sameMask = ByteVector.fromMemorySegment(SPECIES, VAS, record.cityAddr, BYTEORDER)
+                while (recordCityLength > 0) {
+                    if (cityLength == recordCityLength) {
+                        long sameMask = ByteVector.fromMemorySegment(SPECIES, VAS, UNSAFE.getLong(record + 8), BYTEORDER)
                                 .compare(VectorOperators.EQ, parsed).toLong();
                         if (Long.numberOfTrailingZeros(~sameMask) >= cityLength)
                             break;
                     }
-                    record = results[++index];
+                    index++;
+                    record = threadResults + index * RECORDSIZE;
+                    recordCityLength = UNSAFE.getLong(record);
                 }
             }
             else { // slower normal case for city names with length > VECTORBYTESIZE
-                while (record != null && (cityLength != record.cityLength || citiesDiffer(record.cityAddr, cityAddr, cityLength)))
-                    record = results[++index];
+                while (recordCityLength > 0 && (cityLength != recordCityLength || citiesDiffer(UNSAFE.getLong(record + 8), cityAddr, cityLength))) {
+                    index++;
+                    record = threadResults + index * RECORDSIZE;
+                    recordCityLength = UNSAFE.getLong(record);
+                }
             }
 
-            // add record for new keys
-            // TODO have to avoid memory allocations on hot path
-            if (record == null) {
-                results[index] = new Record(cityAddr, cityLength);
-                record = results[index];
+            // add record for new key: city length at +0, city address at +8, min/max seeded with 1000/-1000 at +16/+20
+            if (recordCityLength == 0) {
+                UNSAFE.putLong(record, cityLength);
+                UNSAFE.putLong(record + 8, cityAddr);
+                UNSAFE.putInt(record + 16, 1000);
+                UNSAFE.putInt(record + 20, -1000);
             }
 
             // parse temp with lookup tables
             int temp;
             if (UNSAFE.getByte(tempAddr) == '-') {
                 temp = -lookupDecimal[UNSAFE.getShort(tempAddr + 1)] - lookupFraction[UNSAFE.getShort(tempAddr + 3)];
-                nextCityAddr = tempAddr + lookupDotNegative[UNSAFE.getShort(tempAddr + 3)];
+                cityAddr = tempAddr + lookupDotNegative[UNSAFE.getShort(tempAddr + 3)];
             }
             else {
                 temp = lookupDecimal[UNSAFE.getShort(tempAddr)] + lookupFraction[UNSAFE.getShort(tempAddr + 2)];
-                nextCityAddr = tempAddr + lookupDotPositive[UNSAFE.getShort(tempAddr + 2)];
+                cityAddr = tempAddr + lookupDotPositive[UNSAFE.getShort(tempAddr + 2)];
             }
-            cityAddr = nextCityAddr;
-            ptr = 0;
 
-            // merge record
-            if (temp < record.min)
-                record.min = temp;
-            if (temp > record.max)
-                record.max = temp;
-            record.sum += temp;
-            record.count += 1;
+            // merge temp into the record in place: min at +16, max at +20, sum at +24, count at +32
+            if (temp < UNSAFE.getInt(record + 16))
+                UNSAFE.putInt(record + 16, temp);
+            if (temp > UNSAFE.getInt(record + 20))
+                UNSAFE.putInt(record + 20, temp);
+            UNSAFE.putLong(record + 24, UNSAFE.getLong(record + 24) + temp);
+            UNSAFE.putInt(record + 32, UNSAFE.getInt(record + 32) + 1);
         }
 
         // create strings from raw data
-        // TODO should avoid this copy
+        // and merge them into the shared aggregateResults TreeMap while holding _mutex
+        int idx = 0;
         byte b[] = new byte[100];
+        _mutex.lock();
         for (int i = 0; i < MAXINDEX; i++) {
-            Record r = results[i];
-            if (r == null)
+            if (UNSAFE.getLong(threadResults + i * RECORDSIZE) == 0)
                 continue;
-            UNSAFE.copyMemory(null, r.cityAddr, b, Unsafe.ARRAY_BYTE_BASE_OFFSET, r.cityLength);
-            r.city = new String(b, 0, (int) r.cityLength, StandardCharsets.UTF_8);
+            final long recordAddress = threadResults + i * RECORDSIZE;
+
+            results[idx] = new Record(UNSAFE.getLong(recordAddress + 8), UNSAFE.getLong(recordAddress));
+            results[idx].min = UNSAFE.getInt(recordAddress + 16);
+            results[idx].max = UNSAFE.getInt(recordAddress + 20);
+            results[idx].sum = UNSAFE.getLong(recordAddress + 24);
+            results[idx].count = UNSAFE.getInt(recordAddress + 32);
+            UNSAFE.copyMemory(null, UNSAFE.getLong(recordAddress + 8), b, Unsafe.ARRAY_BYTE_BASE_OFFSET, UNSAFE.getLong(recordAddress));
+            final Record record = results[idx];
+            aggregateResults.compute(new String(b, 0, (int) results[idx].cityLength, StandardCharsets.UTF_8), (k, v) -> (v == null) ? record : v.merge(record));
+            idx++;
         }
-        return results;
+        _mutex.unlock();
     }
-
 }

From d403bb012c31eb1b97b816f1a22f978e31e1fc93 Mon Sep 17 00:00:00 2001
From: Gunnar Morling <gunnar.morling@googlemail.com>
Date: Thu, 1 Feb 2024 10:48:26 +0100
Subject: [PATCH 255/268] README update

---
 .github/pull_request_template.md | 2 +-
 README.md                        | 7 +++++--
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md
index 2035158dd..ed779442c 100644
--- a/.github/pull_request_template.md
+++ b/.github/pull_request_template.md
@@ -1,4 +1,4 @@
-**NOTE:** This challenge closes for submissions on Jan 31 23:59:59 UTC. No new pull requests for adding submissions will be accepted after that time.
+**NOTE:** The challenge has been closed for new submissions. No new pull requests for adding submissions are accepted at this time.
 Any pending pull requests will be reviewed over the next few days, as described [here](https://github.com/gunnarmorling/1brc/discussions/687).
 The final leader board will be published by Feb 5.
 
diff --git a/README.md b/README.md
index afd721ec8..de48e73f8 100644
--- a/README.md
+++ b/README.md
@@ -1,10 +1,13 @@
 # 1️⃣🐝🏎️ The One Billion Row Challenge
 
-_Status Jan 31: The challenge will close today at midnight UTC. No new pull requests will be accepted after that time. Pending PRs will be evaluated over the next few days. Please don't push any changes to pending PRs after today, unless being asked to do so.
+_Status Feb 1: The challenge has been closed for new submissions. No new pull requests for adding submissions are accepted at this time.
+Pending PRs will be evaluated over the next few days. Please don't push any changes to pending PRs after today, unless being asked to do so.
 This will be the case if I spot an issue during evaluation (failing tests, etc.). In this case, I will comment on the PR, and you are allowed to push one update.
 I will re-evaluate the entry, and if there are still remaining issues, you'll get one more and last opportunity to update the PR.
 If it still is not valid at this point, it will be closed.
-The final leader board will be published on Monday Feb 5._
+The final leader board will be published by Monday Feb 5._
+
+_Status Jan 31: The challenge will close today at midnight UTC._
 
 _Status Jan 12: As there has been such a large number of entries to this challenge so far (100+), and this is becoming hard to manage, please only create new submissions if you expect them to run in 10 seconds or less on the evaluation machine._
 

From dcc551241a554d801e5928bb2e5f5730e815d7bd Mon Sep 17 00:00:00 2001
From: Gunnar Morling <gunnar.morling@googlemail.com>
Date: Thu, 1 Feb 2024 12:29:33 +0100
Subject: [PATCH 256/268] Leader board update

---
 README.md | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index de48e73f8..174c0fc1a 100644
--- a/README.md
+++ b/README.md
@@ -3,6 +3,8 @@
 _Status Feb 1: The challenge has been closed for new submissions. No new pull requests for adding submissions are accepted at this time.
 Pending PRs will be evaluated over the next few days. Please don't push any changes to pending PRs after today, unless being asked to do so.
 This will be the case if I spot an issue during evaluation (failing tests, etc.). In this case, I will comment on the PR, and you are allowed to push one update.
+Only changes strictly needed to fix the bug at hand may be pushed at this point.
+No force-pushes are allowed, so as to make sure I can see which changes have been made.
 I will re-evaluate the entry, and if there are still remaining issues, you'll get one more and last opportunity to update the PR.
 If it still is not valid at this point, it will be closed.
 The final leader board will be published by Monday Feb 5._
@@ -50,20 +52,21 @@ These are the results from running all entries into the challenge on eight cores
 
 | # | Result (m:s.ms) | Implementation     | JDK | Submitter     | Notes     |
 |---|-----------------|--------------------|-----|---------------|-----------|
-| 1 | 00:01.645 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jerrinot.java)| 21.0.2-graal | [Jaromir Hamala](https://github.com/jerrinot) | GraalVM native binary, uses Unsafe |
-| 2* | 00:01.832 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_thomaswue.java)| 21.0.2-graal | [Thomas Wuerthinger](https://github.com/thomaswue), [Quan Anh Mai](https://github.com/merykitty), [Alfonso² Peterssen](https://github.com/mukel) | GraalVM native binary, uses Unsafe |
-| 2* | 00:01.851 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java)| 21.0.2-graal | [Artsiom Korzun](https://github.com/artsiomkorzun) | GraalVM native binary, uses Unsafe |
-| 3 | 00:01.880 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_serkan_ozal.java)| 21.0.1-open | [Serkan ÖZAL](https://github.com/serkan-ozal) | uses Unsafe |
+| 1 | 00:01.535 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_thomaswue.java)| 21.0.2-graal | [Thomas Wuerthinger](https://github.com/thomaswue), [Quan Anh Mai](https://github.com/merykitty), [Alfonso² Peterssen](https://github.com/mukel) | GraalVM native binary, uses Unsafe |
+| 2 | 00:01.587 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java)| 21.0.2-graal | [Artsiom Korzun](https://github.com/artsiomkorzun) | GraalVM native binary, uses Unsafe |
+| 3 | 00:01.608 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jerrinot.java)| 21.0.2-graal | [Jaromir Hamala](https://github.com/jerrinot) | GraalVM native binary, uses Unsafe |
+|   | 00:01.880 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_serkan_ozal.java)| 21.0.1-open | [Serkan ÖZAL](https://github.com/serkan-ozal) | uses Unsafe |
 |   | 00:01.921 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java)| 21.0.2-graal | [Van Phu DO](https://github.com/abeobk) | GraalVM native binary, uses Unsafe |
 |   | 00:02.018 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_stephenvonworley.java)| 21.0.2-graal | [Stephen Von Worley](https://github.com/stephenvonworley) | GraalVM native binary, uses Unsafe |
 |   | 00:02.157 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_royvanrijn.java)| 21.0.2-graal | [Roy van Rijn](https://github.com/royvanrijn) | GraalVM native binary, uses Unsafe |
-|   | 00:02.205 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_tivrfoa.java)| 21.0.2-graal | [tivrfoa](https://github.com/tivrfoa) | GraalVM native binary, uses Unsafe |
 |   | 00:02.319 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_yavuztas.java)| 21.0.2-graal | [Yavuz Tas](https://github.com/yavuztas) | GraalVM native binary, uses Unsafe |
 |   | 00:02.332 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_mtopolnik.java)| 21.0.2-graal | [Marko Topolnik](https://github.com/mtopolnik) | GraalVM native binary, uses Unsafe |
+|   | 00:02.557 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_yourwass.java)| 21.0.1-open | [yourwass](https://github.com/yourwass) | uses Unsafe |
 |   | 00:02.367 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_merykittyunsafe.java)| 21.0.1-open | [Quan Anh Mai](https://github.com/merykitty) | uses Unsafe |
 |   | 00:02.507 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_gonixunsafe.java)| 21.0.1-open | [gonix](https://github.com/gonix) | uses Unsafe |
-|   | 00:02.984 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_yourwass.java)| 21.0.1-open | [yourwass](https://github.com/yourwass) | uses Unsafe |
-|   | 00:03.013 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_linl33.java)| 22.ea.31-open | [Li Lin](https://github.com/linl33) | uses Unsafe |
+|   | 00:02.820 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_linl33.java)| 22.ea.32-open | [Li Lin](https://github.com/linl33) | uses Unsafe |
+|   | 00:02.995 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_tivrfoa.java)| 21.0.2-graal | [tivrfoa](https://github.com/tivrfoa) | GraalVM native binary, uses Unsafe |
+|   | 00:02.997 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_gonix.java)| 21.0.1-open | [gonix](https://github.com/gonix) |  |
 |   | 00:03.095 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_JamalMulla.java)| 21.0.2-graal | [Jamal Mulla](https://github.com/JamalMulla) | GraalVM native binary, uses Unsafe |
 |   | 00:03.210 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_merykitty.java)| 21.0.1-open | [Quan Anh Mai](https://github.com/merykitty) |  |
 |   | 00:03.298 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_vemanaNonIdiomatic.java)| 21.0.1-graal | [Subrahmanyam (non-idiomatic)](https://github.com/vemana) | uses Unsafe |
@@ -72,9 +75,9 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:03.698 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_hundredwatt.java)| 21.0.1-graal | [Jason Nochlin](https://github.com/hundredwatt) |  |
 |   | 00:03.785 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_zerninv.java)| 21.0.2-graal | [zerninv](https://github.com/zerninv) | GraalVM native binary, uses Unsafe |
 |   | 00:03.820 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_iziamos.java)| 21.0.2-graal | [John Ziamos](https://github.com/iziamos) | GraalVM native binary, uses Unsafe |
-|   | 00:03.824 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_gonix.java)| 21.0.1-open | [gonix](https://github.com/gonix) |  |
 |   | 00:03.902 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jparera.java)| 21.0.1-open | [Juan Parera](https://github.com/jparera) |  |
 |   | 00:03.966 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jincongho.java)| 21.0.1-open | [Jin Cong Ho](https://github.com/jincongho) | uses Unsafe |
+|   | 00:03.991 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_vaidhy.java)| 21.0.1-graal | [Vaidhy Mayilrangam](https://github.com/vaidhy) | uses Unsafe |
 |   | 00:04.066 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_JesseVanRooy.java)| 21.0.1-open | [JesseVanRooy](https://github.com/JesseVanRooy) | uses Unsafe |
 |   | 00:04.101 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_JaimePolidura.java)| 21.0.2-graal | [Jaime Polidura](https://github.com/JaimePolidura) | GraalVM native binary, uses Unsafe |
 |   | 00:04.209 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_giovannicuccu.java)| 21.0.1-open | [Giovanni Cuccu](https://github.com/giovannicuccu) |  |
@@ -90,13 +93,15 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:05.142 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_arjenw.java)| 21.0.1-open | [Arjen Wisse](https://github.com/arjenw) |  |
 |   | 00:05.167 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_melgenek.java)| 21.0.2-open | [Yevhenii Melnyk](https://github.com/melgenek) |  |
 |   | 00:05.235 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_unbounded.java)| 21.0.1-open | [unbounded](https://github.com/unbounded) |  |
+|   | 00:05.336 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_EduardoSaverin.java)| java | [Sumit Chaudhary](https://github.com/EduardoSaverin) | uses Unsafe |
 |   | 00:05.354 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_armandino.java)| 21.0.2-graal | [Arman Sharif](https://github.com/armandino) | GraalVM native binary, uses Unsafe |
 |   | 00:05.478 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_obourgain.java)| 21.0.1-open | [Olivier Bourgain](https://github.com/obourgain) | uses Unsafe |
+|   | 00:05.559 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_PanagiotisDrakatos.java)| 21.0.1-graal | [Panagiotis Drakatos](https://github.com/PanagiotisDrakatos) | GraalVM native binary |
 |   | 00:05.887 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_charlibot.java)| 21.0.1-graal | [Charlie Evans](https://github.com/charlibot) | uses Unsafe |
-|   | 00:05.960 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_vaidhy.java)| 21.0.1-graal | [Vaidhy Mayilrangam](https://github.com/vaidhy) | uses Unsafe |
 |   | 00:05.979 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_spullara.java)| 21.0.1-graal | [Sam Pullara](https://github.com/spullara) |  |
 |   | 00:06.166 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_isolgpus.java)| 21.0.1-open | [Jamie Stansfield](https://github.com/isolgpus) |  |
 |   | 00:06.257 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_flippingbits.java)| 21.0.1-graal | [Stefan Sprenger](https://github.com/flippingbits) | uses Unsafe |
+|   | 00:06.392 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_dpsoft.java)| 21.0.2-graal | [Diego Parra](https://github.com/dpsoft) |  |
 |   | 00:06.576 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_as-com.java)| 21.0.1-open | [Andrew Sun](https://github.com/as-com) | uses Unsafe |
 |   | 00:06.635 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_justplainlaake.java)| 21.0.1-graal | [Laake Scates-Gervasi](https://github.com/justplainlaake) | GraalVM native binary, uses Unsafe |
 |   | 00:06.654 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jbachorik.java)| 21.0.1-graal | [Jaroslav Bachorik](https://github.com/jbachorik) |  |
@@ -175,7 +180,6 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:22.334 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_albertoventurini.java)| 21.0.1-open | [Alberto Venturini](https://github.com/albertoventurini) |  |
 |   | 00:22.457 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_rby.java)| 21.0.1-open | [Ramzi Ben Yahya](https://github.com/rby) |  |
 |   | 00:22.471 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_0xshivamagarwal.java)| 21.0.1-open | [Shivam Agarwal](https://github.com/0xshivamagarwal) |  |
-|   | 00:22.687 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_PanagiotisDrakatos.java)| 21.0.1-graal | [Panagiotis Drakatos](https://github.com/PanagiotisDrakatos) | GraalVM native binary |
 |   | 00:24.986 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_kumarsaurav123.java)| 21.0.1-open | [kumarsaurav123](https://github.com/kumarsaurav123) |  |
 |   | 00:25.064 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_sudhirtumati.java)| 21.0.2-open | [Sudhir Tumati](https://github.com/sudhirtumati) |  |
 |   | 00:26.500 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_felix19350.java)| 21.0.1-open | [Bruno Félix](https://github.com/felix19350) |  |

From 1ba9cdcf1552b7dcff8d46a9e9724671dd479fac Mon Sep 17 00:00:00 2001
From: Gunnar Morling <gunnar.morling@googlemail.com>
Date: Thu, 1 Feb 2024 12:35:47 +0100
Subject: [PATCH 257/268] Comparing numbers is hard...

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 174c0fc1a..4fe0ffafc 100644
--- a/README.md
+++ b/README.md
@@ -61,9 +61,9 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:02.157 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_royvanrijn.java)| 21.0.2-graal | [Roy van Rijn](https://github.com/royvanrijn) | GraalVM native binary, uses Unsafe |
 |   | 00:02.319 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_yavuztas.java)| 21.0.2-graal | [Yavuz Tas](https://github.com/yavuztas) | GraalVM native binary, uses Unsafe |
 |   | 00:02.332 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_mtopolnik.java)| 21.0.2-graal | [Marko Topolnik](https://github.com/mtopolnik) | GraalVM native binary, uses Unsafe |
-|   | 00:02.557 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_yourwass.java)| 21.0.1-open | [yourwass](https://github.com/yourwass) | uses Unsafe |
 |   | 00:02.367 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_merykittyunsafe.java)| 21.0.1-open | [Quan Anh Mai](https://github.com/merykitty) | uses Unsafe |
 |   | 00:02.507 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_gonixunsafe.java)| 21.0.1-open | [gonix](https://github.com/gonix) | uses Unsafe |
+|   | 00:02.557 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_yourwass.java)| 21.0.1-open | [yourwass](https://github.com/yourwass) | uses Unsafe |
 |   | 00:02.820 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_linl33.java)| 22.ea.32-open | [Li Lin](https://github.com/linl33) | uses Unsafe |
 |   | 00:02.995 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_tivrfoa.java)| 21.0.2-graal | [tivrfoa](https://github.com/tivrfoa) | GraalVM native binary, uses Unsafe |
 |   | 00:02.997 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_gonix.java)| 21.0.1-open | [gonix](https://github.com/gonix) |  |

From 19d6d845e88b096bc71216d6bb654c5acd1a514c Mon Sep 17 00:00:00 2001
From: Gunnar Morling <gunnar.morling@googlemail.com>
Date: Thu, 1 Feb 2024 13:07:02 +0100
Subject: [PATCH 258/268] Leaderboard update for 10K key set eval

---
 README.md | 49 ++++++++++++++++++++++++++-----------------------
 1 file changed, 26 insertions(+), 23 deletions(-)

diff --git a/README.md b/README.md
index 4fe0ffafc..ff0a5a7f2 100644
--- a/README.md
+++ b/README.md
@@ -263,33 +263,36 @@ Here are the results from running the top 15 entries (as of commit [2c26b511](ht
 #### 10K Key Set
 
 The 1BRC challenge data set contains 413 distinct weather stations, whereas the rules allow for 10,000 different station names to occur.
-Here are the results from running the top 15 entries (as of commit [f1209f2b](https://github.com/gunnarmorling/1brc/commit/f1209f2ba8e286474f08762f9e4f161981e39cee), Jan 27) against 1,000,000,000 measurement values across 10K stations (created via _./create_measurements3.sh 1000000000_),
+Here are the results from running the top 25 entries (as of commit [1ba9cdcf](https://github.com/gunnarmorling/1brc/commit/1ba9cdcf1552b7dcff8d46a9e9724671dd479fac), Feb 1) against 1,000,000,000 measurement values across 10K stations (created via _./create_measurements3.sh 1000000000_),
 using eight cores on the evaluation machine:
 
 | # | Result (m:s.ms) | Implementation     | JDK | Submitter     | Notes     |
 |---|-----------------|--------------------|-----|---------------|-----------|
-| 1 | 00:02.741 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java)| 21.0.2-graal | [Artsiom Korzun](https://github.com/artsiomkorzun) | GraalVM native binary, uses Unsafe |
-| 2 | 00:04.001 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_royvanrijn.java)| 21.0.2-graal | [Roy van Rijn](https://github.com/royvanrijn) | GraalVM native binary, uses Unsafe |
-| 3 | 00:04.516 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_thomaswue.java)| 21.0.2-graal | [Thomas Wuerthinger](https://github.com/thomaswue), [Quan Anh Mai](https://github.com/merykitty), [Alfonso² Peterssen](https://github.com/mukel) | GraalVM native binary, uses Unsafe |
-|   | 00:04.816 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_vemanaNonIdiomatic.java)| 21.0.1-graal | [Subrahmanyam](https://github.com/vemana) | uses Unsafe |
-|   | 00:04.848 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jerrinot.java)| 21.0.1-graal | [Jaromir Hamala](https://github.com/jerrinot) | uses Unsafe |
-|   | 00:05.127 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_mtopolnik.java)| 21.0.1-graal | [Marko Topolnik](https://github.com/mtopolnik) | uses Unsafe |
-|   | 00:05.614 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_gonix.java)| 21.0.1-open | [gonix](https://github.com/gonix) |  |
-|   | 00:05.670 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java)| 21.0.2-graal | [Van Phu DO](https://github.com/abeobk) | GraalVM native binary, uses Unsafe |
-|   | 00:06.111 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ebarlas.java)| 21.0.1-graal | [Elliot Barlas](https://github.com/ebarlas) | uses Unsafe |
-|   | 00:06.929 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_JamalMulla.java)| 21.0.1-graal | [Jamal Mulla](https://github.com/JamalMulla) | uses Unsafe |
-|   | 00:09.018 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_yavuztas.java)| 21.0.1-graal | [Yavuz Tas](https://github.com/yavuztas) | uses Unsafe |
-|   | 00:10.038 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_merykittyunsafe.java)| 21.0.1-open | [merykittyunsafe](https://github.com/merykittyunsafe) | uses Unsafe |
-|   | 00:10.197 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_yourwass.java)| 21.0.1-open | [yourwass](https://github.com/yourwass) | uses Unsafe |
-|   | 00:12.567 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jparera.java)| 21.0.1-open | [Juan Parera](https://github.com/jparera) |  |
-|   | 00:12.602 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_merykitty.java)| 21.0.1-open | [Quan Anh Mai](https://github.com/merykitty) |  |
-|   | 00:15.896 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jincongho.java)| 21.0.1-open | [Jin Cong Ho](https://github.com/jincongho) | uses Unsafe |
-|   | 00:18.064 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_hundredwatt.java)| 21.0.1-graal | [Jason Nochlin](https://github.com/hundredwatt) |  |
-|   | 00:20.374 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_roman-r-m.java)| 21.0.1-graal | [Roman Musin](https://github.com/roman-r-m) | GraalVM native binary |
-|   | ---       | | | | |
-|   | DNF | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_JesseVanRooy.java)| 21.0.1-open | [JesseVanRooy](https://github.com/JesseVanRooy) | Incorrect output |
-|   | DNF | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_iziamos.java)| 21.0.1-open | [John Ziamos](https://github.com/iziamos) | Didn't complete in 60 sec |
-|   | DNF | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_zerninv.java)| 21.0.1-graal | [zerninv](https://github.com/zerninv) | Seg fault |
+| 1 | 00:02.977 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java)| 21.0.2-graal | [Artsiom Korzun](https://github.com/artsiomkorzun) | GraalVM native binary, uses Unsafe |
+| 2 | 00:03.068 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_mtopolnik.java)| 21.0.2-graal | [Marko Topolnik](https://github.com/mtopolnik) | GraalVM native binary, uses Unsafe |
+| 3 | 00:03.175 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_stephenvonworley.java)| 21.0.2-graal | [Stephen Von Worley](https://github.com/stephenvonworley) | GraalVM native binary, uses Unsafe |
+|   | 00:04.022 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_royvanrijn.java)| 21.0.2-graal | [Roy van Rijn](https://github.com/royvanrijn) | GraalVM native binary, uses Unsafe |
+|   | 00:04.047 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jerrinot.java)| 21.0.2-graal | [Jaromir Hamala](https://github.com/jerrinot) | GraalVM native binary, uses Unsafe |
+|   | 00:04.122 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_gonixunsafe.java)| 21.0.1-open | [gonix](https://github.com/gonix) | uses Unsafe |
+|   | 00:04.520 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_tivrfoa.java)| 21.0.2-graal | [tivrfoa](https://github.com/tivrfoa) | GraalVM native binary, uses Unsafe |
+|   | 00:04.655 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_JamalMulla.java)| 21.0.2-graal | [Jamal Mulla](https://github.com/JamalMulla) | GraalVM native binary, uses Unsafe |
+|   | 00:04.708 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_gonix.java)| 21.0.1-open | [gonix](https://github.com/gonix) |  |
+|   | 00:04.797 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_thomaswue.java)| 21.0.2-graal | [Thomas Wuerthinger](https://github.com/thomaswue), [Quan Anh Mai](https://github.com/merykitty), [Alfonso² Peterssen](https://github.com/mukel) | GraalVM native binary, uses Unsafe |
+|   | 00:04.814 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_vemanaNonIdiomatic.java)| 21.0.1-graal | [Subrahmanyam (non-idiomatic)](https://github.com/vemana) | uses Unsafe |
+|   | 00:05.248 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_zerninv.java)| 21.0.2-graal | [zerninv](https://github.com/zerninv) | GraalVM native binary, uses Unsafe |
+|   | 00:05.367 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_yavuztas.java)| 21.0.2-graal | [Yavuz Tas](https://github.com/yavuztas) | GraalVM native binary, uses Unsafe |
+|   | 00:05.894 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ebarlas.java)| 21.0.2-graal | [Elliot Barlas](https://github.com/ebarlas) | GraalVM native binary, uses Unsafe |
+|   | 00:06.014 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java)| 21.0.2-graal | [Van Phu DO](https://github.com/abeobk) | GraalVM native binary, uses Unsafe |
+|   | 00:06.380 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_iziamos.java)| 21.0.2-graal | [John Ziamos](https://github.com/iziamos) | GraalVM native binary, uses Unsafe |
+|   | 00:08.830 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_serkan_ozal.java)| 21.0.1-open | [Serkan ÖZAL](https://github.com/serkan-ozal) | uses Unsafe |
+|   | 00:09.349 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_yourwass.java)| 21.0.1-open | [yourwass](https://github.com/yourwass) | uses Unsafe |
+|   | 00:10.388 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_merykittyunsafe.java)| 21.0.1-open | [Quan Anh Mai](https://github.com/merykitty) | uses Unsafe |
+|   | 00:12.467 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jparera.java)| 21.0.1-open | [Juan Parera](https://github.com/jparera) |  |
+|   | 00:13.225 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_merykitty.java)| 21.0.1-open | [Quan Anh Mai](https://github.com/merykitty) |  |
+|   | 00:15.901 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jincongho.java)| 21.0.1-open | [Jin Cong Ho](https://github.com/jincongho) | uses Unsafe |
+|   | 00:17.972 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_hundredwatt.java)| 21.0.1-graal | [Jason Nochlin](https://github.com/hundredwatt) |  |
+|   | 00:20.174 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_roman_r_m.java)| 21.0.1-graal | [Roman Musin](https://github.com/roman-r-m) | GraalVM native binary, uses Unsafe |
+|   | 00:21.591 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_linl33.java)| 22.ea.32-open | [Li Lin](https://github.com/linl33) | uses Unsafe |
 
 ## Prerequisites
 

From dda3c3b3116cb7786173f453fb189e50a87763bc Mon Sep 17 00:00:00 2001
From: Gunnar Morling <gunnar.morling@googlemail.com>
Date: Thu, 1 Feb 2024 14:16:44 +0100
Subject: [PATCH 259/268] Leaderboard update 32 cores

---
 README.md | 44 +++++++++++++++++++++++++++-----------------
 1 file changed, 27 insertions(+), 17 deletions(-)

diff --git a/README.md b/README.md
index ff0a5a7f2..7d5dc36b4 100644
--- a/README.md
+++ b/README.md
@@ -240,25 +240,35 @@ For the 1BRC challenge, only the results in the previous section are of importan
 #### 32 Cores / 64 Threads
 
 For officially evaluating entries into the challenge, each contender is run on eight cores of the evaluation machine (AMD EPYC™ 7502P).
-Here are the results from running the top 15 entries (as of commit [2c26b511](https://github.com/gunnarmorling/1brc/commit/2c26b511e741f4d96a51dda831001946ea27a591)) on all 32 cores / 64 threads (i.e. SMT is enabled) of the machine:
+Here are the results from running the top 25 entries (as of commit [1ba9cdcf](https://github.com/gunnarmorling/1brc/commit/1ba9cdcf1552b7dcff8d46a9e9724671dd479fac), Feb 1) on all 32 cores / 64 threads (i.e. SMT is enabled) of the machine:
 
 | # | Result (m:s.ms) | Implementation     | JDK | Submitter     | Notes     |
 |---|-----------------|--------------------|-----|---------------|-----------|
-| 1 | 00:00.799 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_thomaswue.java)| 21.0.1-graal | [Thomas Wuerthinger](https://github.com/thomaswue) | GraalVM native binary |
-| 2 | 00:00.933 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_royvanrijn.java)| 21.0.1-graal | [Roy van Rijn](https://github.com/royvanrijn) | GraalVM native binary |
-| 3 | 00:01.236 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java)| 21.0.1-graal | [Artsiom Korzun](https://github.com/artsiomkorzun) |  |
-|   | 00:01.380 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_merykittyunsafe.java)| 21.0.1-open | [merykittyunsafe](https://github.com/merykittyunsafe) |  |
-|   | 00:01.383 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_cliffclick.java)| 21.0.1-open | [Cliff Click](https://github.com/cliffclick) |  |
-|   | 00:01.429 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_iziamos.java)| 21.0.1-open | [John Ziamos](https://github.com/iziamos) |  |
-|   | 00:01.464 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_obourgain.java)| 21.0.1-open | [Olivier Bourgain](https://github.com/obourgain) |  |
-|   | 00:01.603 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java)| 21.0.1-open | [Van Phu DO](https://github.com/abeobk) |  |
-|   | 00:01.748 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_yavuztas.java)| 21.0.1-graal | [Yavuz Tas](https://github.com/yavuztas) |  |
-|   | 00:01.778 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_merykitty.java)| 21.0.1-open | [Quan Anh Mai](https://github.com/merykitty) |  |
-|   | 00:01.942 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_mtopolnik.java)| 21.0.1-graal | [Marko Topolnik](https://github.com/mtopolnik) |  |
-|   | 00:01.972 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ebarlas.java)| 21.0.1-graal | [Elliot Barlas](https://github.com/ebarlas) |  |
-|   | 00:02.111 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_JamalMulla.java)| 21.0.1-graal | [Jamal Mulla](https://github.com/JamalMulla) |  |
-|   | 00:02.644 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_vaidhy.java)| 21.0.1-graal | [Vaidhy Mayilrangam](https://github.com/vaidhy) |  |
-|   | 00:03.697 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_hundredwatt.java)| 21.0.1-graal | [Jason Nochlin](https://github.com/hundredwatt) |  |
+| 1* | 00:00.324 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jerrinot.java)| 21.0.2-graal | [Jaromir Hamala](https://github.com/jerrinot) | GraalVM native binary, uses Unsafe |
+| 1* | 00:00.326 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_thomaswue.java)| 21.0.2-graal | [Thomas Wuerthinger](https://github.com/thomaswue), [Quan Anh Mai](https://github.com/merykitty), [Alfonso² Peterssen](https://github.com/mukel) | GraalVM native binary, uses Unsafe |
+| 2* | 00:00.350 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_artsiomkorzun.java)| 21.0.2-graal | [Artsiom Korzun](https://github.com/artsiomkorzun) | GraalVM native binary, uses Unsafe |
+| 2* | 00:00.351 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_abeobk.java)| 21.0.2-graal | [Van Phu DO](https://github.com/abeobk) | GraalVM native binary, uses Unsafe |
+| 3 | 00:00.389 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_stephenvonworley.java)| 21.0.2-graal | [Stephen Von Worley](https://github.com/stephenvonworley) | GraalVM native binary, uses Unsafe |
+|   | 00:00.410 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_yavuztas.java)| 21.0.2-graal | [Yavuz Tas](https://github.com/yavuztas) | GraalVM native binary, uses Unsafe |
+|   | 00:00.410 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_royvanrijn.java)| 21.0.2-graal | [Roy van Rijn](https://github.com/royvanrijn) | GraalVM native binary, uses Unsafe |
+|   | 00:00.502 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_mtopolnik.java)| 21.0.2-graal | [Marko Topolnik](https://github.com/mtopolnik) | GraalVM native binary, uses Unsafe |
+|   | 00:00.609 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_roman_r_m.java)| 21.0.1-graal | [Roman Musin](https://github.com/roman-r-m) | GraalVM native binary, uses Unsafe |
+|   | 00:00.611 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_gonixunsafe.java)| 21.0.1-open | [gonixunsafe](https://github.com/gonixunsafe) | uses Unsafe |
+|   | 00:00.716 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_JamalMulla.java)| 21.0.2-graal | [Jamal Mulla](https://github.com/JamalMulla) | GraalVM native binary, uses Unsafe |
+|   | 00:00.728 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_tivrfoa.java)| 21.0.2-graal | [tivrfoa](https://github.com/tivrfoa) | GraalVM native binary, uses Unsafe |
+|   | 00:00.764 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_serkan_ozal.java)| 21.0.1-open | [Serkan ÖZAL](https://github.com/serkan-ozal) | uses Unsafe |
+|   | 00:00.785 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ebarlas.java)| 21.0.2-graal | [Elliot Barlas](https://github.com/ebarlas) | GraalVM native binary, uses Unsafe |
+|   | 00:00.814 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_gonix.java)| 21.0.1-open | [gonix](https://github.com/gonix) |  |
+|   | 00:00.838 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_zerninv.java)| 21.0.2-graal | [zerninv](https://github.com/zerninv) | GraalVM native binary, uses Unsafe |
+|   | 00:00.877 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_iziamos.java)| 21.0.2-graal | [John Ziamos](https://github.com/iziamos) | GraalVM native binary, uses Unsafe |
+|   | 00:01.179 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_vemanaNonIdiomatic.java)| 21.0.1-graal | [vemanaNonIdiomatic](https://github.com/vemanaNonIdiomatic) | uses Unsafe |
+|   | 00:01.268 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_merykittyunsafe.java)| 21.0.1-open | [merykittyunsafe](https://github.com/merykittyunsafe) | uses Unsafe |
+|   | 00:01.289 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_linl33.java)| 22.ea.32-open | [Li Lin](https://github.com/linl33) | uses Unsafe |
+|   | 00:01.345 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_hundredwatt.java)| 21.0.1-graal | [Jason Nochlin](https://github.com/hundredwatt) |  |
+|   | 00:01.393 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_merykitty.java)| 21.0.1-open | [Quan Anh Mai](https://github.com/merykitty) |  |
+|   | 00:01.478 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_yourwass.java)| 21.0.1-open | [yourwass](https://github.com/yourwass) | uses Unsafe |
+|   | 00:01.770 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jincongho.java)| 21.0.1-open | [Jin Cong Ho](https://github.com/jincongho) | uses Unsafe |
+|   | 00:02.918 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jparera.java)| 21.0.1-open | [Juan Parera](https://github.com/jparera) |  |
 
 #### 10K Key Set
 
@@ -277,7 +287,7 @@ using eight cores on the evaluation machine:
 |   | 00:04.520 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_tivrfoa.java)| 21.0.2-graal | [tivrfoa](https://github.com/tivrfoa) | GraalVM native binary, uses Unsafe |
 |   | 00:04.655 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_JamalMulla.java)| 21.0.2-graal | [Jamal Mulla](https://github.com/JamalMulla) | GraalVM native binary, uses Unsafe |
 |   | 00:04.708 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_gonix.java)| 21.0.1-open | [gonix](https://github.com/gonix) |  |
-|   | 00:04.797 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_thomaswue.java)| 21.0.2-graal | [Thomas Wuerthinger](https://github.com/thomaswue) | GraalVM native binary, uses Unsafe |
+|   | 00:04.797 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_thomaswue.java)| 21.0.2-graal | [Thomas Wuerthinger](https://github.com/thomaswue), [Quan Anh Mai](https://github.com/merykitty), [Alfonso² Peterssen](https://github.com/mukel) | GraalVM native binary, uses Unsafe |
 |   | 00:04.814 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_vemanaNonIdiomatic.java)| 21.0.1-graal | [vemanaNonIdiomatic](https://github.com/vemanaNonIdiomatic) | uses Unsafe |
 |   | 00:05.248 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_zerninv.java)| 21.0.2-graal | [zerninv](https://github.com/zerninv) | GraalVM native binary, uses Unsafe |
 |   | 00:05.367 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_yavuztas.java)| 21.0.2-graal | [Yavuz Tas](https://github.com/yavuztas) | GraalVM native binary, uses Unsafe |

From 1b23172afb069a84979c738595b9025e36b79816 Mon Sep 17 00:00:00 2001
From: JurenIvan <43958858+JurenIvan@users.noreply.github.com>
Date: Thu, 1 Feb 2024 14:30:22 +0100
Subject: [PATCH 260/268] My first submission  (#697)

* Common sense implementation

* fix filename

* formatting

* remove excess system.out.println

* fix hash collisions

* adjust so that segment size is smaller than Integer.MAX_VALUE
---
 calculate_average_JurenIvan.sh                |  19 ++
 .../onebrc/CalculateAverage_JurenIvan.java    | 219 ++++++++++++++++++
 2 files changed, 238 insertions(+)
 create mode 100755 calculate_average_JurenIvan.sh
 create mode 100644 src/main/java/dev/morling/onebrc/CalculateAverage_JurenIvan.java

diff --git a/calculate_average_JurenIvan.sh b/calculate_average_JurenIvan.sh
new file mode 100755
index 000000000..73d956e90
--- /dev/null
+++ b/calculate_average_JurenIvan.sh
@@ -0,0 +1,19 @@
+#!/bin/sh
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+JAVA_OPTS="--enable-preview" # JVM flags passed to this entry
+java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_JurenIvan # JAVA_OPTS is unquoted so that it splits into separate arguments
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_JurenIvan.java b/src/main/java/dev/morling/onebrc/CalculateAverage_JurenIvan.java
new file mode 100644
index 000000000..3f9306899
--- /dev/null
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_JurenIvan.java
@@ -0,0 +1,287 @@
+/*
+ *  Copyright 2023 The original authors
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+package dev.morling.onebrc;
+
+import java.io.IOException;
+import java.io.RandomAccessFile;
+import java.nio.channels.FileChannel;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.Arrays;
+import java.util.Objects;
+import java.util.TreeMap;
+import java.util.stream.Collectors;
+import java.util.stream.IntStream;
+
+import static java.lang.Math.round;
+import static java.nio.channels.FileChannel.MapMode.READ_ONLY;
+import static java.nio.file.StandardOpenOption.READ;
+
+/**
+ * Computes the minimum, mean and maximum temperature per station from {@code ./measurements.txt}.
+ *
+ * <p>The file is cut into newline-aligned segments. Each segment is memory-mapped and aggregated into
+ * its own open-addressing hash table on a parallel stream; the per-segment tables are then merged into
+ * a sorted map that is printed to standard output.
+ */
+public class CalculateAverage_JurenIvan {
+
+    private static final String FILE_NAME = "./measurements.txt";
+
+    /** Files smaller than this many bytes are processed as a single segment. */
+    private static final long SINGLE_SEGMENT_THRESHOLD = 100000;
+
+    /** Size of the scratch buffer that holds the station name of the line being parsed. */
+    private static final int NAME_BUFFER_SIZE = 100;
+
+    /** Slot count of each per-segment hash table. */
+    private static final int TABLE_SLOTS = 1 << 19;
+
+    public static void main(String[] args) throws IOException {
+        long[] segments = getSegments(Runtime.getRuntime().availableProcessors());
+
+        // Names are decoded explicitly as UTF-8 so the output does not depend on the platform default charset.
+        var result = IntStream.range(0, segments.length - 1)
+                .parallel()
+                .mapToObj(i -> processSegment(segments[i], segments[i + 1]))
+                .flatMap(m -> Arrays.stream(m.hashTable).filter(Objects::nonNull))
+                .collect(Collectors.toMap(m -> new String(m.city, StandardCharsets.UTF_8), m -> m, Measurement::merge, TreeMap::new));
+
+        System.out.println(result);
+    }
+
+    /**
+     * Aggregates every line in the file range {@code [start, end)}.
+     *
+     * <p>The range must begin at the start of a line and be no longer than {@link Integer#MAX_VALUE} bytes,
+     * because it is mapped into a single buffer; {@link #getSegments(int)} produces such ranges.
+     *
+     * @param start absolute file offset of the first byte of the segment
+     * @param end absolute file offset just past the last byte of the segment
+     * @return a table holding one {@link Measurement} per station seen in the segment
+     */
+    private static LinearProbingHashMap processSegment(long start, long end) {
+        var results = new LinearProbingHashMap(TABLE_SLOTS);
+
+        try (var fileChannel = (FileChannel) Files.newByteChannel(Path.of(FILE_NAME), READ)) {
+            var bb = fileChannel.map(READ_ONLY, start, end - start);
+            var buffer = new byte[NAME_BUFFER_SIZE];
+
+            int limit = bb.limit();
+            for (int startLine = bb.position(); startLine < limit; startLine = bb.position()) {
+                int currentPosition = startLine;
+
+                // Copy the station name (everything before ';') into the scratch buffer, hashing as we go.
+                // Positions are relative to the mapped buffer, so the bound is its limit rather than the
+                // absolute file offset 'end'.
+                byte b;
+                int hash = 7;
+                int wordLen = 0;
+                while (currentPosition < limit && (b = bb.get(currentPosition++)) != ';') {
+                    buffer[wordLen++] = b;
+                    hash = hash * 31 + b;
+                }
+
+                // The reading is parsed as [-]d.d or [-]dd.d and kept as an int in tenths of a degree.
+                int temp;
+                int negative = 1;
+                if (bb.get(currentPosition) == '-') {
+                    negative = -1;
+                    currentPosition++;
+                }
+
+                if (bb.get(currentPosition + 1) == '.') {
+                    temp = negative * ((bb.get(currentPosition) - '0') * 10 + (bb.get(currentPosition + 2) - '0'));
+                    currentPosition += 3;
+                }
+                else {
+                    temp = negative * ((bb.get(currentPosition) - '0') * 100 + ((bb.get(currentPosition + 1) - '0') * 10 + (bb.get(currentPosition + 3) - '0')));
+                    currentPosition += 4;
+                }
+
+                // Skip the line break.
+                currentPosition++;
+
+                results.put(hash, buffer, wordLen, temp);
+
+                // Clamp so that a final line without a trailing line break does not move past the limit.
+                bb.position(Math.min(currentPosition, limit));
+            }
+        }
+        catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+        return results;
+    }
+
+    /**
+     * Splits the input file into consecutive byte ranges that each end just after a line break.
+     *
+     * @param segmentCount desired number of segments; doubled until one segment fits a single mapping
+     * @return offsets where entry {@code i} is the start of segment {@code i} and entry {@code i + 1} its end
+     * @throws IOException if the file cannot be opened or read
+     */
+    private static long[] getSegments(int segmentCount) throws IOException {
+        try (var raf = new RandomAccessFile(FILE_NAME, "r")) {
+            long fileSize = raf.length();
+
+            if (fileSize < SINGLE_SEGMENT_THRESHOLD) {
+                long[] chunks = new long[2];
+                chunks[1] = fileSize;
+                return chunks;
+            }
+
+            // Keep each segment below Integer.MAX_VALUE bytes, with headroom for moving its end forward
+            // to the next line break.
+            while (fileSize / segmentCount >= (Integer.MAX_VALUE - 150)) {
+                segmentCount *= 2;
+            }
+
+            long[] chunks = new long[segmentCount + 1];
+            long segmentSize = fileSize / segmentCount;
+
+            for (int i = 1; i < segmentCount; i++) {
+                // Every boundary is pushed forward to a line break, so with many small segments the next
+                // tentative offset can pass the end of the file; clamp it instead of seeking beyond it.
+                raf.seek(Math.min(chunks[i - 1] + segmentSize, fileSize));
+
+                // Advance to just past the next '\n', stopping at end of file rather than throwing.
+                int b;
+                do {
+                    b = raf.read();
+                } while (b != -1 && b != '\n');
+                chunks[i] = raf.getFilePointer();
+            }
+            chunks[segmentCount] = fileSize;
+            return chunks;
+        }
+    }
+
+    /**
+     * Fixed-size open-addressing hash table keyed by station name bytes, with linear probing.
+     *
+     * <p>The table never grows and {@link #put} does not detect a full table, so it must be created with
+     * more slots than there are distinct keys.
+     */
+    public static class LinearProbingHashMap {
+        final Measurement[] hashTable;
+        int slots;
+
+        /**
+         * @param slots number of slots; the start index is computed as {@code hash & (slots - 1)}, which
+         *            only reaches every slot when this is a power of two
+         */
+        public LinearProbingHashMap(int slots) {
+            this.slots = slots;
+            this.hashTable = new Measurement[slots];
+        }
+
+        /**
+         * Records one reading for a station, creating its entry on first sight.
+         *
+         * @param hash hash of the station name
+         * @param key buffer whose first {@code len} bytes are the station name; a copy is stored
+         * @param len length of the station name in bytes
+         * @param temperature the reading to record
+         */
+        void put(int hash, byte[] key, int len, int temperature) {
+            // Math.abs leaves Integer.MIN_VALUE negative, but masking still yields a non-negative index.
+            hash = Math.abs(hash);
+            int i = hash & (slots - 1);
+
+            while (hashTable[i] != null) {
+                if (keyIsEqual(key, hashTable[i].city, len)) {
+                    hashTable[i].add(temperature);
+                    return;
+                }
+                // Slot belongs to a different station: probe the next one, wrapping at the end.
+                i++;
+                if (i == slots) {
+                    i = 0;
+                }
+            }
+
+            hashTable[i] = new Measurement(Arrays.copyOf(key, len), hash, temperature, temperature, 1, temperature);
+        }
+
+        /** Returns whether the first {@code len} bytes of {@code one} are exactly the bytes of {@code other}. */
+        private boolean keyIsEqual(byte[] one, byte[] other, int len) {
+            return Arrays.equals(one, 0, len, other, 0, other.length);
+        }
+
+    }
+
+    /** Running minimum, maximum, sum and count of the readings of one station. */
+    static class Measurement {
+        byte[] city;
+        int hash;
+        int min;
+        int max;
+        int count;
+        long sum;
+
+        public Measurement(byte[] city, int hash, int min, int max, int count, long sum) {
+            this.city = city;
+            this.hash = hash;
+            this.min = min;
+            this.max = max;
+            this.count = count;
+            this.sum = sum;
+        }
+
+        /** Folds a single reading into this aggregate. */
+        public void add(int temperature) {
+            min = Math.min(min, temperature);
+            max = Math.max(max, temperature);
+            count++;
+            sum += temperature;
+        }
+
+        /** Folds another aggregate into this one and returns {@code this}. */
+        public Measurement merge(Measurement other) {
+            min = Math.min(min, other.min);
+            max = Math.max(max, other.max);
+            count += other.count;
+            sum += other.sum;
+            return this;
+        }
+
+        /** Formats as {@code min/mean/max}; all three values are divided by ten before printing. */
+        @Override
+        public String toString() {
+            return (min * 1.0) / 10 + "/" + round((sum * 1.0) / count) / 10.0 + "/" + (max * 1.0) / 10;
+        }
+
+        @Override
+        public int hashCode() {
+            return hash;
+        }
+
+        @Override
+        public boolean equals(Object obj) {
+            if (this == obj) {
+                return true;
+            }
+            // Reject null and foreign types instead of failing with an exception.
+            if (!(obj instanceof Measurement)) {
+                return false;
+            }
+            return Arrays.equals(city, ((Measurement) obj).city);
+        }
+    }
+}

From bc391cbe8b139a056d6d5f6f63c8d0f37b8db9dd Mon Sep 17 00:00:00 2001
From: Gunnar Morling <gunnar.morling@googlemail.com>
Date: Thu, 1 Feb 2024 14:48:37 +0100
Subject: [PATCH 261/268] Leaderboard update

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 7d5dc36b4..a29edb7bc 100644
--- a/README.md
+++ b/README.md
@@ -113,6 +113,7 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:07.730 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jotschi.java)| 21.0.1-open | [Johannes Schüth](https://github.com/jotschi) |  |
 |   | 00:07.894 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_tonivade.java)| 21.0.2-tem | [Antonio Muñoz](https://github.com/tonivade) |  |
 |   | 00:07.925 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ricardopieper.java)| 21.0.1-graal | [Ricardo Pieper](https://github.com/ricardopieper) |  |
+|   | 00:08.157 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_JurenIvan.java)| 21.0.1-open | [JurenIvan](https://github.com/JurenIvan) |  |
 |   | 00:08.167 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ddimtirov.java)| 21.0.1-tem | [Dimitar Dimitrov](https://github.com/ddimtirov) |  |
 |   | 00:08.214 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_deemkeen.java)| 21.0.1-open | [deemkeen](https://github.com/deemkeen) |  |
 |   | 00:08.255 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_mattiz.java)| 21.0.1-open | [Mathias Bjerke](https://github.com/mattiz) |  |

From 0c7cb8925b7e225fb6872bb86b8a4dc956cee15d Mon Sep 17 00:00:00 2001
From: Gunnar Morling <gunnar.morling@googlemail.com>
Date: Thu, 1 Feb 2024 14:56:29 +0100
Subject: [PATCH 262/268] README update

---
 README.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index a29edb7bc..f9c044b57 100644
--- a/README.md
+++ b/README.md
@@ -15,6 +15,10 @@ _Status Jan 12: As there has been such a large number of entries to this challen
 
 _Status Jan 1: This challenge is [open for submissions](https://www.morling.dev/blog/one-billion-row-challenge/)!_
 
+> **Sponsorship**
+>
+> A big thank you to my employer [Decodable](https://www.decodable.co/) for funding the evaluation environment and supporting this challenge!
+
 The One Billion Row Challenge (1BRC) is a fun exploration of how far modern Java can be pushed for aggregating one billion rows from a text file.
 Grab all your (virtual) threads, reach out to SIMD, optimize your GC, or pull any other trick, and create the fastest implementation for solving this task!
 
@@ -479,10 +483,6 @@ A list of external resources such as blog posts and videos, discussing 1BRC and
 * [One Billion Row Challenge & Azure Data Explorer](https://nielsberglund.com/post/2024-01-28-one-billion-row-challenge--azure-data-explorer/), by Niels Berglund (blog post)
 * [One Billion Row Challenge - view from sidelines](https://www.chashnikov.dev/post/one-billion-row-challenge-view-from-sidelines), by Leo Chashnikov (blog post)
 
-## Sponsorship
-
-A big thank you to my employer [Decodable](https://www.decodable.co/) for funding the evaluation environment and supporting this challenge!
- 
 ## License
 
 This code base is available under the Apache License, version 2.

From 8ab88e9f5c5e5a3c55293aac1eb0401e0179e4a4 Mon Sep 17 00:00:00 2001
From: Chris Bellew <cjbellew@gmail.com>
Date: Thu, 1 Feb 2024 23:59:05 +0800
Subject: [PATCH 263/268] SIMD parsing newlines, integer parsing, custom
 hashtable with SIMD lookup table for equality (#663)

* Add submission

* Added explanatory comment

* Added comment

* Rename shell script

* Commit formatting

* When last bytes don't fill a vector, take directly

* Add comment

* Deal with subset collisions
---
 calculate_average_ChrisBellew.sh              |  19 +
 prepare_chrisbellew.sh                        |  20 +
 .../onebrc/CalculateAverage_chrisbellew.java  | 738 ++++++++++++++++++
 3 files changed, 777 insertions(+)
 create mode 100755 calculate_average_ChrisBellew.sh
 create mode 100755 prepare_chrisbellew.sh
 create mode 100644 src/main/java/dev/morling/onebrc/CalculateAverage_chrisbellew.java

diff --git a/calculate_average_ChrisBellew.sh b/calculate_average_ChrisBellew.sh
new file mode 100755
index 000000000..122ddce64
--- /dev/null
+++ b/calculate_average_ChrisBellew.sh
@@ -0,0 +1,19 @@
+#!/bin/sh
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+JAVA_OPTS="--add-modules jdk.incubator.vector --enable-preview" # the entry imports jdk.incubator.vector, which must be added explicitly
+java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_chrisbellew
diff --git a/prepare_chrisbellew.sh b/prepare_chrisbellew.sh
new file mode 100755
index 000000000..4cda7b411
--- /dev/null
+++ b/prepare_chrisbellew.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+# Uncomment below to use sdk
+# source "$HOME/.sdkman/bin/sdkman-init.sh"
+# sdk use java 21.0.1-graal 1>&2
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_chrisbellew.java b/src/main/java/dev/morling/onebrc/CalculateAverage_chrisbellew.java
new file mode 100644
index 000000000..8b8b8fcdc
--- /dev/null
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_chrisbellew.java
@@ -0,0 +1,738 @@
+/*
+ *  Copyright 2023 The original authors
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package dev.morling.onebrc;
+
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.RandomAccessFile;
+import java.lang.reflect.Field;
+import java.nio.MappedByteBuffer;
+import java.nio.channels.FileChannel;
+import java.util.HashMap;
+import java.util.TreeMap;
+import jdk.incubator.vector.ByteVector;
+import jdk.incubator.vector.VectorMask;
+import jdk.incubator.vector.VectorOperators;
+import jdk.incubator.vector.VectorSpecies;
+import sun.misc.Unsafe;
+
+/**
+ * This is Chris Bellew's implementation. Here are the key points:
+ * 
+ * - The file is equally split into ranges, one range per thread.
+ *   16 threads are used for large files and a single thread for small ones.
+ * 
+ * - Each thread memory maps the file range it is responsible for and
+ *   then iterates through the range, one smaller buffer at a time.
+ * 
+ * - The contents are parsed by using SIMD vector equality comparisons
+ *   between the source data and the newline character, effectively
+ *   delimiting each line. The measurement of each line is discovered
+ *   by moving back from the end of the line, parsing into an integer
+ *   as it goes. The integer representation is 10x the actual value
+ *   but is used because integer parsing was found to be much faster
+ *   than floating point parsing, and it's also immune to floating
+ *   point arithmetic errors when aggregating the measurements later.
+ * 
+ * - Once the name and the measurement is parsed for a line, the name
+ *   is hashed and used as a lookup into a hash table. The value of the
+ *   hash table at the given slot is an index into another array, this
+ *   time an array of SIMD vectors that represent that name as a series
+ *   of vectors. The vectors are used to compare equality of the name of
+ *   the source line with the name in the slot to confirm the slot is
+ *   occupied by the same city name. The indirection of having a hash
+ *   table storing lookups into another array of vectors is to allow
+ *   the hash table slots to have a fixed size, while allowing the city
+ *   names to be arbitrarily long. The hash table can then use open
+ *   addressing to resolve collisions and remain efficient for lookups.
+ * 
+ * - After the range has been processed, the results are collected by
+ *   iterating through the hash table and looking up the corresponding
+ *   integer table for each slot then collecting the min, max, count
+ *   and sum of the measurements for each city. Then the results are
+ *   combined from all threads, using a treemap for sorting, and printed.
+ */
+public final class CalculateAverage_chrisbellew {
+    public static final long FILE_SIZE = getFileSize();
+
+    /**
+     * The overlap is the number of bytes that is peeked into the next buffer
+     * in order to find the end of the last newline in the current buffer.
+     * Every buffer ignores the characters before the first newline character
+     * and peeks into the next buffer to find the first newline character. This
+     * way no data is lost even though the buffers are arbitrarily sliced.
+     * 100 is the maximum length of a city name, 1 is the semicolon character,
+     * 5 is the maximum length of a measurement, 1 is the newline character,
+     * 8 is one extra vector length so that we don't overflow the buffer.
+     * If we overlap to this length then we will always be able to complete the
+     * last line in the buffer.
+     */
+    public static final int OVERLAP = 100 + 1 + 5 + 1 + 8;
+
+    /**
+     * Splits measurements.txt into one range per worker thread, processes the
+     * ranges in parallel, merges the per-thread aggregates and prints them.
+     *
+     * @throws IOException declared by the range computation
+     */
+    public static void main(String[] args) throws IOException {
+        // Small inputs (e.g. the test files) cause issues when opened at 16 offsets, so they use one thread.
+        final int NUM_THREADS = FILE_SIZE < 12_000_000_000L ? 1 : 16;
+
+        // Size of the heap buffer each worker uses to walk its memory mapped range.
+        final int BUFFER_SIZE = 1024 * 256;
+
+        // Split the whole file into slices, one slice per thread.
+        var ranges = getThreadRanges(NUM_THREADS);
+
+        var processors = new ThreadProcessor[NUM_THREADS];
+        Thread[] threads = new Thread[NUM_THREADS];
+        for (var i = 0; i < NUM_THREADS; i++) {
+            processors[i] = new ThreadProcessor(ranges[i].start, ranges[i].end, BUFFER_SIZE);
+            threads[i] = new Thread(processors[i]);
+            threads[i].start();
+        }
+
+        var results = new TreeMap<String, CityResult>();
+        for (int i = 0; i < NUM_THREADS; i++) {
+            try {
+                threads[i].join();
+                processors[i].collectResults(results);
+            }
+            catch (InterruptedException e) {
+                // Restore the interrupt status before surfacing the failure.
+                Thread.currentThread().interrupt();
+                throw new RuntimeException(e);
+            }
+        }
+        printResults(results);
+    }
+
+    /** Prints all cities as {name=min/mean/max, ...} in map order; stored values are 10x the real ones. */
+    private static void printResults(TreeMap<String, CityResult> results) {
+        var builder = new StringBuilder();
+        builder.append("{");
+        boolean first = true;
+        for (var entry : results.entrySet()) {
+            var city = entry.getKey();
+            var result = entry.getValue();
+            // Divide in double precision: a float cannot represent every int sum exactly,
+            // so a float mean could land on the wrong side of a rounding boundary.
+            var average = Math.round((double) result.sum / (double) result.count) / 10.0;
+            var min = ((float) result.min) / 10.0;
+            var max = ((float) result.max) / 10.0;
+            if (!first) {
+                builder.append(", ");
+            }
+            first = false;
+            builder.append(city).append("=").append(min).append("/").append(average).append("/").append(max);
+        }
+        builder.append("}");
+        System.out.println(builder.toString());
+    }
+
+    /**
+     * Cuts the file into the requested number of contiguous, equally sized ranges. The
+     * final range absorbs the division remainder so that it ends exactly at the file size.
+     */
+    public static final FileRange[] getThreadRanges(int threads) throws IOException {
+        final long bytesPerThread = FILE_SIZE / threads;
+        final FileRange[] slices = new FileRange[threads];
+        for (int t = 0; t < slices.length; t++) {
+            final long from = t * bytesPerThread;
+            final boolean isLast = t + 1 == threads;
+            slices[t] = new FileRange(from, isLast ? FILE_SIZE : from + bytesPerThread);
+        }
+        return slices;
+    }
+
+    private static final long getFileSize() {
+        try (FileInputStream measurements = new FileInputStream("measurements.txt")) {
+            return measurements.getChannel().size();
+        }
+        catch (IOException cause) {
+            throw new RuntimeException("Failed to get file size", cause);
+        }
+    }
+
+    /**
+     * Processes a range of the file. The range is defined by a start and end
+     * position. The start is inclusive and the end is exclusive.
+     */
+    static final class ThreadProcessor implements Runnable {
+        /**
+         * The number of slots in the hash table. This number was found to be the
+         * minimum number to use in conjunction with the hashing function to
+         * produce no collisions on the test data. The test data is a hint, but the
+         * correctness of the implementation is not coupled to the test data because
+         * the hash table is able to handle collisions in other arbitrary source data.
+         */
+        private static final int NUM_SLOTS = 12133;
+
+        /**
+         * The size of the SIMD vector to use when striding through the source data
+         * in order to detect newlines, and when comparing equality of the source line
+         * with a given slot in the hash table.
+         */
+        private static final VectorSpecies<Byte> SPECIES = ByteVector.SPECIES_64;
+
+        /**
+         * A precomputed lookup table of vector masks to use when comparing equality of
+         * the source line and a given slot in the hash table. Each slot in the hash table
+         * has a set of vectors associated with it. The source name is split into vectors
+         * and each source vector is compared with the corresponding slot vector for equality.
+         * Unless the length of the city name is a multiple of the vector length, the last
+         * vector in the slot will be a partial vector. The masks are used to ignore the
+         * unused bytes in the last vector.
+         */
+        private static final VectorMask<Byte>[] MASKS = generateMasks(SPECIES);
+
+        /**
+         * The unsafe instance is used to allocate memory for the hash table slots
+         * and integer table slots. It skips the JVM's garbage collector and allows
+         * the memory to be accessed directly without overhead such as bounds checks.
+         */
+        private static final Unsafe unsafe = getUnsafe();
+
+        /**
+         * The start and end positions this thread will iterate through.
+         */
+        private final long start;
+        private final long end;
+
+        private final int bufferSize;
+
+        /**
+         * The main memory address at the beginning of the hash table slots.
+         */
+        private final long slotsAddress;
+
+        /**
+         * The main memory address at the beginning of the integer table slots.
+         */
+        private final long numbersAddress;
+
+        /**
+         * The main memory address at the beginning of the name length table slots.
+         */
+        private final long lengthsAddress;
+
+        /**
+         * The SIMD vectors associated with each slot in the hash table. The
+         * content of a given slot in a hash table is a lookup into this array.
+         * The intent of having this array as an extra lookup is to allow N
+         * vectors per slot while having fixed size slots.
+         */
+        private ByteVector[] vectors = new ByteVector[200000];
+        private String[] cityNames = new String[NUM_SLOTS];
+
+        /**
+         * The next available index in the vectors array.
+         */
+        private short nextVectorIndex = 8;
+
+        /**
+         * A map of city name strings to their corresponding slot index in the
+         * hash table. When the hash table slots will be sparsely populated it's
+         * not efficient to iterate through the slots when collecting the results.
+         * This map provides a way to discover the occupied slots.
+         */
+        private final HashMap<String, Integer> cityVectorLookup = new HashMap<>();
+
+        public ThreadProcessor(long start, long end, int bufferSize) {
+            this.start = start;
+            this.end = end;
+            this.bufferSize = bufferSize;
+
+            /**
+             * Allocate memory for the hash table and the integer table.
+             * Initialise the hash table slots to 0, so we can use 0 to
+             * indicate an empty slot.
+             */
+            slotsAddress = unsafe.allocateMemory(NUM_SLOTS * 2);
+            for (int i = 0; i < NUM_SLOTS; i++) {
+                unsafe.putShort(slotsAddress + i * 2, (short) 0);
+            }
+            numbersAddress = unsafe.allocateMemory(NUM_SLOTS * 16);
+            lengthsAddress = unsafe.allocateMemory(NUM_SLOTS);
+        }
+
+        /**
+         * Maps this processor's range of measurements.txt read-only and processes it.
+         * Every range except the last is extended by OVERLAP bytes so that the line
+         * crossing the end of the range can be completed.
+         */
+        @Override
+        public final void run() {
+            try (RandomAccessFile raf = new RandomAccessFile("measurements.txt", "r")) {
+                final boolean lastRange = end == FILE_SIZE;
+                long mappedBytes = end - start;
+                if (!lastRange) {
+                    mappedBytes += OVERLAP;
+                }
+                final FileChannel channel = raf.getChannel();
+                processRange(channel.map(FileChannel.MapMode.READ_ONLY, start, mappedBytes), lastRange);
+            }
+            catch (IOException ioFailure) {
+                throw new RuntimeException(ioFailure);
+            }
+        }
+
+        /**
+         * Walks the mapped range in chunks of at most bufferSize bytes, copying each
+         * chunk into a reusable heap array before parsing it. After every chunk
+         * except the last, the read position is moved back by OVERLAP bytes so that
+         * the next chunk starts slightly before the end of the previous one.
+         *
+         * @param buffer    the memory mapped range of this thread
+         * @param lastRange whether the range ends at the end of the file
+         */
+        private final void processRange(MappedByteBuffer buffer, boolean lastRange) {
+            final byte[] chunk = new byte[bufferSize];
+            // Advanced by the chunk size without subtracting the overlap; the parser
+            // only compares it with zero to recognise the start of the file.
+            long globalPosition = start;
+            while (buffer.hasRemaining()) {
+                final int available = buffer.remaining();
+                final int numBytes = Math.min(available, bufferSize);
+                final boolean lastBuffer = numBytes == available;
+
+                buffer.get(chunk, 0, numBytes);
+                processBuffer(chunk, numBytes, lastRange, lastBuffer, globalPosition);
+                globalPosition += numBytes;
+
+                if (lastBuffer) {
+                    break;
+                }
+                buffer.position(buffer.position() - OVERLAP);
+            }
+        }
+
+        /**
+         * Parses the lines of one chunk and hands each one to slice().
+         *
+         * @param buffer         heap copy of the current chunk
+         * @param numBytes       number of valid bytes at the start of buffer
+         * @param lastRange      whether this thread's range ends at the end of the file
+         * @param lastBuffer     whether this is the final chunk of the range
+         * @param globalPosition only compared with 0 here, to detect the start of the file
+         */
+        private final void processBuffer(byte[] buffer, int numBytes, boolean lastRange, boolean lastBuffer, long globalPosition) {
+            /**
+             * Skip past any characters before the first newline because the previous
+             * segment will have already processed them. That is unless this is the
+             * first buffer in the first range (global position zero), in which case
+             * we will start from the first character.
+             */
+            int index = globalPosition == 0 ? 0 : findFirstNewline(buffer) + 1;
+
+            /**
+             * Keep track of the start of the city name.
+             */
+            int nameStart = index;
+
+            while (true) {
+                // Load one vector's worth of bytes so the newline search can use SIMD.
+                ByteVector vector = ByteVector.fromArray(SPECIES, buffer, index);
+
+                // firstTrue() returns the lane count when no lane holds a newline.
+                VectorMask<Byte> newLineMask = vector.eq((byte) '\n');
+                int firstTrue = newLineMask.firstTrue();
+                if (firstTrue == SPECIES.length()) {
+                    // No newline in this vector, so move on to the next vector.
+                    index += SPECIES.length();
+                    continue;
+                }
+
+                slice(buffer, index + firstTrue, nameStart);
+
+                index = index + firstTrue + 1;
+                nameStart = index;
+
+                /**
+                 * If this is the last buffer in the last range then we want to
+                 * process every character until the very end of the file.
+                 */
+                if (lastRange && lastBuffer) {
+                    if (index == numBytes) {
+                        return;
+                    }
+
+                    /**
+                     * If we're less than one vector length away from the end
+                     * of the buffer then just take the remaining bytes as the
+                     * final line. If we tried to use a vector it would overflow.
+                     * NOTE(review): this treats buffer[numBytes - 1] as the newline,
+                     * i.e. it assumes the file ends with '\n' - confirm.
+                     */
+                    if (index >= numBytes - SPECIES.length()) {
+                        slice(buffer, numBytes - 1, nameStart);
+                        return;
+                    }
+                    continue;
+                }
+
+                /**
+                 * If it's not the last buffer or it's not the last range then
+                 * we want to overlap into the next buffer, but only by enough
+                 * to complete the last line.
+                 */
+                if (index > numBytes - OVERLAP) {
+                    return;
+                }
+            }
+        }
+
+        /**
+         * Returns the index of the first newline byte in the buffer, scanning one
+         * SIMD vector at a time from index 0. Used to skip the partial line at the
+         * start of a buffer. The scan has no upper bound of its own, so the caller
+         * must make sure that the buffer contains a newline.
+         */
+        private final int findFirstNewline(byte[] buffer) {
+            final int lanes = SPECIES.length();
+            for (int offset = 0;; offset += lanes) {
+                final VectorMask<Byte> hits = ByteVector.fromArray(SPECIES, buffer, offset).eq((byte) '\n');
+                final int firstHit = hits.firstTrue();
+                // firstTrue() reports the lane count when no lane matched.
+                if (firstHit != lanes) {
+                    return offset + firstHit;
+                }
+            }
+        }
+
+        /**
+         * Given the index in the buffer of where a name starts, and the index of
+         * the next newline, creeps back from the next newline to find the structure
+         * of the measurement, parsing it into a number as it goes. It is parsed 
+         * into an integer because it's faster than parsing as a float, and it's also
+         * immune to floating point arithmetic errors when aggregating the measurements
+         * later.
+         * 
+         * Then proceeds to record the fully parsed name and measurement in the hash table.
+         */
+        private final void slice(byte[] buffer, int newlineIndex, int nameStart) {
+            int i = newlineIndex - 1;
+            int measurement = buffer[i] - '0';
+            i -= 2; // Skip before the decimal point
+            measurement += (buffer[i] - '0') * 10;
+            i--;
+
+            if (buffer[i] == ';') {
+                // 1.2
+                record(buffer, nameStart, i, measurement);
+            }
+            else {
+                // 12.3 or -1.2 or -12.3
+                if (buffer[i] == '-') {
+                    // -1.2
+                    record(buffer, nameStart, i - 1, -measurement);
+                }
+                else {
+                    // 12.3 or -12.3
+                    measurement += (buffer[i] - '0') * 100;
+                    i--;
+                    if (buffer[i] == '-') {
+                        // -12.3
+                        record(buffer, nameStart, i - 1, -measurement);
+                    }
+                    else {
+                        // 12.3
+                        record(buffer, nameStart, i, measurement);
+                    }
+                }
+            }
+        }
+
+        /**
+         * Given a name and measurement, looks up a slot in the hash table by hashing
+         * the city name as a key, then applies the measurement to the accumulated
+         * aggregation of that city's measurements. Unknown names claim an empty slot.
+         */
+        private final void record(byte[] buffer, int nameStart, int nameEnd, int measurement) {
+            int nameLength = nameEnd - nameStart;
+
+            /**
+             * The length of most city names will not be a multiple of the SIMD
+             * vector length so there will be a remainder in the final vector
+             * of extraneous bytes. We need to mask these bytes out when comparing.
+             */
+            var remainder = nameLength % SPECIES.length();
+
+            var numVectors = nameLength / SPECIES.length() + (remainder == 0 ? 0 : 1);
+
+            /**
+             * Look up the slot index in the hash table for the city name.
+             */
+            var slotIndex = nameToSlotIndex(buffer, nameStart, nameLength);
+
+            /**
+             * Identify if the slot is occupied (a stored vector offset of 0 means
+             * empty), then check the equality of the slot with the city name.
+             */
+            var vectorOffset = unsafe.getShort(slotsAddress + slotIndex * 2);
+            while (vectorOffset != 0) {
+
+                /**
+                 * Check that the set of vectors in the slot matches the city name.
+                 */
+                if (slotEquals(buffer, nameStart, vectorOffset, numVectors, remainder, slotIndex)) {
+
+                    /**
+                     * Check the length of the slot name and city name match. This
+                     * check is needed because the vector equality check can give
+                     * false positives if one city name starts with another.
+                     */
+                    byte slotNameLength = unsafe.getByte(lengthsAddress + slotIndex);
+                    if (slotNameLength == nameLength) {
+                        updateSlot(slotIndex, measurement);
+                        break;
+                    }
+                }
+
+                /**
+                 * If the slot is occupied but the city name doesn't match, then
+                 * we try the next slot in the hash table through linear probing.
+                 */
+                slotIndex = (slotIndex + 1) % NUM_SLOTS;
+                vectorOffset = unsafe.getShort(slotsAddress + slotIndex * 2);
+            }
+
+            /**
+             * vectorOffset is 0 only if probing reached an empty slot (a match leaves
+             * it non-zero): initialise that slot with the city name and measurement.
+             */
+            if (vectorOffset == 0) {
+                /**
+                 * Store the byte length of the city name for this slot.
+                 */
+                unsafe.putByte(lengthsAddress + slotIndex, (byte) nameLength);
+
+                // Point the slot at the first of this name's vectors.
+                // NOTE(review): nextVectorIndex is a short, so it wraps negative after 32767
+                // stored vectors (possible with many long names) - confirm expected input size.
+                unsafe.putShort(slotsAddress + slotIndex * 2, nextVectorIndex);
+
+                /**
+                 * Store the vectors that make up the city name.
+                 */
+                for (int v = 0; v < numVectors; v++) {
+                    vectors[nextVectorIndex] = ByteVector.fromArray(SPECIES, buffer, nameStart + v * SPECIES.length());
+                    nextVectorIndex++;
+                }
+
+                cityVectorLookup.put(new String(buffer, nameStart, nameLength), slotIndex);
+
+                /**
+                 * Seed the aggregates in the order min, max, count, sum.
+                 */
+                var numbersIndex = getNumbersIndex(slotIndex);
+                unsafe.putInt(numbersIndex, measurement);
+                unsafe.putInt(numbersIndex + 4, measurement);
+                unsafe.putInt(numbersIndex + 8, 1);
+                unsafe.putInt(numbersIndex + 12, measurement);
+
+                cityNames[slotIndex] = new String(buffer, nameStart, nameLength);
+            }
+        }
+
+        /**
+         * Hashes a city name to a slot index in [0, NUM_SLOTS) by multiplying up to
+         * its first twelve bytes (with int wrap-around). The multiplication is
+         * unrolled by hand; buffer[nameStart] is always included.
+         * The product is reduced modulo NUM_SLOTS before taking the absolute value:
+         * Math.abs(Integer.MIN_VALUE) is still negative, so the previous order could
+         * yield a negative index (e.g. for the name "@@@@@b") and an out-of-bounds
+         * off-heap access. All other products map to the same slot as before.
+         */
+        private final int nameToSlotIndex(byte[] buffer, int nameStart, int nameLength) {
+            var integer = 1;
+            integer *= buffer[nameStart + 0];
+            if (nameLength > 1) {
+                integer *= buffer[nameStart + 1];
+                if (nameLength > 2) {
+                    integer *= buffer[nameStart + 2];
+                    if (nameLength > 3) {
+                        integer *= buffer[nameStart + 3];
+                        if (nameLength > 4) {
+                            integer *= buffer[nameStart + 4];
+                            if (nameLength > 5) {
+                                integer *= buffer[nameStart + 5];
+                                if (nameLength > 6) {
+                                    integer *= buffer[nameStart + 6];
+                                    if (nameLength > 7) {
+                                        integer *= buffer[nameStart + 7];
+                                        if (nameLength > 8) {
+                                            integer *= buffer[nameStart + 8];
+                                            if (nameLength > 9) {
+                                                integer *= buffer[nameStart + 9];
+                                                if (nameLength > 10) {
+                                                    integer *= buffer[nameStart + 10];
+                                                    if (nameLength > 11) {
+                                                        integer *= buffer[nameStart + 11];
+                                                    }
+                                                }
+                                            }
+                                        }
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+            return Math.abs(integer % NUM_SLOTS);
+        }
+
+        /**
+         * Folds one measurement into the aggregates stored for a slot. A slot's
+         * record is four consecutive ints: min, max, count and sum.
+         */
+        private final void updateSlot(int slotIndex, int measurement) {
+            final long minAt = getNumbersIndex(slotIndex);
+            final long maxAt = minAt + 4;
+            final long countAt = minAt + 8;
+            final long sumAt = minAt + 12;
+
+            unsafe.putInt(minAt, Math.min(unsafe.getInt(minAt), measurement));
+            unsafe.putInt(maxAt, Math.max(unsafe.getInt(maxAt), measurement));
+            unsafe.putInt(countAt, unsafe.getInt(countAt) + 1);
+            // Plain int addition: the sum wraps around on overflow.
+            unsafe.putInt(sumAt, unsafe.getInt(sumAt) + measurement);
+        }
+
+        /**
+         * Compares the name in the buffer with a slot's stored name, vector by vector.
+         *
+         * When the name length is not a multiple of the vector length, the final
+         * vector loaded from the buffer also contains bytes that follow the name.
+         * Those lanes are excluded from the comparison with a precomputed mask.
+         *
+         * @param buffer       bytes of the current chunk
+         * @param nameStart    index of the first byte of the name in buffer
+         * @param vectorOffset index in the vectors array of the slot's first vector
+         * @param numVectors   number of vectors needed to cover the name
+         * @param remainder    name length modulo the vector length
+         * @param slotIndex    unused
+         * @return true if every compared lane is equal
+         */
+        private final boolean slotEquals(byte[] buffer, int nameStart, int vectorOffset, int numVectors, int remainder, int slotIndex) {
+            final int lanes = SPECIES.length();
+            for (int v = 0; v < numVectors; v++) {
+                final ByteVector candidate = ByteVector.fromArray(SPECIES, buffer, nameStart + v * lanes);
+                final ByteVector stored = vectors[vectorOffset + v];
+                final boolean partial = v == numVectors - 1 && remainder != 0;
+                final boolean same;
+                if (partial) {
+                    final VectorMask<Byte> mask = MASKS[remainder - 1];
+                    same = stored.compare(VectorOperators.EQ, candidate, mask).equals(mask);
+                }
+                else {
+                    same = stored.eq(candidate).allTrue();
+                }
+                if (!same) {
+                    return false;
+                }
+            }
+            return true;
+        }
+
+        /**
+         * Returns the off-heap address of a slot's 16-byte min/max/count/sum record.
+         */
+        private final long getNumbersIndex(int slotIndex) {
+            final int byteOffset = slotIndex * 16;
+            return numbersAddress + byteOffset;
+        }
+
+        public void collectResults(TreeMap<String, CityResult> results) {
+            for (var entry : cityVectorLookup.entrySet()) {
+                var city = entry.getKey();
+                var slotIndex = entry.getValue();
+                var numbersIndex = getNumbersIndex(slotIndex);
+                var min = unsafe.getInt(numbersIndex);
+                var max = unsafe.getInt(numbersIndex + 4);
+                var count = unsafe.getInt(numbersIndex + 8);
+                var sum = unsafe.getInt(numbersIndex + 12);
+                results.compute(city, (k, v) -> {
+                    if (v == null) {
+                        return new CityResult(min, max, sum, count);
+                    }
+                    else {
+                        v.min = Math.min(v.min, min);
+                        v.max = Math.max(v.max, max);
+                        v.sum += sum;
+                        v.count += count;
+                        return v;
+                    }
+                });
+            }
+        }
+
+        /**
+         * Generates a lookup table of vector masks to use when comparing equality of
+         * the last vector of the source line and a given slot in the hash table.
+         */
+        private static final VectorMask<Byte>[] generateMasks(VectorSpecies<Byte> species) {
+            VectorMask<Byte>[] masks = new VectorMask[species.length() - 1];
+            masks[0] = VectorMask.fromArray(species, new boolean[]{ true, false, false, false, false, false, false, false }, 0);
+            masks[1] = VectorMask.fromArray(species, new boolean[]{ true, true, false, false, false, false, false, false }, 0);
+            masks[2] = VectorMask.fromArray(species, new boolean[]{ true, true, true, false, false, false, false, false }, 0);
+            masks[3] = VectorMask.fromArray(species, new boolean[]{ true, true, true, true, false, false, false, false }, 0);
+            masks[4] = VectorMask.fromArray(species, new boolean[]{ true, true, true, true, true, false, false, false }, 0);
+            masks[5] = VectorMask.fromArray(species, new boolean[]{ true, true, true, true, true, true, false, false }, 0);
+            masks[6] = VectorMask.fromArray(species, new boolean[]{ true, true, true, true, true, true, true, false }, 0);
+            return masks;
+        }
+
+        /** Obtains the Unsafe instance by reflecting on the static theUnsafe field. */
+        private static final Unsafe getUnsafe() {
+            try {
+                final Field singleton = Unsafe.class.getDeclaredField("theUnsafe");
+                singleton.setAccessible(true);
+                return (Unsafe) singleton.get(null);
+            }
+            catch (NoSuchFieldException | SecurityException | IllegalArgumentException | IllegalAccessException cause) {
+                throw new RuntimeException("Failed to get unsafe", cause);
+            }
+        }
+    }
+
+    static final class CityResult { // mutable per-city aggregate; min, max and sum are 10x the real values
+        public int min; // smallest measurement seen
+        public int max; // largest measurement seen
+        public int sum; // int total of all measurements; wraps on overflow
+        public int count; // number of measurements
+
+        public CityResult(int min, int max, int sum, int count) {
+            this.min = min;
+            this.max = max;
+            this.sum = sum;
+            this.count = count;
+        }
+    }
+
+    static final class FileRange { // byte range of the file assigned to one thread
+        public final long start; // first byte of the range (inclusive)
+        public final long end; // end of the range (exclusive)
+
+        public FileRange(long start, long end) {
+            this.start = start;
+            this.end = end;
+        }
+    }
+}
\ No newline at end of file

From 68e859d3eb8e17df41235c3165a0843e4326dcde Mon Sep 17 00:00:00 2001
From: Gunnar Morling <gunnar.morling@googlemail.com>
Date: Thu, 1 Feb 2024 17:16:28 +0100
Subject: [PATCH 264/268] Leaderboard update

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index f9c044b57..245a12a6c 100644
--- a/README.md
+++ b/README.md
@@ -111,6 +111,7 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:06.654 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jbachorik.java)| 21.0.1-graal | [Jaroslav Bachorik](https://github.com/jbachorik) |  |
 |   | 00:06.715 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_algirdasrascius.java)| 21.0.1-open | [Algirdas Raščius](https://github.com/algirdasrascius) |  |
 |   | 00:06.884 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_rcasteltrione.java)| 21.0.1-graal | [rcasteltrione](https://github.com/rcasteltrione) |  |
+|   | 00:06.982 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ChrisBellew.java)| 21.0.1-open | [Chris Bellew](https://github.com/ChrisBellew) |  |
 |   | 00:07.563 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_3j5a.java)| 21.0.1-graal | [3j5a](https://github.com/3j5a) |  |
 |   | 00:07.680 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_C5H12O5.java)| 21.0.1-graal | [Xylitol](https://github.com/C5H12O5) | uses Unsafe |
 |   | 00:07.712 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_anitasv.java)| 21.0.1-graal | [Anita SV](https://github.com/anitasv) |  |

From a78c1fc973a9acdf0b683aff2842f5a24ed20ada Mon Sep 17 00:00:00 2001
From: Smoofie <62144827+Smoofie@users.noreply.github.com>
Date: Thu, 1 Feb 2024 19:32:54 +0100
Subject: [PATCH 265/268] Submission for Smoofie (#701)

* Smoofie solution. Kinda slow but it was fun :)

* Format according to mvn build

* Fix semicolon detection, which cause invalid temperature parsing and subsequently segmentation faults due to counter addressing
---
 calculate_average_Smoofie.sh                  |  19 +
 prepare_Smoofie.sh                            |  20 +
 .../onebrc/CalculateAverage_Smoofie.java      | 457 ++++++++++++++++++
 3 files changed, 496 insertions(+)
 create mode 100755 calculate_average_Smoofie.sh
 create mode 100755 prepare_Smoofie.sh
 create mode 100644 src/main/java/dev/morling/onebrc/CalculateAverage_Smoofie.java

diff --git a/calculate_average_Smoofie.sh b/calculate_average_Smoofie.sh
new file mode 100755
index 000000000..3688c3a34
--- /dev/null
+++ b/calculate_average_Smoofie.sh
@@ -0,0 +1,19 @@
+#!/bin/sh
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+JAVA_OPTS="--enable-preview" # the class maps the file through java.lang.foreign.Arena; presumably still a preview API on the targeted JDK — confirm
+java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_Smoofie # $JAVA_OPTS is unquoted, so the shell word-splits it into separate arguments
diff --git a/prepare_Smoofie.sh b/prepare_Smoofie.sh
new file mode 100755
index 000000000..4cda7b411
--- /dev/null
+++ b/prepare_Smoofie.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+# Uncomment below to use sdk
+# source "$HOME/.sdkman/bin/sdkman-init.sh"
+# sdk use java 21.0.1-graal 1>&2
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_Smoofie.java b/src/main/java/dev/morling/onebrc/CalculateAverage_Smoofie.java
new file mode 100644
index 000000000..4b7533744
--- /dev/null
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_Smoofie.java
@@ -0,0 +1,457 @@
+/*
+ *  Copyright 2023 The original authors
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package dev.morling.onebrc;
+
+import sun.misc.Unsafe;
+
+import java.io.IOException;
+import java.io.RandomAccessFile;
+import java.lang.foreign.Arena;
+import java.nio.channels.FileChannel;
+import java.nio.charset.StandardCharsets;
+import java.util.TreeMap;
+import java.util.concurrent.Executors;
+import java.util.stream.IntStream;
+
+public class CalculateAverage_Smoofie {
+
+    private static final String FILE = "./measurements.txt";
+    private static final Unsafe unsafe = getUnsafe();
+
+    private static class MeasurementAggregator { // Per-city min/mean/max accumulator; temperatures are kept as integer tenths of a degree.
+        private int min = -1000; // sentinel meaning "not set yet"; main() tests for exactly this value
+        private int max = 1000; // sentinel meaning "not set yet"; main() tests for exactly this value
+        private long sum = 0; // signed sum of all readings, in tenths
+        private int count = 0; // number of readings
+
+        @Override
+        public String toString() { // formats "min/mean/max" in whole degrees; with count == 0 the mean is a 0/0 floating-point division
+            return ((double) min) / 10 + "/" + round(sum / 10.0 / count) + "/" + ((double) max) / 10;
+        }
+
+        private double round(double value) { // rounds to one decimal place using Math.round
+            return Math.round(value * 10.0) / 10.0;
+        }
+    }
+
+    private static final class CountResult { // What one worker hands back to main(): the addresses of its off-heap tables and its cursors.
+        private final long cityHashTableAddress; // base address of the hash table of 32-byte entries
+        private final long countsAddress; // base address of the counters, 8000 bytes per city id
+        private int cityIdCounter; // next unused city id; advanced by main() while merging
+        private long nextCollisionAddress; // next unused overflow entry; advanced by main() while merging
+
+        private CountResult(
+                            // Hash-table entry layout (32 bytes): cityId|cityLength|cityNameAddress|nextElementAddress|cityCountsAddress
+                            // accessed by main() at byte offsets 0 (int), 4 (short), 6 (long), 14 (long) and 22 (long)
+                            long cityHashTableAddress,
+                            long countsAddress,
+                            int cityIdCounter,
+                            long nextCollisionAddress) {
+            this.cityHashTableAddress = cityHashTableAddress;
+            this.countsAddress = countsAddress;
+            this.cityIdCounter = cityIdCounter;
+            this.nextCollisionAddress = nextCollisionAddress;
+        }
+
+    }
+
+    private static int hash(long cityNameAddress, short cityLength) {
+        // Returns a 16-bit slot index for the cityLength name bytes at the off-heap address cityNameAddress; the bytes are
+        // copied into a zero-filled long[] first, so whatever follows the name in memory never influences the result.
+        if (cityLength < 17) {
+            long[] city = new long[2]; // 16 bytes: enough for every name of up to 16 bytes
+            unsafe.copyMemory(null, cityNameAddress, city, Unsafe.ARRAY_LONG_BASE_OFFSET, cityLength);
+            long hash = city[0] ^ (city[1] >> 1);
+            int foldedHash = (int) (hash ^ (hash >>> 31));
+            return (foldedHash & foldedHash >>> 15) & 0xffff; // '>>>' binds tighter than '&'; masked to 16 bits
+        }
+        else {
+            // Parenthesised on purpose: '+' binds tighter than '>>', so "cityLength >> 3 + 1" means cityLength >> 4 and
+            // yields fewer than cityLength bytes of array for the copy below; (cityLength >> 3) + 1 longs always suffice.
+            long[] city = new long[(cityLength >> 3) + 1];
+            unsafe.copyMemory(null, cityNameAddress, city, Unsafe.ARRAY_LONG_BASE_OFFSET, cityLength);
+            long hash = city[0];
+            for (int i = 1; i < city.length; i++) {
+                hash ^= city[i]; // XOR-fold every 8-byte word of the name
+            }
+            int foldedHash = (int) (hash ^ (hash >>> 30));
+            return (foldedHash & foldedHash >>> 15) & 0xffff; // same 16-bit reduction as the short-name branch
+        }
+    }
+
+    private static Unsafe getUnsafe() { // Returns the Unsafe instance read reflectively from the static field "theUnsafe".
+        try {
+            var field = Unsafe.class.getDeclaredField("theUnsafe"); // looked up by name on Unsafe itself
+            field.setAccessible(true); // suppress the access check before reading the field
+            return (Unsafe) field.get(null); // null receiver: the field is read as a static
+        }
+        catch (NoSuchFieldException | IllegalAccessException e) {
+            throw new RuntimeException(e); // rethrown unchecked with the cause preserved
+        }
+    }
+
+    private static long locateSemicolon(long input) { // Marks the bytes of input equal to ';' (0x3B) with their high bit; returns 0 when there is none.
+        long semiXor = input ^ 0x3B3B3B3B3B3B3B3BL; // every byte that was ';' becomes 0x00
+        return (semiXor - 0x0101010101010101L) & ~semiXor & 0x8080808080808080L; // zero-byte test: sets bit 7 of each zero byte; the borrow may also mark a 0x01 byte above a match, so only the lowest set bit is exact
+    }
+
+    public static void main(String[] args) throws IOException, InterruptedException {
+        // Maps the input file, counts every (city, temperature) pair per worker thread in off-heap tables, merges them and prints the result.
+        var numberOfThreads = Runtime.getRuntime().availableProcessors();
+        var executorService = Executors.newFixedThreadPool(numberOfThreads);
+        var resultMap = new TreeMap<String, MeasurementAggregator>();
+        var subCountResults = new CountResult[numberOfThreads];
+
+        try (RandomAccessFile randomAccessFile = new RandomAccessFile(FILE, "r");
+                FileChannel fileChannel = randomAccessFile.getChannel()) {
+
+            long fileSize = randomAccessFile.length();
+            if (fileSize < numberOfThreads * 1024) {
+                numberOfThreads = fileSize < 1024 ? 1 : (int) (fileSize / 1024); // small file: at most one worker per KiB
+            }
+            long chunkSize = fileSize / numberOfThreads;
+
+            long inputFileAddress = fileChannel.map(FileChannel.MapMode.READ_ONLY, 0, fileSize, Arena.global()).address();
+            final long[] inputFileMemoryOffsets = new long[numberOfThreads + 1]; // absolute chunk boundaries inside the mapping
+            inputFileMemoryOffsets[0] = inputFileAddress;
+            inputFileMemoryOffsets[numberOfThreads] = inputFileAddress + fileSize;
+            for (long i = inputFileAddress + chunkSize, index = 1; index < numberOfThreads; i += chunkSize, index++) {
+                while (unsafe.getByte(i++) != '\n')
+                    ;
+                inputFileMemoryOffsets[(int) index] = i; // first byte after the newline, i.e. the start of a line
+            }
+
+            for (int i = 0; i < numberOfThreads; i++) {
+                final long start = inputFileMemoryOffsets[i];
+                final long end = inputFileMemoryOffsets[i + 1];
+
+                final int threadIndex = i;
+                executorService.execute(() -> {
+                    var cityHashTableAddress = unsafe.allocateMemory(75536 * 32); // 65536 primary slots + 10000 overflow entries, 32 bytes each
+                    unsafe.setMemory(cityHashTableAddress, 75536 * 32, (byte) 0);
+                    long nextCollisionAddress = cityHashTableAddress + (65536 << 5); // overflow entries start right after the primary slots
+
+                    var countsAddress = unsafe.allocateMemory(10000 * 2 * 1000 * 4); // 8000 bytes per city id; not zeroed here, each city's block is zeroed when its id is assigned
+                    int cityId;
+                    int temperature = 0;
+                    int cityIdCounter = 0;
+                    long position = start;
+                    byte c;
+                    long input;
+                    long inputSemicolon;
+                    int cityHash;
+                    long hashAddress;
+                    short cityLength;
+                    long cityStart;
+                    long temperatureAddress;
+                    while (position < end) {
+                        cityStart = position;
+                        input = unsafe.getLong(position); // NOTE(review): 8-byte reads; near the end of the mapping this may touch bytes past fileSize
+                        inputSemicolon = locateSemicolon(input);
+                        if (inputSemicolon == 0) {
+                            position += 8;
+                            input = unsafe.getLong(position);
+                            inputSemicolon = locateSemicolon(input);
+
+                            // probably not gonna happen very often
+                            while (inputSemicolon == 0) {
+                                position += 8;
+                                input = unsafe.getLong(position);
+                                inputSemicolon = locateSemicolon(input);
+                            }
+                        }
+                        position += Long.numberOfTrailingZeros(inputSemicolon) >> 3; // byte index of the first ';' inside the word
+
+                        cityLength = (short) (position - cityStart);
+
+                        cityHash = hash(cityStart, cityLength);
+                        hashAddress = cityHashTableAddress + ((long) cityHash << 5);
+                        cityId = -1;
+                        outer: for (;;) {
+                            if (cityLength != unsafe.getShort(hashAddress + 4)) {
+                                if (unsafe.getShort(hashAddress + 4) == 0) {
+                                    // new hash slot init
+                                    cityId = cityIdCounter++;
+                                    unsafe.setMemory(countsAddress + cityId * 8000, 8000, (byte) 0);
+                                    unsafe.putInt(hashAddress, cityId);
+                                    unsafe.putShort(hashAddress + 4, cityLength);
+                                    unsafe.putLong(hashAddress + 6, cityStart);
+                                    unsafe.putLong(hashAddress + 22, countsAddress + cityId * 8000);
+                                    break;
+                                }
+                                if (unsafe.getLong(hashAddress + 14) != 0) {
+                                    hashAddress = unsafe.getLong(hashAddress + 14); // follow the collision chain
+                                    continue;
+                                }
+                                break; // end of chain, cityId stays -1
+                            }
+                            long cityNameAddress = unsafe.getLong(hashAddress + 6);
+                            int j;
+                            for (j = 0; j < cityLength >> 3 << 3; j += 8) { // compare the whole 8-byte words of the name
+                                if (unsafe.getLong(cityStart + j) != unsafe.getLong(cityNameAddress + j)) {
+                                    if (unsafe.getLong(hashAddress + 14) != 0) {
+                                        hashAddress = unsafe.getLong(hashAddress + 14);
+                                        continue outer;
+                                    }
+                                    break outer;
+                                }
+                            }
+                            if (j < cityLength) { // compare the remaining 1..7 bytes; the left shift discards the bytes after the name
+                                if ((unsafe.getLong(cityStart + j) << ((0x8 - cityLength & 0x7) << 3)) != (unsafe
+                                        .getLong(cityNameAddress + j) << ((0x8 - cityLength & 0x7) << 3))) {
+                                    if (unsafe.getLong(hashAddress + 14) != 0) {
+                                        hashAddress = unsafe.getLong(hashAddress + 14);
+                                        continue;
+                                    }
+                                    break;
+                                }
+                            }
+                            cityId = unsafe.getInt(hashAddress);
+                            break;
+                        }
+
+                        if (cityId == -1) {
+                            // collision
+                            cityId = cityIdCounter++;
+                            unsafe.setMemory(countsAddress + cityId * 8000, 8000, (byte) 0);
+                            unsafe.putLong(hashAddress + 14, nextCollisionAddress); // link the new entry behind the last one visited
+                            hashAddress = nextCollisionAddress;
+                            nextCollisionAddress += 32;
+                            unsafe.putInt(hashAddress, cityId);
+                            unsafe.putShort(hashAddress + 4, cityLength);
+                            unsafe.putLong(hashAddress + 6, cityStart);
+                            unsafe.putLong(hashAddress + 22, countsAddress + cityId * 8000);
+                        }
+
+                        position++; // skip semicolon
+
+                        // Parse the reading as an integer number of tenths (the '.' is skipped) and bump its counter: each city owns
+                        // 2000 int counters, indices 0..999 for non-negative values and 1000..1999 for the magnitudes of negative ones.
+                        temperature = 0;
+                        c = unsafe.getByte(position++);
+                        if (c == '-') {
+                            while ((c = unsafe.getByte(position++)) != '\n') {
+                                if (c != '.') {
+                                    temperature = temperature * 10 + (c ^ 0x30);
+                                }
+                            }
+                            temperatureAddress = unsafe.getLong(hashAddress + 22) + (1000 + temperature) * 4;
+                            unsafe.putInt(temperatureAddress, unsafe.getInt(temperatureAddress) + 1);
+                        }
+                        else {
+                            temperature = c - '0';
+                            while ((c = unsafe.getByte(position++)) != '\n') {
+                                if (c != '.') {
+                                    temperature = temperature * 10 + (c ^ 0x30);
+                                }
+                            }
+
+                            temperatureAddress = unsafe.getLong(hashAddress + 22) + temperature * 4;
+                            unsafe.putInt(temperatureAddress, unsafe.getInt(temperatureAddress) + 1);
+                        }
+                    }
+                    subCountResults[threadIndex] = new CountResult(cityHashTableAddress, countsAddress, cityIdCounter, nextCollisionAddress);
+                });
+            }
+
+            executorService.shutdown();
+            executorService.awaitTermination(120, java.util.concurrent.TimeUnit.SECONDS); // NOTE(review): the boolean result is ignored; after a timeout subCountResults may still hold null entries
+
+            // aggregate results 1..n to 0
+            var subCountA = subCountResults[0];
+            for (int r = 1; r < numberOfThreads; r++) {
+                CountResult subCountB = subCountResults[r];
+                for (int i = 0; i < 65536; i++) {
+                    long bHashAddress = subCountB.cityHashTableAddress + ((long) i << 5);
+                    if (unsafe.getShort(bHashAddress + 4) == 0) {
+                        continue;
+                    }
+                    long aHashAddress = subCountA.cityHashTableAddress + ((long) i << 5);
+                    // check if a initialized
+                    if (unsafe.getShort(aHashAddress + 4) == 0) {
+                        // new hash slot init
+                        for (long addressA = aHashAddress, addressB = bHashAddress; addressB != 0;) {
+                            int newCityId = subCountA.cityIdCounter++;
+                            unsafe.setMemory(subCountA.countsAddress + newCityId * 8000L, 8000, (byte) 0); // A never counted this city: clear its counters before B's are added below
+                            unsafe.putInt(addressA, newCityId);
+                            unsafe.putShort(addressA + 4, unsafe.getShort(addressB + 4));
+                            unsafe.putLong(addressA + 6, unsafe.getLong(addressB + 6));
+                            addressB = unsafe.getLong(addressB + 14);
+                            if (addressB != 0) {
+                                unsafe.putLong(addressA + 14, subCountA.nextCollisionAddress);
+                                addressA = subCountA.nextCollisionAddress;
+                                subCountA.nextCollisionAddress += 32;
+                            }
+                        }
+                    }
+                    else {
+                        // check to copy collision list too
+                        outerB: for (long addressB = bHashAddress; addressB != 0; addressB = unsafe.getLong(addressB + 14)) {
+                            short cityLength = unsafe.getShort(addressB + 4);
+                            long cityNameAddress = unsafe.getLong(addressB + 6);
+                            // compare to each city in A slot
+                            outerA: for (long aAddress = aHashAddress; aAddress != 0; aAddress = unsafe.getLong(aAddress + 14)) {
+                                if (unsafe.getShort(aAddress + 4) == cityLength) {
+                                    long aCityNameAddress = unsafe.getLong(aAddress + 6);
+                                    int j;
+                                    for (j = 0; j < cityLength >> 3 << 3; j += 8) {
+                                        if (unsafe.getLong(cityNameAddress + j) != unsafe.getLong(aCityNameAddress + j)) {
+                                            // nope, not the same, try next
+                                            continue outerA;
+                                        }
+                                    }
+                                    if (j == cityLength ||
+                                            (unsafe.getLong(cityNameAddress + j) << ((0x8 - cityLength & 0x7) << 3)) == (unsafe
+                                                    .getLong(aCityNameAddress + j) << ((0x8 - cityLength & 0x7) << 3))) {
+                                        // found the same city, continue with next city in B slot
+                                        continue outerB;
+                                    }
+                                }
+                            }
+                            // city not found in A slot, add it. It's a collision too
+                            long addressA = aHashAddress;
+                            while (unsafe.getLong(addressA + 14) != 0) {
+                                addressA = unsafe.getLong(addressA + 14);
+                            }
+                            unsafe.putLong(addressA + 14, subCountA.nextCollisionAddress);
+                            addressA = subCountA.nextCollisionAddress;
+                            subCountA.nextCollisionAddress += 32;
+
+                            int newCityId = subCountA.cityIdCounter++;
+                            unsafe.setMemory(subCountA.countsAddress + newCityId * 8000L, 8000, (byte) 0); // A never counted this city: clear its counters before B's are added below
+                            unsafe.putInt(addressA, newCityId);
+                            unsafe.putShort(addressA + 4, cityLength);
+                            unsafe.putLong(addressA + 6, cityNameAddress);
+                        }
+                    }
+                }
+
+                int[] cityIdMap = new int[10000]; // A's city id -> B's city id, -1 when B has no such city
+                java.util.Arrays.fill(cityIdMap, -1);
+
+                for (int i = 0; i < 65536; i++) {
+                    long bHashAddress = subCountB.cityHashTableAddress + ((long) i << 5);
+                    long aHashAddress = subCountA.cityHashTableAddress + ((long) i << 5);
+                    if (unsafe.getShort(aHashAddress + 4) == 0) {
+                        continue;
+                    }
+                    // for each city in A slot
+                    outerA: for (long aAddress = aHashAddress; aAddress != 0; aAddress = unsafe.getLong(aAddress + 14)) {
+                        short cityLength = unsafe.getShort(aAddress + 4);
+                        long cityNameAddress = unsafe.getLong(aAddress + 6);
+                        int cityIdA = unsafe.getInt(aAddress);
+                        // compare to each city in B slot
+                        outer: for (long bAddress = bHashAddress; bAddress != 0; bAddress = unsafe.getLong(bAddress + 14)) {
+                            if (unsafe.getShort(bAddress + 4) == cityLength) {
+                                long bCityNameAddress = unsafe.getLong(bAddress + 6);
+                                int j;
+                                for (j = 0; j < cityLength >> 3 << 3; j += 8) {
+                                    if (unsafe.getLong(cityNameAddress + j) != unsafe.getLong(bCityNameAddress + j)) {
+                                        // nope, not the same, try next
+                                        continue outer;
+                                    }
+                                }
+                                if (j == cityLength ||
+                                        (unsafe.getLong(cityNameAddress + j) << ((0x8 - cityLength & 0x7) << 3)) == (unsafe
+                                                .getLong(bCityNameAddress + j) << ((0x8 - cityLength & 0x7) << 3))) {
+                                    cityIdMap[cityIdA] = unsafe.getInt(bAddress);
+                                    // found the same city, continue with next city in A slot
+                                    continue outerA;
+                                }
+                            }
+                        }
+                    }
+                }
+
+                for (int i = 0; i < subCountA.cityIdCounter; i++) { // add B's counters onto A's for every city B also holds
+                    int cityId2 = cityIdMap[i];
+                    if (cityId2 != -1) {
+                        for (int j = 0; j < 2; j++) {
+                            for (int k = 0; k < 1000; k++) {
+                                unsafe.putInt(subCountA.countsAddress + i * 8000 + j * 4000 + k * 4,
+                                        unsafe.getInt(subCountA.countsAddress + i * 8000 + j * 4000 + k * 4) +
+                                                unsafe.getInt(subCountB.countsAddress + cityId2 * 8000 + j * 4000 + k * 4));
+                            }
+                        }
+                    }
+                }
+            }
+
+            var countResult = subCountResults[0];
+            var reverseCityIds = new String[10000]; // city id -> decoded name, null for unused ids
+            for (int i = 0; i < 65536; i++) {
+                long resultHashAddress = countResult.cityHashTableAddress + ((long) i << 5);
+                if (unsafe.getShort(resultHashAddress + 4) != 0) {
+                    for (long address = resultHashAddress; address != 0; address = unsafe.getLong(address + 14)) {
+                        int cityId = unsafe.getInt(address);
+                        int cityLength = unsafe.getShort(address + 4);
+                        long cityNameAddress = unsafe.getLong(address + 6);
+                        byte[] cityBytes = new byte[cityLength];
+                        unsafe.copyMemory(null, cityNameAddress, cityBytes, Unsafe.ARRAY_BYTE_BASE_OFFSET, cityLength);
+                        reverseCityIds[cityId] = new String(cityBytes, StandardCharsets.UTF_8);
+                    }
+                }
+            }
+
+            // count result as stream
+            IntStream.range(0, 10000).parallel().forEach(cityId -> {
+                var cityName = reverseCityIds[cityId];
+                if (cityName == null) {
+                    return;
+                }
+                var cityAddress = countResult.countsAddress + cityId * 8000;
+                var cityResult = new MeasurementAggregator();
+                for (int i = 999; i > -1; i--) { // minimum: the largest negative magnitude that occurred, if any
+                    if (unsafe.getInt(cityAddress + 4000 + i * 4) > 0) {
+                        cityResult.min = -i;
+                        break;
+                    }
+                }
+                if (cityResult.min == -1000) { // sentinel untouched: no negative readings, take the smallest non-negative one
+                    for (int i = 0; i < 1000; i++) {
+                        if (unsafe.getInt(cityAddress + i * 4) > 0) {
+                            cityResult.min = i;
+                            break;
+                        }
+                    }
+                }
+                for (int i = 999; i > -1; i--) { // maximum: the largest non-negative value that occurred, if any
+                    if (unsafe.getInt(cityAddress + i * 4) > 0) {
+                        cityResult.max = i;
+                        break;
+                    }
+                }
+                if (cityResult.max == 1000) { // sentinel untouched: only negative readings, take the smallest magnitude
+                    for (int i = 0; i < 1000; i++) {
+                        if (unsafe.getInt(cityAddress + 4000 + i * 4) > 0) {
+                            cityResult.max = -i;
+                            break;
+                        }
+                    }
+                }
+                for (int i = 0; i < 1000; i++) { // sum = value * occurrences, negatives subtracted; count = all occurrences
+                    cityResult.sum += ((long) unsafe.getInt(cityAddress + i * 4)) * i;
+                    cityResult.sum -= ((long) unsafe.getInt(cityAddress + 4000 + i * 4)) * i;
+                    cityResult.count += unsafe.getInt(cityAddress + i * 4);
+                    cityResult.count += unsafe.getInt(cityAddress + 4000 + i * 4);
+                }
+                synchronized (resultMap) { // TreeMap is written from several stream threads at once
+                    resultMap.put(cityName, cityResult);
+                }
+            });
+
+            System.out.println(resultMap);
+        }
+    }
+}

From ba20cd8439fdcdcd8c33fb6d3f9532afc07ade52 Mon Sep 17 00:00:00 2001
From: Gunnar Morling <gunnar.morling@googlemail.com>
Date: Thu, 1 Feb 2024 19:33:22 +0100
Subject: [PATCH 266/268] Leaderboard update.

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 245a12a6c..07248a9e9 100644
--- a/README.md
+++ b/README.md
@@ -118,6 +118,7 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:07.730 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_jotschi.java)| 21.0.1-open | [Johannes Schüth](https://github.com/jotschi) |  |
 |   | 00:07.894 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_tonivade.java)| 21.0.2-tem | [Antonio Muñoz](https://github.com/tonivade) |  |
 |   | 00:07.925 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ricardopieper.java)| 21.0.1-graal | [Ricardo Pieper](https://github.com/ricardopieper) |  |
+|   | 00:07.948 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_Smoofie.java)| java | [Smoofie](https://github.com/Smoofie) | uses Unsafe |
 |   | 00:08.157 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_JurenIvan.java)| 21.0.1-open | [JurenIvan](https://github.com/JurenIvan) |  |
 |   | 00:08.167 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ddimtirov.java)| 21.0.1-tem | [Dimitar Dimitrov](https://github.com/ddimtirov) |  |
 |   | 00:08.214 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_deemkeen.java)| 21.0.1-open | [deemkeen](https://github.com/deemkeen) |  |

From f02279df8c48a68acf56e138f400ad978ce1b047 Mon Sep 17 00:00:00 2001
From: Martin <shangqingxiaai@foxmail.com>
Date: Sat, 3 Feb 2024 04:04:30 +0800
Subject: [PATCH 267/268] martin2038: first submission (#665)

* first double as int

* - hashcode

* JAVA_OPTS empty

* native

* native

* CalculateAverage_melgenek
https://questdb.io/blog/building-faster-hash-table-high-performance-sql-joins/#fastmap-internals

* mvn formatting

* jvm model

* 10k name

* 10k name

* round mean

* limit ChunkSize  smaller than Integer.MAX_VALUE

---------

Co-authored-by: martin.cong <martin.cong@zhulinkeji.com>
---
 calculate_average_martin2038.sh               |  30 ++
 prepare_martin2038.sh                         |  26 ++
 .../onebrc/CalculateAverage_martin2038.java   | 337 ++++++++++++++++++
 3 files changed, 393 insertions(+)
 create mode 100755 calculate_average_martin2038.sh
 create mode 100755 prepare_martin2038.sh
 create mode 100644 src/main/java/dev/morling/onebrc/CalculateAverage_martin2038.java

diff --git a/calculate_average_martin2038.sh b/calculate_average_martin2038.sh
new file mode 100755
index 000000000..c141e2bdb
--- /dev/null
+++ b/calculate_average_martin2038.sh
@@ -0,0 +1,30 @@
+#!/bin/sh
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+
+if [ -f target/CalculateAverage_martin2038_image ]; then
+    echo "Picking up existing native image 'target/CalculateAverage_martin2038_image', delete the file to select JVM mode." 1>&2
+    target/CalculateAverage_martin2038_image
+else
+    # JVM fallback, taken whenever the native image file does not exist.
+    #JAVA_OPTS="--enable-preview"
+    echo "Chosing to run the app in JVM mode as no native image was found, use prepare_martin2038.sh to generate." 1>&2 # written to stderr; NOTE(review): the native-image step in prepare_martin2038.sh is commented out in this patch
+    # JAVA_OPTS="-XX:-EnableJVMCI -Xms16g -Xmx16g -XX:+AlwaysPreTouch -XX:+UnlockExperimentalVMOptions -XX:+UseEpsilonGC"
+    JAVA_OPTS=""
+    java $JAVA_OPTS --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CalculateAverage_martin2038
+
+fi
diff --git a/prepare_martin2038.sh b/prepare_martin2038.sh
new file mode 100755
index 000000000..cf8e83f77
--- /dev/null
+++ b/prepare_martin2038.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+#
+#  Copyright 2023 The original authors
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+# Select the GraalVM 21.0.2 JDK through SDKMAN (init script sourced from $HOME/.sdkman); sdk's output goes to stderr.
+source "$HOME/.sdkman/bin/sdkman-init.sh"
+sdk use java 21.0.2-graal 1>&2
+# Native-image build, currently disabled. NOTE(review): the guard tests target/CalculateAverage_martin2038 but -o writes ..._image - confirm before enabling.
+#if [ ! -f target/CalculateAverage_martin2038 ]; then
+#    MAIN=dev.morling.onebrc.CalculateAverage_martin2038
+#    NATIVE_IMAGE_OPTS="-H:+UnlockExperimentalVMOptions --initialize-at-build-time=$MAIN --gc=epsilon -O3 -march=native -R:MaxHeapSize=515m -H:-GenLoopSafepoints -H:InlineAllBonus=10 -H:-ParseRuntimeOptions"
+#    native-image $NATIVE_IMAGE_OPTS -cp target/average-1.0.0-SNAPSHOT.jar -o target/CalculateAverage_martin2038_image $MAIN
+#fi
\ No newline at end of file
diff --git a/src/main/java/dev/morling/onebrc/CalculateAverage_martin2038.java b/src/main/java/dev/morling/onebrc/CalculateAverage_martin2038.java
new file mode 100644
index 000000000..073f157c3
--- /dev/null
+++ b/src/main/java/dev/morling/onebrc/CalculateAverage_martin2038.java
@@ -0,0 +1,337 @@
+/*
+ *  Copyright 2023 The original authors
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package dev.morling.onebrc;
+
+import java.io.IOException;
+import java.io.RandomAccessFile;
+import java.lang.invoke.MethodHandles;
+import java.lang.invoke.VarHandle;
+import java.nio.ByteOrder;
+import java.nio.MappedByteBuffer;
+import java.nio.channels.FileChannel.MapMode;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+public class CalculateAverage_martin2038 {
+
+    // private static final String FILE = "/Users/martin/Garden/blog/1BRC/1brc/./measurements.txt";
+
+    private static final String FILE = "./measurements.txt";
+
+    /** Running min/max/sum/count of one station's readings, kept as integers in tenths of a degree. */
+    private static class MeasurementAggregator {
+        private int min = Integer.MAX_VALUE;
+        private int max = Integer.MIN_VALUE;
+        private long sum;
+        private int count;
+
+        void update(int temp) {
+            update(1, temp, temp, temp);
+        }
+
+        /** Adds {@code cnt} readings totalling {@code sm} and widens the min/max range where needed. */
+        void update(int cnt, long sm, int min, int max) {
+            this.count += cnt;
+            this.sum += sm;
+            this.min = Math.min(this.min, min);
+            this.max = Math.max(this.max, max);
+        }
+
+        /** Folds the totals of {@code it} into this aggregator. */
+        void merge(MeasurementAggregator it) {
+            update(it.count, it.sum, it.min, it.max);
+        }
+
+        /** Formats as min/mean/max with the stored tenths divided by ten and the mean rounded to one decimal. */
+        public String toString() {
+            double mean = sum / 10.0 / count;
+            return (min / 10f) + "/" + (Math.round(mean * 10) / 10f) + "/" + (max / 10f);
+        }
+    }
+
+    /**
+     * Maps the measurements file chunk by chunk, aggregates the chunks in parallel, merges the
+     * per-chunk maps and prints {@code {name=min/mean/max, ...}} sorted by station name.
+     *
+     * @param args unused
+     * @throws IOException if the measurements file cannot be opened, split into chunks or closed
+     */
+    public static void main(String[] args) throws IOException {
+        // Size of the per-chunk scratch buffer that holds one station name (or one number) at a time.
+        final int maxNameLength = 110;
+
+        // try-with-resources closes the file, and with it the channel, once the result has been printed.
+        try (var file = new RandomAccessFile(FILE, "r")) {
+            var fc = file.getChannel();
+            split(file).stream().parallel().map(ck -> {
+                // Per the author's measurement, StrFastHashKey is about 500 ms faster than String as the key.
+                var map = new HashMap<StrFastHashKey, MeasurementAggregator>(200);
+                try {
+                    var mb = fc.map(MapMode.READ_ONLY, ck.start, ck.length);
+                    var buff = new byte[maxNameLength];
+                    // Each record is read as a key up to ';' followed by a value up to '\n'.
+                    while (mb.hasRemaining()) {
+                        var name = readNextHashKey(buff, mb);
+                        var temp = readNextInt10Times(buff, mb);
+                        add2map(map, name, temp);
+                    }
+                }
+                catch (IOException | NumberFormatException e) {
+                    throw new RuntimeException(e);
+                }
+                return map;
+            }).reduce(CalculateAverage_martin2038::reduceMap).ifPresent(map -> {
+                var sb = new StringBuilder(map.size() * 100);
+                sb.append('{');
+                map.entrySet().stream().sorted(Map.Entry.comparingByKey())
+                        .forEachOrdered(kv -> sb.append(kv.getKey()).append('=').append(kv.getValue()).append(", "));
+                // Strip the trailing ", " only when an entry was written, so an empty result prints "{}".
+                if (sb.length() > 1) {
+                    sb.setLength(sb.length() - 2);
+                }
+                sb.append('}');
+                System.out.println(sb);
+            });
+        }
+    }
+
+    static <Key> HashMap<Key, MeasurementAggregator> reduceMap(HashMap<Key, MeasurementAggregator> aMap, HashMap<Key, MeasurementAggregator> bMap) {
+        // Fold aMap into bMap: keys missing from bMap adopt aMap's aggregator, shared keys are merged.
+        for (var entry : aMap.entrySet()) {
+            var target = bMap.get(entry.getKey());
+            if (target != null) {
+                target.merge(entry.getValue());
+                continue;
+            }
+            bMap.put(entry.getKey(), entry.getValue());
+        }
+        return bMap;
+    }
+
+    static <Key> void add2map(Map<Key, MeasurementAggregator> map, Key name, int temp) {
+        var found = map.get(name); // explicit get/put: the author measured it about 1 s faster than computeIfAbsent
+        if (found != null) {
+            found.update(temp);
+            return;
+        }
+        var fresh = new MeasurementAggregator();
+        map.put(name, fresh);
+        fresh.update(temp);
+    }
+
+    /** Byte range of the input file: start offset and length, as passed to FileChannel.map in main. */
+    record FileChunk(long start, long length) {}
+
+    /**
+     * Splits the file into consecutive chunks that each end right after a '\n'; the last one ends at end of file.
+     * Up to max(availableProcessors, length / Integer.MAX_VALUE + 1) chunks are produced.
+     * The second term exists because FileChannel.map rejects sizes above Integer.MAX_VALUE.
+     * @param file the measurements file; its file pointer is moved while scanning for line ends
+     * @return the chunks in file order (none for an empty file)
+     */
+    static List<FileChunk> split(RandomAccessFile file) throws IOException {
+        long total = file.length();
+        var threadNum = Math.max((int) (total / Integer.MAX_VALUE + 1), Runtime.getRuntime().availableProcessors());
+        long avgChunkSize = total / threadNum;
+        long lastStart = 0;
+        var list = new ArrayList<FileChunk>(threadNum);
+        // Stop cutting once the next probe would sit at or past the end; seeking there used to raise EOFException on small files.
+        for (var i = 0; i < threadNum - 1 && lastStart + avgChunkSize < total; i++) {
+            file.seek(lastStart + avgChunkSize);
+            for (int b = file.read(); b != -1 && b != '\n'; b = file.read()) {
+                // read() yields -1 at end of file instead of throwing, so an unterminated last line ends the scan too
+            }
+            long end = file.getFilePointer(); // just past the '\n', or the end of the file
+            list.add(new FileChunk(lastStart, end - lastStart));
+            lastStart = end;
+        }
+        if (lastStart < total) {
+            list.add(new FileChunk(lastStart, total - lastStart));
+        }
+        return list;
+    }
+
+    static StrFastHashKey readNextHashKey(byte[] buf, MappedByteBuffer mb) {
+        // The first byte is stored without being compared to ';'; the rest is copied up to the ';', which is consumed.
+        buf[0] = mb.get();
+        int len = 1;
+        for (byte next = mb.get(); next != ';'; next = mb.get()) {
+            buf[len++] = next;
+        }
+        return new StrFastHashKey(buf, len);
+    }
+
+    static String readNextString(byte[] buf, MappedByteBuffer mb) {
+        // Same scan as readNextHashKey, but the bytes before ';' are decoded into a String with the default charset.
+        buf[0] = mb.get();
+        int len = 1;
+        for (byte next = mb.get(); next != ';'; next = mb.get()) {
+            buf[len++] = next;
+        }
+        return new String(buf, 0, len);
+    }
+
+    // Reads one temperature of the form [-]digits.digit terminated by '\n' and returns it times ten ("-3.2" gives -32).
+    // Adapted from CalculateAverage_3j5a as a replacement for Double.parse; the author measured 38 s -> 5418 ms.
+    // buf is scratch space for the raw bytes; mb is left positioned just past the '\n'.
+    static int readNextInt10Times(byte[] buf, MappedByteBuffer mb) {
+        final int min_number_len = 3; // the shortest number, such as "1.2", so three bytes are fetched at once
+        int i = min_number_len;
+        mb.get(buf, 0, i);
+        byte b;
+        while ((b = mb.get()) != '\n') {
+            buf[i++] = b;
+        }
+        // buf[0..i) now holds e.g. "-3.2"; it is decoded from the last digit backwards.
+        var zeroAscii = '0';
+        int temperature = buf[--i] - zeroAscii;
+        i--; // skipping dot
+        var base = 10;
+        while (i > 0) {
+            b = buf[--i];
+            if (b == '-') {
+                temperature = -temperature;
+            }
+            else {
+                temperature = base * (b - zeroAscii) + temperature;
+                base *= 10; // next decimal place; squaring (10 -> 100 -> 10000) misplaced a third integer digit
+            }
+        }
+        return temperature;
+    }
+
+    // static String readNext(RandomAccessFile file, char endFlag,int initLength) throws IOException {
+    // StringBuilder input = new StringBuilder(initLength);
+    // int c = -1;
+    // //boolean eol = false;
+    //
+    // while (true) {
+    // c = file.read();
+    // if( c == endFlag || c == -1) {
+    // break;
+    // }
+    // input.append((char)c);
+    // }
+    //
+    // //if ((c == -1) && (input.length() == 0)) {
+    // // return null;
+    // //}
+    // return input.toString();
+    // }
+
+    /** Map key holding a private copy of the station name bytes plus their precomputed FNV-1a hash. */
+    static class StrFastHashKey implements Comparable<StrFastHashKey> {
+        final byte[] name;
+        final int hash;
+
+        String nameStr; // decoded lazily by toString() and then cached
+
+        /** Copies the first {@code size} bytes of {@code buf}, so the caller can keep reusing its buffer. */
+        StrFastHashKey(byte[] buf, int size) {
+            name = new byte[size];
+            System.arraycopy(buf, 0, name, 0, size);
+            // The author measured FNV-1a as 100+ ms faster than calculateHash.
+            hash = hashFNV1a(name, size);
+        }
+
+        @Override
+        public boolean equals(Object o) {
+            // instanceof also rejects null and foreign types, which the former unchecked cast answered with an exception.
+            return o instanceof StrFastHashKey that && hash == that.hash && Arrays.equals(name, that.name);
+        }
+
+        @Override
+        public int hashCode() {
+            return hash;
+        }
+
+        @Override
+        public String toString() {
+            if (null == nameStr) {
+                // Decode as UTF-8 explicitly instead of relying on the platform default charset.
+                nameStr = new String(name, java.nio.charset.StandardCharsets.UTF_8);
+            }
+            return nameStr;
+        }
+
+        @Override
+        public int compareTo(StrFastHashKey o) {
+            return toString().compareTo(o.toString());
+        }
+    }
+
+    private static final VarHandle LONG_VIEW = MethodHandles.byteArrayViewVarHandle(long[].class, ByteOrder.nativeOrder())
+            .withInvokeExactBehavior(); // used by calculateHash to read a long out of a byte[] at a byte offset
+    private static final VarHandle INT_VIEW = MethodHandles.byteArrayViewVarHandle(int[].class, ByteOrder.nativeOrder())
+            .withInvokeExactBehavior(); // used by calculateHash to read an int out of a byte[] at a byte offset
+
+    /**
+     * Multiplier used by calculateHash. Described by the author as a prime number that gives pretty
+     * <a href="https://vanilla-java.github.io/2018/08/15/Looking-at-randomness-and-performance-for-hash-codes.html">good hash distributions</a>
+     * on the data in this challenge.
+     */
+    private static final long RANDOM_PRIME = 0x7A646E4D;
+
+    /**
+     * Multiplicative hash of buffer[startPosition..endPosition], endPosition inclusive (no active caller in this class). Inspired by
+     * <a href="https://questdb.io/blog/building-faster-hash-table-high-performance-sql-joins/#fastmap-internals">QuestDB FastMap</a>
+     */
+    private static int calculateHash(byte[] buffer, int startPosition, int endPosition) {
+        long hash = 0;
+        // Consume eight bytes per step while a whole long still fits before endPosition.
+        int position = startPosition;
+        for (; position + Long.BYTES <= endPosition; position += Long.BYTES) {
+            long value = (long) LONG_VIEW.get(buffer, position);
+            hash = hash * RANDOM_PRIME + value;
+        }
+        // Then at most one four-byte step.
+        if (position + Integer.BYTES <= endPosition) {
+            int value = (int) INT_VIEW.get(buffer, position);
+            hash = hash * RANDOM_PRIME + value;
+            position += Integer.BYTES;
+        }
+        // Remaining bytes go in one at a time; the byte at endPosition itself is included here.
+        for (; position <= endPosition; position++) {
+            hash = hash * RANDOM_PRIME + buffer[position];
+        }
+        hash = hash * RANDOM_PRIME;
+        return (int) hash ^ (int) (hash >>> 32); // XOR of the low and high 32-bit halves
+    }
+
+    private static final int FNV1_32_INIT = 0x811c9dc5;
+    private static final int FNV1_PRIME_32 = 16777619;
+
+    /**
+     * 32-bit FNV-1a hash, adapted from
+     * https://github.com/prasanthj/hasher/blob/master/src/main/java/hasher/FNV1a.java
+     *
+     * Each byte is XORed into the hash as an unsigned value, then the hash is multiplied by the FNV prime.
+     *
+     * @param data   bytes to hash
+     * @param length number of leading bytes of {@code data} to include
+     * @return the hash of the first {@code length} bytes
+     */
+    public static int hashFNV1a(byte[] data, int length) {
+        int acc = FNV1_32_INIT;
+        int idx = 0;
+        while (idx < length) {
+            acc = (acc ^ (data[idx++] & 0xff)) * FNV1_PRIME_32;
+        }
+        return acc;
+    }
+}

From d2fef8844da3e986c1b9b6ce297fb379d74b11e5 Mon Sep 17 00:00:00 2001
From: Gunnar Morling <gunnar.morling@googlemail.com>
Date: Fri, 2 Feb 2024 21:09:01 +0100
Subject: [PATCH 268/268] Leaderboard update

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 07248a9e9..9607f1636 100644
--- a/README.md
+++ b/README.md
@@ -133,6 +133,7 @@ These are the results from running all entries into the challenge on eight cores
 |   | 00:09.020 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_yemreinci.java)| 21.0.1-open | [yemreinci](https://github.com/yemreinci) |  |
 |   | 00:09.071 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_gabrielreid.java)| 21.0.1-open | [Gabriel Reid](https://github.com/gabrielreid) |  |
 |   | 00:09.352 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_filiphr.java)| 21.0.1-graal | [Filip Hrisafov](https://github.com/filiphr) |  |
+|   | 00:09.725 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_martin2038.java)| 21.0.2-graal | [Martin](https://github.com/martin2038) | GraalVM native binary |
 |   | 00:09.867 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_ricardopieper.java)| 21.0.1-graal | [Ricardo Pieper](https://github.com/ricardopieper) |  |
 |   | 00:09.945 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_japplis.java)| 21.0.1-open | [Anthony Goubard](https://github.com/japplis) |  |
 |   | 00:10.092 | [link](https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CalculateAverage_phd3.java)| 21.0.1-graal | [Pratham](https://github.com/phd3) |  |