Update Results for DataFusion 51.0.0

alamb · alamb · commit f9c26547bd02 · 2025-11-24T10:17:45.000-05:00
diff --git a/datafusion-partitioned/README.md b/datafusion-partitioned/README.md
@@ -1,6 +1,9 @@
-# DataFusion
+# Apache DataFusion
 
-DataFusion is an extensible query execution framework, written in Rust, that uses Apache Arrow as its in-memory format. For more information, please check <https://arrow.apache.org/datafusion/user-guide/introduction.html>
+[Apache DataFusion] is an extensible query execution framework, written in Rust, that uses [Apache Arrow] as its in-memory format. For more information, please check <https://datafusion.apache.org/user-guide/introduction.html>
+
+[Apache DataFusion]: https://datafusion.apache.org/
+[Apache Arrow]: https://arrow.apache.org/
 
 We use parquet file here and create an external table for it; and then execute the queries.
 
@@ -10,7 +13,7 @@ The benchmark should be completed in under an hour. On-demand pricing is $0.6 pe
 
 1. manually start a AWS EC2 instance
     - `c6a.4xlarge`
-    - Ubuntu 22.04 or later
+    - Ubuntu 24.04 or later
     - Root 500GB gp2 SSD
     - no EBS optimized
     - no instance store
@@ -20,16 +23,16 @@ The benchmark should be completed in under an hour. On-demand pricing is $0.6 pe
 1. `vi benchmark.sh` and modify following line to target Datafusion version
 
     ```bash
-    git checkout 46.0.0
+    git checkout 51.0.0
     ```
 
 1. `bash benchmark.sh`
+1. `./save-result.sh c6a.4xlarge`
 
 ### Know Issues
 
 1. importing parquet by `datafusion-cli` doesn't support schema, need to add some casting in queries.sql (e.g. converting EventTime from Int to Timestamp via `to_timestamp_seconds`)
 2. importing parquet by `datafusion-cli` make column name column name case-sensitive, i change all column name in queries.sql to double quoted literal (e.g. `EventTime` -> `"EventTime"`)
-3. `comparing binary with utf-8` and `group by binary` don't work in mac, if you run these queries in mac, you'll get some errors for queries contain binary format apache/arrow-datafusion#3050
 
 ## Generate full human readable results (for debugging)
 
diff --git a/datafusion-partitioned/benchmark.sh b/datafusion-partitioned/benchmark.sh
@@ -11,9 +11,9 @@ sudo apt-get update -y
 sudo apt-get install -y gcc
 
 echo "Install DataFusion main branch"
-git clone https://github.com/apache/arrow-datafusion.git
-cd arrow-datafusion/
-git checkout 47.0.0
+git clone https://github.com/apache/datafusion.git
+cd datafusion/
+git checkout 51.0.0
 CARGO_PROFILE_RELEASE_LTO=true RUSTFLAGS="-C codegen-units=1" cargo build --release --package datafusion-cli --bin datafusion-cli
 export PATH="`pwd`/target/release:$PATH"
 cd ..
diff --git a/datafusion-partitioned/result.csv b/datafusion-partitioned/result.csv
@@ -0,0 +1,129 @@
+1,1,0.108
+1,2,0.032
+1,3,0.031
+2,1,0.161
+2,2,0.054
+2,3,0.053
+3,1,0.307
+3,2,0.095
+3,3,0.098
+4,1,0.577
+4,2,0.112
+4,3,0.108
+5,1,1.160
+5,2,0.769
+5,3,0.757
+6,1,1.110
+6,2,0.829
+6,3,0.826
+7,1,0.112
+7,2,0.032
+7,3,0.032
+8,1,0.169
+8,2,0.056
+8,3,0.057
+9,1,1.099
+9,2,0.931
+9,3,0.914
+10,1,1.771
+10,2,1.007
+10,3,1.006
+11,1,0.667
+11,2,0.232
+11,3,0.236
+12,1,0.882
+12,2,0.257
+12,3,0.253
+13,1,1.204
+13,2,0.839
+13,3,0.833
+14,1,2.712
+14,2,1.391
+14,3,1.414
+15,1,1.228
+15,2,0.804
+15,3,0.813
+16,1,1.023
+16,2,0.870
+16,3,0.882
+17,1,2.751
+17,2,1.688
+17,3,1.681
+18,1,2.749
+18,2,1.683
+18,3,1.683
+19,1,5.618
+19,2,3.391
+19,3,3.380
+20,1,0.375
+20,2,0.103
+20,3,0.104
+21,1,10.142
+21,2,1.119
+21,3,1.114
+22,1,11.557
+22,2,1.381
+22,3,1.376
+23,1,22.326
+23,2,2.639
+23,3,2.549
+24,1,52.872
+24,2,9.353
+24,3,9.169
+25,1,0.390
+25,2,0.155
+25,3,0.165
+26,1,1.144
+26,2,0.261
+26,3,0.256
+27,1,0.380
+27,2,0.160
+27,3,0.157
+28,1,10.451
+28,2,1.511
+28,3,1.507
+29,1,9.596
+29,2,8.827
+29,3,9.053
+30,1,0.582
+30,2,0.430
+30,3,0.453
+31,1,3.205
+31,2,0.791
+31,3,0.802
+32,1,6.970
+32,2,0.976
+32,3,0.983
+33,1,5.111
+33,2,3.477
+33,3,3.508
+34,1,10.275
+34,2,3.680
+34,3,3.682
+35,1,10.314
+35,2,3.657
+35,3,3.658
+36,1,1.385
+36,2,1.231
+36,3,1.252
+37,1,0.357
+37,2,0.141
+37,3,0.134
+38,1,0.217
+38,2,0.075
+38,3,0.076
+39,1,0.341
+39,2,0.140
+39,3,0.142
+40,1,0.506
+40,2,0.208
+40,3,0.225
+41,1,0.199
+41,2,0.071
+41,3,0.075
+42,1,0.191
+42,2,0.068
+42,3,0.064
+43,1,0.178
+43,2,0.058
+43,3,0.062
diff --git a/datafusion-partitioned/results/c6a.4xlarge.json b/datafusion-partitioned/results/c6a.4xlarge.json
@@ -1,56 +1,56 @@
 {
-    "system":  "DataFusion (Parquet, partitioned)",
-    "date": "2025-07-10",
+    "system": "DataFusion (Parquet, partitioned)",
+    "date": "2025-11-24",
     "machine": "c6a.4xlarge",
     "cluster_size": 1,
-    "proprietary":  "no",
-    "tuned":  "no",
-    "tags":  ["Rust","column-oriented","embedded","stateless", "lukewarm-cold-run"],
+    "proprietary": "no",
+    "tuned": "no",
+    "tags": ["Rust","column-oriented","embedded","stateless"],
     "load_time": 0,
     "data_size": 14737666736,
     "result": [
-        [0.058, 0.017, 0.015],
-        [0.116, 0.035, 0.037],
-        [0.2, 0.084, 0.088],
-        [0.43, 0.081, 0.084],
-        [1.086, 0.78, 0.799],
-        [0.977, 0.751, 0.756],
-        [0.086, 0.026, 0.026],
-        [0.125, 0.04, 0.037],
-        [1.011, 0.882, 0.862],
-        [1.349, 0.971, 0.983],
-        [0.565, 0.231, 0.24],
-        [0.677, 0.264, 0.265],
-        [1.062, 0.816, 0.82],
-        [2.769, 1.346, 1.201],
-        [1.135, 0.792, 0.78],
-        [1.021, 0.926, 0.916],
-        [2.638, 1.639, 1.63],
-        [2.585, 1.555, 1.592],
-        [5.159, 3.238, 3.24],
-        [0.26, 0.077, 0.077],
-        [10.045, 1.067, 1.082],
-        [11.424, 1.291, 1.269],
-        [22.117, 2.487, 2.511],
-        [55.492, 9.765, 9.851],
-        [2.825, 0.432, 0.423],
-        [0.853, 0.328, 0.33],
-        [2.837, 0.508, 0.504],
-        [9.744, 1.469, 1.478],
-        [9.444, 9.445, 9.475],
-        [0.515, 0.405, 0.415],
-        [2.433, 0.729, 0.735],
-        [6.158, 0.884, 0.891],
-        [4.608, 3.342, 3.281],
-        [10.221, 3.481, 3.455],
-        [10.145, 3.486, 3.46],
-        [1.261, 1.188, 1.168],
-        [0.309, 0.114, 0.114],
-        [0.175, 0.05, 0.048],
-        [0.313, 0.099, 0.117],
-        [0.451, 0.166, 0.192],
-        [0.183, 0.04, 0.043],
-        [0.171, 0.04, 0.041],
-        [0.143, 0.035, 0.037]
-]
+        [0.110,0.032,0.032],
+        [0.159,0.054,0.053],
+        [0.268,0.097,0.098],
+        [0.609,0.111,0.111],
+        [1.170,0.789,0.777],
+        [1.147,0.834,0.823],
+        [0.109,0.031,0.031],
+        [0.173,0.056,0.055],
+        [1.117,0.942,0.916],
+        [1.778,0.997,0.994],
+        [0.663,0.232,0.240],
+        [0.864,0.258,0.258],
+        [1.209,0.835,0.854],
+        [2.715,1.370,1.386],
+        [1.223,0.834,0.831],
+        [1.054,0.882,0.876],
+        [2.757,1.699,1.707],
+        [2.737,1.670,1.688],
+        [5.613,3.370,3.410],
+        [0.377,0.104,0.102],
+        [10.116,1.111,1.140],
+        [11.557,1.408,1.365],
+        [22.315,2.650,2.627],
+        [52.820,9.173,9.215],
+        [0.340,0.158,0.150],
+        [1.177,0.254,0.264],
+        [0.390,0.163,0.151],
+        [10.337,1.480,1.508],
+        [9.570,8.813,8.964],
+        [0.585,0.454,0.446],
+        [3.202,0.778,0.776],
+        [6.962,0.959,0.994],
+        [5.083,3.497,3.509],
+        [10.231,3.706,3.661],
+        [10.270,3.653,3.645],
+        [1.411,1.223,1.278],
+        [0.349,0.134,0.137],
+        [0.210,0.071,0.075],
+        [0.335,0.139,0.133],
+        [0.487,0.210,0.211],
+        [0.202,0.067,0.067],
+        [0.187,0.063,0.065],
+        [0.182,0.059,0.063]
+    ]
 }
diff --git a/datafusion-partitioned/save-result.sh b/datafusion-partitioned/save-result.sh
@@ -0,0 +1,64 @@
+#!/bin/bash
+
+# This script converts the raw result.csv data from `benchmark.sh` into the
+# final json format used by the benchmark dashboard.
+#
+# usage : ./save-result.sh <machine>
+#
+# example (save results/c6a.4xlarge.json)
+#         ./save-result.sh c6a.4xlarge
+
+set -euo pipefail
+
+if [ $# -ne 1 ] || [ -z "$1" ]; then
+    echo "usage: $0 <machine>" >&2
+    exit 1
+fi
+
+MACHINE=$1
+INPUT_FILE="result.csv"
+OUTPUT_FILE="results/${MACHINE}.json"
+SYSTEM_NAME="DataFusion (Parquet, partitioned)"
+DATE=$(date +%Y-%m-%d)
+
+# Fail early instead of writing a JSON file with an empty result array.
+if [ ! -s "$INPUT_FILE" ]; then
+    echo "error: $INPUT_FILE is missing or empty; run benchmark.sh first" >&2
+    exit 1
+fi
+
+# Read the CSV (query number, try number, elapsed time) and build the result
+# array using awk: one "[t1,t2,t3]" row per query, in query-number order. Rows
+# are joined with ",\n" plus the indent, so the first row carries no indent of
+# its own (the heredoc below supplies it) and the last row has no trailing comma.
+RESULT_ARRAY=$(awk -F, '
+    NF >= 3 {
+        q = $1 + 0
+        if (q > max) max = q
+        times[q] = (q in times) ? (times[q] "," $3) : $3
+    }
+    END {
+        for (q = 1; q <= max; q++)
+            printf "%s[%s]", (q > 1 ? ",\n        " : ""), times[q]
+    }' "$INPUT_FILE")
+
+# The results directory may not exist yet on a fresh checkout.
+mkdir -p "$(dirname "$OUTPUT_FILE")"
+
+# form the final JSON structure from the template
+cat <<EOF > "$OUTPUT_FILE"
+{
+    "system": "$SYSTEM_NAME",
+    "date": "$DATE",
+    "machine": "$MACHINE",
+    "cluster_size": 1,
+    "proprietary": "no",
+    "tuned": "no",
+    "tags": ["Rust","column-oriented","embedded","stateless"],
+    "load_time": 0,
+    "data_size": 14737666736,
+    "result": [
+        $RESULT_ARRAY
+    ]
+}
+EOF
diff --git a/datafusion/README.md b/datafusion/README.md
@@ -1,16 +1,19 @@
 # DataFusion
 
-DataFusion is an extensible query execution framework, written in Rust, that uses Apache Arrow as its in-memory format. For more information, please check <https://arrow.apache.org/datafusion/user-guide/introduction.html>
+[Apache DataFusion] is an extensible query execution framework, written in Rust, that uses [Apache Arrow] as its in-memory format. For more information, please check <https://datafusion.apache.org/user-guide/introduction.html>
+
+[Apache DataFusion]: https://datafusion.apache.org/
+[Apache Arrow]: https://arrow.apache.org/
 
 We use parquet file here and create an external table for it; and then execute the queries.
 
-## Generate benchmark results
+## Cookbook: Generate benchmark results
 
 The benchmark should be completed in under an hour. On-demand pricing is $0.6 per hour while spot pricing is only $0.2 to $0.3 per hour (us-east-2).
 
 1. manually start a AWS EC2 instance
     - `c6a.4xlarge`
-    - Ubuntu 22.04 or later
+    - Ubuntu 24.04 or later
     - Root 500GB gp2 SSD
     - no EBS optimized
     - no instance store
@@ -20,16 +23,16 @@ The benchmark should be completed in under an hour. On-demand pricing is $0.6 pe
 1. `vi benchmark.sh` and modify following line to target Datafusion version
 
     ```bash
-    git checkout 46.0.0
+    git checkout 51.0.0
     ```
 
 1. `bash benchmark.sh`
+1. `./save-result.sh c6a.4xlarge`
 
 ### Know Issues
 
 1. importing parquet by `datafusion-cli` doesn't support schema, need to add some casting in queries.sql (e.g. converting EventTime from Int to Timestamp via `to_timestamp_seconds`)
 2. importing parquet by `datafusion-cli` make column name column name case-sensitive, i change all column name in queries.sql to double quoted literal (e.g. `EventTime` -> `"EventTime"`)
-3. `comparing binary with utf-8` and `group by binary` don't work in mac, if you run these queries in mac, you'll get some errors for queries contain binary format apache/arrow-datafusion#3050
 
 ## Generate full human readable results (for debugging)
 
diff --git a/datafusion/benchmark.sh b/datafusion/benchmark.sh
@@ -11,9 +11,9 @@ sudo apt-get update -y
 sudo apt-get install -y gcc
 
 echo "Install DataFusion main branch"
-git clone https://github.com/apache/arrow-datafusion.git
-cd arrow-datafusion/
-git checkout 47.0.0
+git clone https://github.com/apache/datafusion.git
+cd datafusion/
+git checkout 51.0.0
 CARGO_PROFILE_RELEASE_LTO=true RUSTFLAGS="-C codegen-units=1" cargo build --release --package datafusion-cli --bin datafusion-cli
 export PATH="`pwd`/target/release:$PATH"
 cd ..
diff --git a/datafusion/save-result.sh b/datafusion/save-result.sh