Skip to content

Commit e5fab9e

Browse files
authored
Add readme (#29)
1 parent 0d4cf7a commit e5fab9e

File tree

5 files changed

+200
-103
lines changed

5 files changed

+200
-103
lines changed
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
# Workflow: generate the dataset CSV size report and publish it as an artifact.
name: CSV Size Report

on:
  push:
  workflow_dispatch:

jobs:
  report:
    runs-on: ubuntu-latest
    defaults:
      run:
        shell: bash

    steps:
      - name: Check out repository
        # v4 runs on the node20 runtime; v3 is deprecated.
        # fetch-depth: 0 pulls full history — harmless here, and available
        # if the report script ever needs git metadata.
        uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Set up Python 3.10
        # v5 matches the node20 runtime of the other actions used below.
        uses: actions/setup-python@v5
        with:
          python-version: '3.10'

      - name: Install make
        run: |
          sudo apt-get update
          sudo apt-get install -y make

      # Set FASTLANES_DATA_DIR to repo root
      - name: Set FASTLANES_DATA_DIR environment variable
        run: echo "FASTLANES_DATA_DIR=${GITHUB_WORKSPACE}" >> $GITHUB_ENV

      - name: Generate CSV size report
        run: make csv_size_report

      - name: Show generated CSV
        run: |
          echo "→ csv_sizes_report.csv contents:"
          cat csv_sizes_report.csv

      - name: Upload report artifact
        uses: actions/upload-artifact@v4
        with:
          name: csv-size-report
          path: csv_sizes_report.csv

Makefile

Lines changed: 22 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,19 @@
11
# Makefile for FastLanes Data workflows
22

3-
PYTHON := python3
4-
SCRIPT := public_bi_extract_schemas.py
5-
VENV_DIR := venv
6-
ENV_SCRIPT := export_fastlanes_data_dir.sh
7-
REFORMAT := reformat_csvs.py
3+
# Force all recipes to run under Bash (so "set -o pipefail" works)
4+
SHELL := /bin/bash
85

9-
.PHONY: all env install get_public_bi_schemas reformat_csvs clean
6+
PYTHON := python3
7+
SCRIPT := public_bi_extract_schemas.py
8+
VENV_DIR := venv
9+
ENV_SCRIPT := export_fastlanes_data_dir.sh
10+
REFORMAT := reformat_csvs.py
11+
CSV_SIZE_REPORT := csv_size_report.py
1012

11-
# Default: load env, create venv, and run schema extraction
12-
all: env install get_public_bi_schemas
13+
.PHONY: all env install get_public_bi_schemas reformat_csvs csv_size_report clean
1314

14-
# Load FASTLANES_DATA_DIR and other env vars
15-
env:
16-
@echo "Loading environment variables..."
17-
. $(ENV_SCRIPT)
15+
# Default: load env, create venv, and run schema extraction
16+
all: install get_public_bi_schemas
1817

1918
# Set up (if needed) and install into virtual environment
2019
install:
@@ -32,17 +31,25 @@ install:
3231
fi
3332

3433
# Run the BI schema extraction script
35-
get_public_bi_schemas: install env
34+
get_public_bi_schemas: install
3635
@echo "Extracting public BI schemas..."
3736
cd scripts && . ../$(VENV_DIR)/bin/activate && $(PYTHON) $(SCRIPT)
3837

3938
# Re-format all CSV files under NextiaJD
40-
reformat_csvs: install env
39+
reformat_csvs: install
4140
@echo "Re-formatting all CSV files under NextiaJD..."
4241
. $(VENV_DIR)/bin/activate && \
4342
$(PYTHON) scripts/$(REFORMAT) $(FASTLANES_DATA_DIR)/NextiaJD
4443

44+
# --------------------------------------------------------------------
45+
# New target: run the CSV-size report script and save to csv_sizes_report.csv
46+
csv_size_report: install
47+
@echo "Generating CSV size report..."
48+
. $(VENV_DIR)/bin/activate && \
49+
$(PYTHON) scripts/$(CSV_SIZE_REPORT) > csv_sizes_report.csv
50+
@echo "→ csv_sizes_report.csv created."
51+
4552
# Clean up generated files and virtual environment
4653
clean:
4754
@echo "Cleaning up..."
48-
rm -rf $(VENV_DIR) public_bi_benchmark ../public_bi/tables
55+
rm -rf $(VENV_DIR) public_bi_benchmark ../public_bi/tables csv_sizes_report.csv

README.md

Lines changed: 35 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,35 @@
1-
# Sample data
2-
---
3-
# Datasets:
4-
- NextiaJD
5-
- public_bi
6-
- ssb
7-
- tpch
8-
- ziya
9-
- clickbench
1+
# FastLanes\_Data
2+
3+
FastLanes\_Data is a collection of benchmark datasets and utility scripts designed to streamline performance testing and
4+
analysis of the [FastLanes](https://github.com/cwida/FastLanes) file format. It brings together real-world and synthetic
5+
datasets to test FastLanes in all cases.
6+
7+
## Datasets
8+
9+
* **NextiaJD**:
10+
* **public\_bi**: Sample of [public BI dataset](https://github.com/cwida/public_bi_benchmark).
11+
* **issues/cwida/alp/37**: Data extracted for issue #37 in the [ALP](https://github.com/cwida/ALP) project.
12+
13+
## Requirements
14+
15+
* **Python**: Version 3.8 or higher
16+
* **Bash**: For shell-based export scripts
17+
* Optional: Any additional dependencies listed in `requirements.txt` (if present)
18+
19+
**Export the data directory** using the provided script:
20+
21+
```bash
22+
source scripts/export_fastlanes_data_dir.sh /path/to/your/data
23+
```
24+
25+
This sets the `$FASTLANES_DATA_DIR` environment variable to the root path of your datasets.
26+
27+
## License
28+
29+
This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details.
30+
31+
## Contact
32+
33+
Come talk to us if you like FastLanes or Data!
34+
35+
[![Discord](https://img.shields.io/badge/Discord-join%20chat-5865F2?style=flat-square&logo=discord)](https://discord.gg/gwx87YYn)

export_fastlanes_data_dir.sh — NOTE(review): this commit replaces the shell script's entire contents with a verbatim copy of the Makefile (it begins "# Makefile for FastLanes Data workflows" and defines make targets), which would fail when sourced by the Makefile `env` target or per the README instructions; almost certainly unintended — confirm and restore a real shell script.

100644100755
Lines changed: 60 additions & 79 deletions
Original file line numberDiff line numberDiff line change
@@ -1,79 +1,60 @@
1-
#!/usr/bin/env bash
2-
# install_fastlanes_env.sh
3-
# ─────────────────────────────────────────────────────────────
4-
# Sets FASTLANES_DATA_DIR for:
5-
# • this shell
6-
# • every future shell (bash/zsh)
7-
# • all GUI apps (via LaunchAgent) – e.g. CLion
8-
# ─────────────────────────────────────────────────────────────
9-
10-
set -euo pipefail
11-
12-
# 0) Hard-coded canonical path to your FastLanes_Data clone
13-
DATA_DIR="$HOME/CLionProjects/FastLanes_Data"
14-
if [[ ! -d "$DATA_DIR" ]]; then
15-
echo "❌ Expected FastLanes_Data at: $DATA_DIR" >&2
16-
echo " Clone or move the repo there, then rerun this installer." >&2
17-
exit 1
18-
fi
19-
20-
# 1) Export for **this** shell so you can build immediately
21-
export FASTLANES_DATA_DIR="$DATA_DIR"
22-
echo "FASTLANES_DATA_DIR set for current shell → $FASTLANES_DATA_DIR"
23-
24-
# 2) Install a silent helper into ~/.local/bin and source it from rc files
25-
HELPER_DIR="$HOME/.local/bin"
26-
HELPER_PATH="$HELPER_DIR/export_fastlanes_data_dir.sh"
27-
mkdir -p "$HELPER_DIR"
28-
29-
cat >"$HELPER_PATH" <<'EOSH'
30-
#!/usr/bin/env bash
31-
# silent helper – just export the var if not already
32-
FASTLANES_DATA_DIR_DEFAULT="$HOME/CLionProjects/FastLanes_Data"
33-
export FASTLANES_DATA_DIR="${FASTLANES_DATA_DIR:-$FASTLANES_DATA_DIR_DEFAULT}"
34-
EOSH
35-
chmod +x "$HELPER_PATH"
36-
37-
add_source_line() {
38-
local rcfile="$1"
39-
local marker="# >>> FastLanes_Data export >>>"
40-
local line="source \"$HELPER_PATH\" >/dev/null 2>&1"
41-
if ! grep -Fq "$marker" "$rcfile" 2>/dev/null; then
42-
printf "\n%s\n%s\n# <<< FastLanes_Data export <<<\n" "$marker" "$line" >>"$rcfile"
43-
echo "✓ Added helper source to $rcfile"
44-
fi
45-
}
46-
47-
# Pick the correct rc file
48-
case "${SHELL##*/}" in
49-
zsh) RC_FILE="$HOME/.zshrc" ;;
50-
bash) RC_FILE="${HOME}/.bash_profile"; [[ -f "$HOME/.bashrc" ]] && RC_FILE="$HOME/.bashrc" ;;
51-
*) RC_FILE="$HOME/.profile" ;;
52-
esac
53-
add_source_line "$RC_FILE"
54-
55-
# 3) Create a LaunchAgent so GUI apps inherit the var at login
56-
PLIST="$HOME/Library/LaunchAgents/com.fastlanes.setenv.plist"
57-
cat >"$PLIST" <<EOF
58-
<?xml version="1.0" encoding="UTF-8"?>
59-
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
60-
<plist version="1.0">
61-
<dict>
62-
<key>Label</key> <string>com.fastlanes.setenv</string>
63-
<key>ProgramArguments</key> <array>
64-
<string>launchctl</string>
65-
<string>setenv</string>
66-
<string>FASTLANES_DATA_DIR</string>
67-
<string>$DATA_DIR</string>
68-
</array>
69-
<key>RunAtLoad</key> <true/>
70-
</dict>
71-
</plist>
72-
EOF
73-
74-
# Load (or reload) the agent right away
75-
launchctl unload "$PLIST" 2>/dev/null || true
76-
launchctl load "$PLIST"
77-
echo "✓ LaunchAgent installed & loaded (GUI apps will now see FASTLANES_DATA_DIR)"
78-
79-
echo -e "\n✅ Done. Log out and back in once (or reboot) so CLion started from the Dock inherits the variable."
#!/usr/bin/env bash
# export_fastlanes_data_dir.sh
# ─────────────────────────────────────────────────────────────
# Export FASTLANES_DATA_DIR for the current shell.
#
# Usage:
#   source export_fastlanes_data_dir.sh [DATA_DIR]
#
# This file must contain shell code: the Makefile `env` target sources it
# (". $(CURDIR)/$(ENV_SCRIPT)") and the README documents
# "source scripts/export_fastlanes_data_dir.sh /path/to/your/data".
# The commit as reviewed had replaced this file's contents with a copy of
# the Makefile, which breaks both callers; this restores a minimal,
# correct implementation of the documented contract.
#
# DATA_DIR resolution order:
#   1. explicit first argument,
#   2. an already-set FASTLANES_DATA_DIR,
#   3. the directory containing this script (assumed to be the
#      FastLanes_Data checkout root — TODO confirm, the README also
#      shows the script under scripts/).
# ─────────────────────────────────────────────────────────────

# ${BASH_SOURCE[0]} is correct when the file is sourced; $0 is the
# fallback when it is executed directly.
_fl_default_dir="$(cd "$(dirname "${BASH_SOURCE[0]:-$0}")" && pwd)"

DATA_DIR="${1:-${FASTLANES_DATA_DIR:-$_fl_default_dir}}"

if [[ ! -d "$DATA_DIR" ]]; then
    echo "export_fastlanes_data_dir.sh: '$DATA_DIR' is not a directory" >&2
    # `return` works when sourced; fall back to `exit` when executed.
    return 1 2>/dev/null || exit 1
fi

export FASTLANES_DATA_DIR="$DATA_DIR"
echo "FASTLANES_DATA_DIR set → $FASTLANES_DATA_DIR"

unset _fl_default_dir

scripts/csv_size_report.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
#!/usr/bin/env python3
"""Report the on-disk size of every .csv file under NextiaJD/tables.

Output (stdout) is itself CSV with header ``table_name,version,file_size``;
diagnostics go to stderr so stdout can be redirected to a file
(see the ``csv_size_report`` Makefile target, which captures stdout
into ``csv_sizes_report.csv``).
"""
import os
import sys

# Resolve NextiaJD/tables relative to this script's location,
# then make it absolute so os.walk can find it regardless of the
# caller's working directory.
SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
ROOT_DIR = os.path.abspath(os.path.join(SCRIPT_DIR, "..", "NextiaJD", "tables"))


def report_csv_sizes(root_dir):
    """
    Walk through root_dir, find all .csv files, and print their
    table_name, version, and file_size in CSV format.

    A file whose size cannot be read (e.g. it vanished mid-walk) is
    reported on stderr and skipped rather than aborting the report.
    """
    print("table_name,version,file_size")
    for dirpath, _dirnames, filenames in os.walk(root_dir):
        for fn in filenames:
            # Case-insensitive match so .CSV files are included too.
            if fn.lower().endswith(".csv"):
                full_path = os.path.join(dirpath, fn)
                try:
                    size = os.path.getsize(full_path)
                except OSError as e:
                    print(f"# could not get size for {full_path}: {e}", file=sys.stderr)
                    continue

                table_name = os.path.splitext(fn)[0]
                version = "csv"
                print(f"{table_name},{version},{size}")


if __name__ == "__main__":
    # Sanity-check the data directory only when run as a script, so that
    # importing this module (e.g. from a test) never calls sys.exit().
    # (Previously this check ran at import time.)
    if not os.path.isdir(ROOT_DIR):
        sys.stderr.write(f"Error: ROOT_DIR '{ROOT_DIR}' does not exist or is not a directory\n")
        sys.exit(1)
    print(f"# Scanning: {ROOT_DIR}", file=sys.stderr)
    report_csv_sizes(ROOT_DIR)

0 commit comments

Comments
 (0)