binpash
diff --git a/‎.gitignore‎
Lines changed: 5 additions & 1 deletion b/‎.gitignore‎
Lines changed: 5 additions & 1 deletion
diff --git a/‎testing/s3_transfer_benchmarks/README.md‎
Lines changed: 43 additions & 0 deletions b/‎testing/s3_transfer_benchmarks/README.md‎
Lines changed: 43 additions & 0 deletions
diff --git a/‎testing/s3_transfer_benchmarks/analysis/plot.py‎
Lines changed: 178 additions & 0 deletions b/‎testing/s3_transfer_benchmarks/analysis/plot.py‎
Lines changed: 178 additions & 0 deletions
diff --git a/‎testing/s3_transfer_benchmarks/analysis/s3_performance_comparison.png‎
172 KB b/‎testing/s3_transfer_benchmarks/analysis/s3_performance_comparison.png‎
172 KB
diff --git a/‎testing/s3_transfer_benchmarks/analysis/s3_performance_complete.png‎
352 KB b/‎testing/s3_transfer_benchmarks/analysis/s3_performance_complete.png‎
352 KB
diff --git a/‎testing/s3_transfer_benchmarks/ec2_s3_benchmark.py‎
Lines changed: 76 additions & 0 deletions b/‎testing/s3_transfer_benchmarks/ec2_s3_benchmark.py‎
Lines changed: 76 additions & 0 deletions
diff --git a/‎testing/s3_transfer_benchmarks/lambda_workers/deploy_lambda_sort.sh‎
Lines changed: 66 additions & 0 deletions b/‎testing/s3_transfer_benchmarks/lambda_workers/deploy_lambda_sort.sh‎
Lines changed: 66 additions & 0 deletions
@@ -72,6 +72,10 @@ evaluation/benchmarks/**/pash_graphviz_*/
 
 # Local experimental testing artifacts
 test_pretty_print.py
-testing/
+testing/*
+!testing/s3_transfer_benchmarks/
+!testing/s3_transfer_benchmarks/**
+testing/legacy/
 testing2/
 sort_inout_splits/
+pipeline_io/
@@ -0,0 +1,43 @@
+# S3 Transfer Benchmarks
+
+This folder contains the organized benchmark assets used to compare data transfer paths for serverless PaSh testing.
+
+## What is measured
+
+1. `S3 -> EC2` full object download timing
+2. `S3 -> Lambda` full object download timing
+3. `S3 -> Lambda` byte-range download timing
+4. `S3 -> EC2 -> Lambda` streaming timing
+
+## Layout
+
+- `ec2_s3_benchmark.py`: direct S3-to-EC2 pull benchmark script
+- `orchestrators/`: manual orchestrators used to invoke Lambda sort workers
+- `lambda_workers/`: Lambda worker handlers and deployment scripts
+- `analysis/plot.py`: plotting script for benchmark results
+- `analysis/*.png`: generated comparison figures
+
+## Quick usage
+
+From this directory:
+
+```bash
+# Deploy worker lambdas
+./lambda_workers/deploy_lambda_sort.sh
+./lambda_workers/deploy_lambda_sort_byte_ranges.sh
+
+# Run byte-range orchestrator
+python3 orchestrators/manual_s3_orchestrator_byte_ranges.py \
+  --bucket "$AWS_BUCKET" \
+  --input oneliners/inputs/1G.txt \
+  --output oneliners/outputs/byte-range-result.txt \
+  --workers 2
+
+# Plot current benchmark summary
+python3 analysis/plot.py
+```
+
+## Notes
+
+- Legacy/ad-hoc experimental artifacts were moved under `testing/legacy/` and are intentionally not committed.
+- Deployment scripts package Lambda zips locally in `lambda_workers/`.
@@ -0,0 +1,178 @@
+from pathlib import Path
+
+import matplotlib.pyplot as plt
+import numpy as np
+
+# Complete data
+sizes_mib = [100, 500, 1024]
+sizes_labels = ['100 MiB', '500 MiB', '1 GiB']
+
+# S3 to Lambda (full files)
+lambda_full_avg = [1.14, 6.70, 13.55]
+lambda_full_min = [1.07, 5.28, 13.19]
+lambda_full_max = [1.38, 7.08, 14.13]
+lambda_full_p90 = [1.34, 7.06, 14.06]
+
+
+# S3 to EC2 (full files)
+ec2_avg = [1.14, 5.34, 10.85]
+ec2_min = [1.14, 5.33, 10.58]
+ec2_max = [1.15, 5.36, 11.88]
+ec2_p90 = [1.15, 5.35, 11.68]
+
+# S3 to Lambda (byte range)
+lambda_range_avg = [1.13, 6.59, 13.35]
+lambda_range_min = [1.07, 5.26, 11.95]
+lambda_range_max = [1.38, 7.02, 14.11]
+lambda_range_p90 = [1.34, 6.99, 13.47]
+
+# S3 to EC2 to Lambda (streaming, /dev/null)
+streaming_avg = [4.027, 7.840, 14.650]
+streaming_min = [3.957, 7.427, 14.542]
+streaming_max = [4.079, 7.999, 14.726]
+streaming_p90 = [4.079, 7.999, 14.726]  # Using max as approximation
+
+# Calculate error bars (distance from average to min/max)
+ec2_err_lower = [ec2_avg[i] - ec2_min[i] for i in range(3)]
+ec2_err_upper = [ec2_max[i] - ec2_avg[i] for i in range(3)]
+lambda_full_err_lower = [lambda_full_avg[i] - lambda_full_min[i] for i in range(3)]
+lambda_full_err_upper = [lambda_full_max[i] - lambda_full_avg[i] for i in range(3)]
+lambda_range_err_lower = [lambda_range_avg[i] - lambda_range_min[i] for i in range(3)]
+lambda_range_err_upper = [lambda_range_max[i] - lambda_range_avg[i] for i in range(3)]
+streaming_err_lower = [streaming_avg[i] - streaming_min[i] for i in range(3)]
+streaming_err_upper = [streaming_max[i] - streaming_avg[i] for i in range(3)]
+
+# Create figure with 2 subplots
+fig = plt.figure(figsize=(14, 6))
+
+# Plot 1: Average times comparison with error bars
+ax1 = plt.subplot(1, 2, 1)
+x = np.arange(len(sizes_labels))
+width = 0.2  # Narrower bars to fit 4 series
+
+bars1 = ax1.bar(x - 1.5*width, ec2_avg, width, label='S3→EC2 (download)',
+                color='#2ecc71', alpha=0.8,
+                yerr=[ec2_err_lower, ec2_err_upper], capsize=4, error_kw={'linewidth': 1.5})
+bars2 = ax1.bar(x - 0.5*width, lambda_full_avg, width, label='S3→Lambda (direct)',
+                color='#e74c3c', alpha=0.8,
+                yerr=[lambda_full_err_lower, lambda_full_err_upper], capsize=4, error_kw={'linewidth': 1.5})
+bars3 = ax1.bar(x + 0.5*width, lambda_range_avg, width,
+                label='S3→Lambda (byte range)', color='#3498db', alpha=0.8,
+                yerr=[lambda_range_err_lower, lambda_range_err_upper], capsize=4, error_kw={'linewidth': 1.5})
+bars4 = ax1.bar(x + 1.5*width, streaming_avg, width,
+                label='S3→EC2→Lambda (stream)', color='#9b59b6', alpha=0.8,
+                yerr=[streaming_err_lower, streaming_err_upper], capsize=4, error_kw={'linewidth': 1.5})
+
+ax1.set_xlabel('File Size', fontsize=12, fontweight='bold')
+ax1.set_ylabel('Average Time (seconds)', fontsize=12, fontweight='bold')
+ax1.set_title('S3 Download Performance Comparison', fontsize=14, fontweight='bold')
+ax1.set_xticks(x)
+ax1.set_xticklabels(sizes_labels)
+ax1.legend(fontsize=9)
+ax1.grid(axis='y', alpha=0.3)
+
+# Add percentage labels showing streaming overhead vs direct Lambda
+for i in range(len(sizes_labels)):
+    if streaming_avg[i] and lambda_full_avg[i]:
+        overhead = ((streaming_avg[i] - lambda_full_avg[i]) / lambda_full_avg[i]) * 100
+        if abs(overhead) > 1:  # Only show if meaningful difference
+            ax1.text(i + 1.5*width, streaming_avg[i] + 0.5, f'+{overhead:.0f}%',
+                    ha='center', fontsize=8, fontweight='bold', color='#8e44ad')
+
+# Plot 2: Throughput (MB/s)
+ax2 = plt.subplot(1, 2, 2)
+
+throughput_ec2 = [sizes_mib[i] / ec2_avg[i] for i in range(3)]
+throughput_lambda_full = [sizes_mib[i] / lambda_full_avg[i] for i in range(3)]
+throughput_lambda_range = [sizes_mib[i] / lambda_range_avg[i] for i in range(3)]
+throughput_streaming = [sizes_mib[i] / streaming_avg[i] for i in range(3)]
+
+ax2.plot(sizes_labels, throughput_ec2, 'o-', color='#2ecc71',
+         linewidth=2, markersize=10, label='S3→EC2')
+ax2.plot(sizes_labels, throughput_lambda_full, 's-', color='#e74c3c',
+         linewidth=2, markersize=10, label='S3→Lambda (direct)')
+ax2.plot(sizes_labels, throughput_lambda_range, '^-', color='#3498db',
+         linewidth=2, markersize=10, label='S3→Lambda (range)')
+ax2.plot(sizes_labels, throughput_streaming, 'd-', color='#9b59b6',
+         linewidth=2, markersize=10, label='S3→EC2→Lambda (stream)')
+
+ax2.set_xlabel('File Size', fontsize=12, fontweight='bold')
+ax2.set_ylabel('Throughput (MiB/s)', fontsize=12, fontweight='bold')
+ax2.set_title('Network Throughput by File Size', fontsize=14, fontweight='bold')
+ax2.legend(fontsize=9)
+ax2.grid(alpha=0.3)
+
+# Add throughput values
+for i, size in enumerate(sizes_labels):
+    ax2.text(i, throughput_ec2[i] + 2, f'{throughput_ec2[i]:.0f}', 
+            ha='center', fontsize=8, color='#27ae60')
+    ax2.text(i, throughput_lambda_full[i] - 3, f'{throughput_lambda_full[i]:.0f}',
+            ha='center', fontsize=8, color='#c0392b')
+
+# Add note about sample size and warm-up
+fig.text(0.5, 0.02, 'N=9 runs per configuration (excluding 1 cold start run for Lambda - all measurements use warmed-up Lambdas)',
+         ha='center', fontsize=9, style='italic', color='#555555')
+
+plt.tight_layout(rect=[0, 0.03, 1, 1])  # Make room for the note at bottom
+output_path = Path(__file__).resolve().parent / "s3_performance_complete.png"
+plt.savefig(output_path, dpi=300, bbox_inches='tight')
+print(f"Saved: {output_path}")
+
+# Print comprehensive summary
+print("\n" + "="*60)
+print("📊 S3 DOWNLOAD PERFORMANCE ANALYSIS")
+print("="*60)
+
+print("\n🔍 KEY FINDINGS:\n")
+
+print("1. EC2 Advantage Grows with File Size:")
+for i, size in enumerate(sizes_labels):
+    speedup = ((lambda_full_avg[i] - ec2_avg[i]) / lambda_full_avg[i]) * 100
+    print(f"   {size:>8}: {speedup:>5.1f}% faster")
+
+print("\n2. Throughput Analysis:")
+for i, size in enumerate(sizes_labels):
+    print(f"   {size:>8}: EC2={throughput_ec2[i]:>6.1f} MiB/s  |  Lambda={throughput_lambda_full[i]:>6.1f} MiB/s")
+
+print("\n3. Consistency (Variance):")
+variance_ec2 = [ec2_max[i] - ec2_min[i] for i in range(3)]
+variance_lambda_full = [lambda_full_max[i] - lambda_full_min[i] for i in range(3)]
+variance_lambda_range = [lambda_range_max[i] - lambda_range_min[i] for i in range(3)]
+for i, size in enumerate(sizes_labels):
+    print(f"   {size:>8}: EC2={variance_ec2[i]:>5.2f}s  |  Lambda={variance_lambda_full[i]:>5.2f}s  ({variance_lambda_full[i]/variance_ec2[i]:.1f}x more variable)")
+
+print("\n4. Byte Range Overhead:")
+for i, size in enumerate(sizes_labels):
+    overhead = ((lambda_range_avg[i] - lambda_full_avg[i]) / lambda_full_avg[i]) * 100
+    print(f"   {size:>8}: {overhead:>+5.1f}% (essentially zero!)")
+
+print("\n5. Streaming S3→EC2→Lambda Overhead (vs Direct S3→Lambda):")
+variance_streaming = [streaming_max[i] - streaming_min[i] for i in range(3)]
+for i, size in enumerate(sizes_labels):
+    overhead = ((streaming_avg[i] - lambda_full_avg[i]) / lambda_full_avg[i]) * 100
+    print(f"   {size:>8}: {overhead:>+5.1f}% slower (streaming overhead)")
+
+print("\n✅ CONCLUSIONS:")
+print("   • EC2 is 0-25% faster than Lambda for downloads (scales with file size)")
+print("   • EC2 has 8-30x lower variance (much more consistent)")
+print("   • Byte ranges have ZERO performance penalty")
+print("   • Streaming S3→EC2→Lambda is SLOWER than direct S3→Lambda:")
+print("     - 100 MiB: 3.5x slower (extra hop overhead dominates)")
+print("     - 500 MiB: 1.2x slower")
+print("     -   1 GiB: 1.1x slower")
+print("   • Direct S3→Lambda is always faster - streaming adds latency without benefit")
+print("="*60)
+
+# Create a detailed table
+print("\n📋 DETAILED TIMING TABLE:")
+print("-"*95)
+print(f"{'Size':<10} {'Method':<25} {'Avg':<8} {'Min':<8} {'Max':<8} {'P90':<8} {'Variance':<10} {'Throughput':<12}")
+print("-"*95)
+for i, size in enumerate(sizes_labels):
+    print(f"{size:<10} {'EC2 (download)':<25} {ec2_avg[i]:<8.2f} {ec2_min[i]:<8.2f} {ec2_max[i]:<8.2f} {ec2_p90[i]:<8.2f} {variance_ec2[i]:<10.2f} {throughput_ec2[i]:<12.1f}")
+    print(f"{'':<10} {'Lambda (direct)':<25} {lambda_full_avg[i]:<8.2f} {lambda_full_min[i]:<8.2f} {lambda_full_max[i]:<8.2f} {lambda_full_p90[i]:<8.2f} {variance_lambda_full[i]:<10.2f} {throughput_lambda_full[i]:<12.1f}")
+    print(f"{'':<10} {'Lambda (byte range)':<25} {lambda_range_avg[i]:<8.2f} {lambda_range_min[i]:<8.2f} {lambda_range_max[i]:<8.2f} {lambda_range_p90[i]:<8.2f} {variance_lambda_range[i]:<10.2f} {throughput_lambda_range[i]:<12.1f}")
+    print(f"{'':<10} {'S3→EC2→Lambda (stream)':<25} {streaming_avg[i]:<8.2f} {streaming_min[i]:<8.2f} {streaming_max[i]:<8.2f} {streaming_p90[i]:<8.2f} {variance_streaming[i]:<10.2f} {throughput_streaming[i]:<12.1f}")
+    if i < len(sizes_labels) - 1:
+        print("-"*95)
+print("-"*95)
@@ -0,0 +1,76 @@
+#!/usr/bin/env python3
+"""
+EC2 script to benchmark S3 download performance
+Measures time to pull 1G or 500M file from S3
+"""
+import boto3
+import botocore.config
+import time
+import uuid
+
+BUCKET = "inout741448956691"
+
+KEYS = [
+    "unix50/inputs/1_20G.txt",
+    "oneliners/inputs/1G.txt",
+    "oneliners/inputs/500M.txt",
+    "oneliners/inputs/100M.txt",
+]
+
+KEY = KEYS[0]  # Change this to select different file sizes
+
+def run_benchmark(event=None, context=None):
+    runtimes = []
+
+    for i in range(10):
+        # Create a fresh S3 client each iteration to avoid connection reuse
+        config = botocore.config.Config(
+            max_pool_connections=1,
+            retries={'max_attempts': 0}
+        )
+        s3 = boto3.client("s3", config=config)
+
+        # Add a random query param to avoid any caching
+        unique_key = f"{KEY}?nocache={uuid.uuid4().hex}"
+
+        t0 = time.time()
+        res = s3.get_object(Bucket=BUCKET, Key=KEY)
+        for _ in res['Body'].iter_chunks(chunk_size=1024*1024):
+            pass
+
+        t1 = time.time()
+
+        #if i == 0: # first run is cold start garbage
+        #    continue
+
+        dt = t1 - t0
+        runtimes.append(dt)
+        print(f"Run {i+1}: {dt:.2f}s")#, size pulled: {len(data)/1e6:.1f} MB")
+
+        #del data  # free memory
+
+    # Compute statistics
+    avg_ = sum(runtimes) / len(runtimes)
+    mn = min(runtimes)
+    mx = max(runtimes)
+    sorted_r = sorted(runtimes)
+    p90_index = max(0, int(len(sorted_r) * 0.9) - 1)
+    p90 = sorted_r[p90_index]
+
+    print(f"Average: {avg_:.2f}s")
+    print(f"Min: {mn:.2f}s")
+    print(f"Max: {mx:.2f}s")
+    print(f"P90: {p90:.2f}s")
+
+    return {
+        "avg": avg_,
+        "min": mn,
+        "max": mx,
+        "p90": p90,
+        "runs": runtimes
+    }
+
+if __name__ == "__main__":
+    # Run benchmark on EC2
+    result = run_benchmark()
+    print("\nFinal results:", result)
@@ -0,0 +1,66 @@
+#!/usr/bin/env bash
+# Deploy lambda-sort worker function
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+echo "=========================================="
+echo "Deploying Lambda Sort Worker"
+echo "=========================================="
+
+# Check environment variables
+if [ -z "$AWS_ACCOUNT_ID" ]; then
+    echo "Error: AWS_ACCOUNT_ID environment variable not set"
+    exit 1
+fi
+
+# Package the Lambda function
+echo "[Step 1] Packaging Lambda function..."
+cd "$SCRIPT_DIR"
+zip -q lambda_sort_worker.zip lambda_sort_worker.py
+echo "  ✓ Created lambda_sort_worker.zip"
+
+# Check if function exists
+FUNCTION_EXISTS=$(aws lambda get-function --function-name lambda-sort 2>&1 | grep -c "ResourceNotFoundException" || true)
+
+if [ "$FUNCTION_EXISTS" -eq 1 ]; then
+    # Create new function
+    echo "[Step 2] Creating new Lambda function 'lambda-sort'..."
+
+    aws lambda create-function \
+        --function-name lambda-sort \
+        --runtime python3.9 \
+        --role arn:aws:iam::${AWS_ACCOUNT_ID}:role/pash-release-us-east-1-lambdaRole \
+        --handler lambda_sort_worker.lambda_handler \
+        --zip-file fileb://lambda_sort_worker.zip \
+        --timeout 300 \
+        --memory-size 3008 \
+        --ephemeral-storage Size=2048 \
+        --region us-east-1
+
+    echo "  ✓ Lambda function created"
+else
+    # Update existing function
+    echo "[Step 2] Updating existing Lambda function 'lambda-sort'..."
+
+    aws lambda update-function-code \
+        --function-name lambda-sort \
+        --zip-file fileb://lambda_sort_worker.zip \
+        --region us-east-1
+
+    echo "  ✓ Lambda function updated"
+fi
+
+echo
+echo "=========================================="
+echo "✓ Deployment complete!"
+echo "Function name: lambda-sort"
+echo "=========================================="
+echo
+echo "You can now test with:"
+echo "  python3 ../orchestrators/manual_s3_orchestrator.py \\"
+echo "    --bucket \$AWS_BUCKET \\"
+echo "    --input oneliners/inputs/1M.txt \\"
+echo "    --output oneliners/outputs/manual-sort-result.txt \\"
+echo "    --workers 2"