-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathconfig.example.yaml
More file actions
77 lines (65 loc) · 2.26 KB
/
config.example.yaml
File metadata and controls
77 lines (65 loc) · 2.26 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
# Connection settings for the source SQL Server database.
sql_server:
  host: "192.168.20.203"
  port: 1433
  database: "hrmsdb"
  schema: "dbo"
  # Placeholder credentials: replace both values in your local copy and
  # keep real secrets out of version control.
  username: "your_username"
  password: "your_password"
  # Not used - pymssql doesn't need this
  driver: "ODBC Driver 17 for SQL Server"
# Settings for the local DuckDB database file.
duckdb:
  database_path: "hrmsdb.duckdb"
  # NOTE(review): presumably applied as DuckDB's memory_limit and threads
  # settings - confirm against the loader code.
  memory_limit: "3GB"
  threads: 4
# Sync behaviour: batching, table classification, caching, Parquet storage
# and partitioning.
sync:
  batch_size: 10000
  # Only import last 30 days for activity_log tables
  activity_log_days: 30

  # Hybrid classification
  # Automatically classify tables as live or import based on size
  auto_classify: true
  # Legacy 2-tier threshold (live vs DuckDB), overridden by
  # live_threshold/parquet_threshold when parquet_enabled=true
  size_threshold: 100000
  force_live:
    - "Activity_Log"
  # Note: These tables must also be listed in the 'tables' section below
  force_import:
    - "CRMC_PayrollFile"
    - "Attendance_031820"
    - "CRMC_PayrollFile 03012021"
    - "CRMC_PayrollFile 03152021"
    - "CRMC_PayrollFile11132020"

  # Cache settings
  cache_live_tables: true  # Create cache snapshots of live tables
  cache_refresh_on_init: true  # Refresh cache snapshots on initialization

  # Parquet storage (NEW)
  parquet_enabled: true
  parquet_directory: "data/parquet"
  parquet_compression: "zstd"  # Options: snappy, zstd, gzip

  # Three-tier thresholds (UPDATED)
  live_threshold: 10000  # < 10K rows → live queries
  parquet_threshold: 100000  # > 100K rows → Parquet storage
  # Tables between thresholds → DuckDB

  # Partitioning configuration (NEW)
  partition_large_tables: true
  partition_threshold: 1000000  # Partition if > 1M rows
  auto_detect_partition_column: true
  # Optional: Manual partition column specification
  # (table name -> partition column)
  parquet_partitions:
    CRMC_PayrollFile: "pay_date"
    Attendance_031820: "attendance_date"

  # Manual storage overrides (NEW)
  force_parquet:  # Force Parquet even if < 100K rows
    - "HistoricalData"
# Tables named by this configuration, grouped by subject area.
tables:
  # Activity logs (filtered to the last 30 days)
  - "Activity_Log"

  # Attendance
  - "Attendance_031820"
  - "Attendance_031920"
  - "Attendanceddta_031920"

  # Payroll
  - "CRMC_PayrollFile 03012021"
  - "CRMC_PayrollFile 03152021"
  - "CRMC_PayrollFile"
  - "CRMC_PayrollFile11132020"

  # 1099 and 401k
  - "1099emps_050420"
  - "401kdata_031522"
  - "401kreport_071620"