-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathDockerfile
More file actions
317 lines (273 loc) · 10 KB
/
Dockerfile
File metadata and controls
317 lines (273 loc) · 10 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
# syntax=docker/dockerfile:1.6
# =============================================================================
# Build Configuration Variables
# =============================================================================
# You can override these at build time using --build-arg
#
# Example:
# docker build \
# --build-arg CUDA_VERSION=12.3.2 \
# --build-arg PYTHON_VERSION=3.11 \
# --build-arg APP_PORT=8080 \
# --build-arg CACHE_DIR=/data/.cache \
# -t trellis-box:latest .
#
# Or use docker-compose with build args in docker-compose.yml
# See docker.env.example for all available configuration options
# =============================================================================
# CUDA and System
ARG CUDA_VERSION=12.3.2
ARG CUDNN_VERSION=9
ARG UBUNTU_VERSION=22.04
# NOTE(review): several wheels installed below are tagged cp310; changing
# PYTHON_VERSION away from 3.10 will likely break those installs — confirm.
ARG PYTHON_VERSION=3.10
# Python Package Versions
ARG POETRY_VERSION=1.8.3
ARG TORCH_VERSION=2.4.0
ARG KAOLIN_VERSION=0.17.0
# Kaolin wheel index is pinned to torch 2.4.0 + CUDA 12.1; keep in sync with
# TORCH_VERSION when bumping either.
ARG KAOLIN_INDEX_URL=https://nvidia-kaolin.s3.us-east-2.amazonaws.com/torch-2.4.0_cu121.html
# CUDA Architecture List for compiling PyTorch extensions
# Specify which GPU architectures to compile for (space-separated compute capabilities)
ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0"
# Application Configuration
ARG APP_USER=appuser
ARG APP_UID=1000
ARG APP_PORT=8501
ARG STREAMLIT_SERVER_ADDRESS=0.0.0.0
ARG STREAMLIT_SERVER_HEADLESS=true
# Cache Directories (inside container)
ARG CACHE_DIR=/home/appuser/.cache
ARG HF_CACHE_DIR=/home/appuser/.cache/huggingface
ARG REMBG_CACHE_DIR=/app/rembg_cache
ARG TRELLIS_OUTPUT_DIR=/app/outputs
# =============================================================================
# Builder Stage
# =============================================================================
FROM nvidia/cuda:${CUDA_VERSION}-cudnn${CUDNN_VERSION}-devel-ubuntu${UBUNTU_VERSION} AS builder
# Prevent interactive prompts during package installation
# NOTE(review): PIP_NO_CACHE_DIR=1 disables pip's cache, which makes the
# --mount=type=cache mounts on /root/.cache/pip in later RUNs ineffective.
# Confirm whether the cache mounts or the no-cache policy is the intent.
ENV DEBIAN_FRONTEND=noninteractive \
    PYTHONUNBUFFERED=1 \
    PIP_NO_CACHE_DIR=1 \
    PIP_DISABLE_PIP_VERSION_CHECK=1
# Re-declare ARGs needed in this stage (a bare ARG re-declaration inherits the
# default declared before the first FROM)
ARG PYTHON_VERSION
ARG POETRY_VERSION
ARG KAOLIN_VERSION
ARG KAOLIN_INDEX_URL
ARG TORCH_CUDA_ARCH_LIST
# Set working directory
WORKDIR /app
# Install system dependencies and Python.
# /var/cache/apt and /var/lib/apt live on BuildKit cache mounts, so package
# lists never reach the image layer. The previous `rm -rf /var/lib/apt/lists/*`
# only wiped the shared cache on every build, defeating the mounts — removed.
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
    --mount=type=cache,target=/var/lib/apt,sharing=locked <<EOF
set -e
apt-get update
apt-get install -y --no-install-recommends \
    git \
    python${PYTHON_VERSION} \
    python${PYTHON_VERSION}-dev \
    python3-pip \
    build-essential
EOF
# Upgrade packaging tooling, then install a pinned Poetry release.
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install --upgrade pip setuptools wheel && \
    pip install poetry==${POETRY_VERSION}
# Configure Poetry: install into the system interpreter (no virtualenv) and
# allow parallel package installs.
RUN poetry config virtualenvs.create false && \
    poetry config installer.max-workers 10
# Copy dependency manifests first so the dependency-install layer is cached
# until pyproject.toml / poetry.lock change (poetry.lock* tolerates a missing
# lock file).
COPY pyproject.toml poetry.lock* ./
# Extensions are needed at build time for the source builds further below.
COPY extensions/ ./extensions/
# Install the application's main dependency group. Kept as its own layer so a
# failure in the extension installs below does not force reinstalling these.
RUN --mount=type=cache,target=/root/.cache/pip \
    --mount=type=cache,target=/root/.cache/pypoetry \
    poetry install --only main --no-interaction --no-ansi
# Install Kaolin from NVIDIA's wheel index (index is matched to the pinned
# torch/CUDA combination — see KAOLIN_INDEX_URL above).
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install --no-cache-dir --find-links ${KAOLIN_INDEX_URL} kaolin==${KAOLIN_VERSION}
# Install flash-attention, preference order:
#   1. prebuilt CUDA 12.3 / torch 2.4 / cp310 wheel (no compiler needed)
#      NOTE(review): wheel tag is cp310 — assumes PYTHON_VERSION=3.10; confirm.
#   2. whatever wheel/sdist PyPI resolves for `flash-attn`
#   3. source build straight from the upstream repository
# BUGFIX: the previous last-resort installed the package name `flash-attention`,
# which is not the project's PyPI name (`flash-attn`) and duplicated nothing
# useful; the fallback now genuinely builds from Git source as the message says.
RUN --mount=type=cache,target=/root/.cache/pip <<EOF
set -e
if pip install --no-cache-dir https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.0.post2/flash_attn-2.7.0.post2+cu123torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl; then
    echo "Flash attention CUDA 12.3 wheel installed successfully"
elif pip install --no-cache-dir flash-attn; then
    echo "Flash attention wheel installed successfully"
else
    echo "Flash attention wheel not available, attempting source build..."
    pip install --no-cache-dir "flash-attn @ git+https://github.com/Dao-AILab/flash-attention.git@v2.7.0.post2"
fi
EOF
# Install diff-gaussian-rasterization (local wheel if present, else download).
# BUGFIX: `[ -f "wheels/diff_gaussian_rasterization-*.whl" ]` quoted the glob,
# so it never expanded and the local-wheel branch was unreachable. Expand the
# glob through the positional parameters instead (POSIX-sh safe: if nothing
# matches, $1 is the literal pattern and the -f test fails cleanly).
RUN --mount=type=cache,target=/root/.cache/pip <<EOF
set -e
set -- wheels/diff_gaussian_rasterization-*.whl
if [ -f "$1" ]; then
    echo "Installing diff-gaussian-rasterization from local wheel..."
    pip install --no-cache-dir "$@"
else
    echo "Local wheel not found, downloading from HuggingFace..."
    pip install --no-cache-dir \
        "https://huggingface.co/spaces/JeffreyXiang/TRELLIS/resolve/main/wheels/diff_gaussian_rasterization-0.0.0-cp310-cp310-linux_x86_64.whl?download=true"
fi
EOF
# Build and install nvdiffrast from source (ensures CUDA compatibility).
# TORCH_CUDA_ARCH_LIST selects which compute capabilities the extension is
# compiled for (space-separated):
#   7.0 / 7.5 = Volta/Turing (V100, RTX 2080)
#   8.0 / 8.6 = Ampere (A100, RTX 3090)
#   8.9       = Ada Lovelace (RTX 4090)
#   9.0       = Hopper (H100)
# Install directly from the path instead of `cd`-ing into it (hadolint DL3003).
RUN --mount=type=cache,target=/root/.cache/pip <<EOF
set -e
TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST}" pip install --no-cache-dir ./extensions/nvdiffrast
EOF
# =============================================================================
# Runtime Stage
# =============================================================================
FROM nvidia/cuda:${CUDA_VERSION}-cudnn${CUDNN_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
# Re-declare ARGs needed in runtime stage (a bare ARG inherits the default
# declared before the first FROM).
ARG PYTHON_VERSION
ARG APP_USER
ARG APP_UID
ARG APP_PORT
ARG CACHE_DIR
ARG HF_CACHE_DIR
ARG REMBG_CACHE_DIR
ARG TRELLIS_OUTPUT_DIR
# BUGFIX: these two were never re-declared in this stage, so the ENV lines
# below always used their ':-' fallbacks and any
# --build-arg STREAMLIT_SERVER_* values were silently ignored.
ARG STREAMLIT_SERVER_ADDRESS
ARG STREAMLIT_SERVER_HEADLESS
# Environment variables baked into the runtime image.
# NOTE(review): DEBIAN_FRONTEND is a build-time-only knob; it is left in the
# runtime env here to preserve existing behavior, but could be moved inline
# into the apt RUN below.
ENV DEBIAN_FRONTEND=noninteractive \
    PYTHONUNBUFFERED=1 \
    PYTHONDONTWRITEBYTECODE=1 \
    PATH="/usr/local/bin:$PATH" \
    APP_PORT=${APP_PORT} \
    CACHE_DIR=${CACHE_DIR} \
    HF_HOME=${HF_CACHE_DIR} \
    HUGGINGFACE_HUB_CACHE=${HF_CACHE_DIR} \
    TRANSFORMERS_CACHE=${HF_CACHE_DIR} \
    U2NET_HOME=${REMBG_CACHE_DIR} \
    TRELLIS_OUTPUT_DIR=${TRELLIS_OUTPUT_DIR} \
    STREAMLIT_SERVER_ADDRESS=${STREAMLIT_SERVER_ADDRESS:-0.0.0.0} \
    STREAMLIT_SERVER_HEADLESS=${STREAMLIT_SERVER_HEADLESS:-true} \
    TRANSFORMERS_NO_ADVISORY_WARNINGS=1 \
    TOKENIZERS_PARALLELISM=false
WORKDIR /app
# Install only runtime dependencies.
# apt lists live on the cache mounts and never reach the image layer, so the
# previous `rm -rf /var/lib/apt/lists/*` only wiped the shared build cache —
# removed.
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
    --mount=type=cache,target=/var/lib/apt,sharing=locked <<EOF
set -e
apt-get update
apt-get install -y --no-install-recommends \
    python${PYTHON_VERSION} \
    python3-pip \
    git \
    libgomp1
EOF
# Copy Python packages from builder. System pip on Ubuntu installs into
# dist-packages; console scripts (streamlit, etc.) land in /usr/local/bin.
COPY --from=builder /usr/local/lib/python${PYTHON_VERSION}/dist-packages /usr/local/lib/python${PYTHON_VERSION}/dist-packages
COPY --from=builder /usr/local/bin /usr/local/bin
# Copy application code (explicit paths keep cache invalidation narrow and
# avoid pulling in untracked files from the build context)
COPY trellis/ ./trellis/
COPY extensions/ ./extensions/
COPY assets/ ./assets/
COPY app.py ./
COPY webui/ ./webui/
COPY .streamlit/ ./.streamlit/
COPY docs/ ./docs/
# Create diagnostic script.
# The outer unquoted heredoc is the build shell script; the inner quoted
# heredoc ('DIAG_EOF') is written verbatim to /app/cuda_diag.py with no shell
# expansion, so the Python source below reaches the image byte-for-byte.
RUN <<EOF
cat > /app/cuda_diag.py << 'DIAG_EOF'
#!/usr/bin/env python3
import sys
import subprocess
import os
print("=== TRELLIS CUDA Diagnostics ===\n")
# Check nvidia-smi
print("1. NVIDIA GPU Status:")
try:
    result = subprocess.run(['nvidia-smi'], capture_output=True, text=True, timeout=10)
    if result.returncode == 0:
        print("✓ nvidia-smi available")
        # Extract GPU info
        lines = result.stdout.split('\n')
        for line in lines:
            if 'NVIDIA' in line and 'GPU' in line:
                print(f" GPU: {line.strip()}")
                break
    else:
        print("✗ nvidia-smi failed")
        print(f" Error: {result.stderr}")
except Exception as e:
    print(f"✗ nvidia-smi not available: {e}")
print("\n2. PyTorch CUDA Status:")
try:
    import torch
    print(f" PyTorch version: {torch.__version__}")
    print(f" CUDA available: {torch.cuda.is_available()}")
    if torch.cuda.is_available():
        print(f" CUDA version: {torch.version.cuda}")
        print(f" GPU count: {torch.cuda.device_count()}")
        for i in range(torch.cuda.device_count()):
            try:
                gpu_name = torch.cuda.get_device_name(i)
                gpu_memory = torch.cuda.get_device_properties(i).total_memory / 1024**3
                print(f" GPU {i}: {gpu_name} ({gpu_memory:.1f} GB)")
            except Exception as e:
                print(f" GPU {i}: Error getting info - {e}")
    else:
        print(" Reason: PyTorch CUDA not compiled or GPU not accessible")
except ImportError:
    print("✗ PyTorch not available")
print("\n3. Environment Variables:")
cuda_vars = {k: v for k, v in os.environ.items() if 'CUDA' in k or 'NVIDIA' in k}
if cuda_vars:
    for k, v in cuda_vars.items():
        print(f" {k}={v}")
else:
    print(" No CUDA/NVIDIA environment variables found")
print("\n4. Container GPU Access:")
if os.path.exists('/dev/nvidia0'):
    print("✓ NVIDIA device files present")
else:
    print("✗ NVIDIA device files not found")
print("\n=== Diagnostics Complete ===")
DIAG_EOF
chmod +x /app/cuda_diag.py
EOF
# Create non-root user for security and pre-create writable data directories.
RUN <<EOF
set -e
useradd -m -u ${APP_UID} -s /bin/bash ${APP_USER}
# Create cache and output directories with proper ownership from the start
mkdir -p ${CACHE_DIR} ${HF_CACHE_DIR} ${REMBG_CACHE_DIR} ${TRELLIS_OUTPUT_DIR}
chown -R ${APP_USER}:${APP_USER} ${CACHE_DIR} ${HF_CACHE_DIR} ${REMBG_CACHE_DIR} ${TRELLIS_OUTPUT_DIR}
# World-writable so host-mounted volumes created from these paths are usable
# regardless of UID.
# NOTE(review): 777 is broad — 775 with a shared group would be tighter;
# confirm the expected volume-mount UIDs before narrowing.
chmod -R 777 ${CACHE_DIR} ${HF_CACHE_DIR} ${REMBG_CACHE_DIR} ${TRELLIS_OUTPUT_DIR}
# Set ownership of /app after creating directories
chown -R ${APP_USER}:${APP_USER} /app
EOF
# Drop root privileges for the running container.
USER ${APP_USER}
# Expose Streamlit port (documentation only — publish with -p/compose at run time)
EXPOSE ${APP_PORT}
# Add healthcheck against Streamlit's built-in /_stcore/health endpoint.
# NOTE(review): relies on `requests` being importable in the runtime image —
# presumably pulled in via the copied builder site-packages; confirm.
HEALTHCHECK --interval=30s --timeout=10s --start-period=40s --retries=3 \
    CMD python3 -c "import requests; requests.get('http://localhost:${APP_PORT}/_stcore/health')" || exit 1
# Command to run the Streamlit app.
# BUGFIX: the previous shell-form CMD left /bin/sh as PID 1, so streamlit never
# received SIGTERM from `docker stop`. Exec form + `exec` makes streamlit
# PID 1 while the shell still resolves the ${VAR:-default} expansions at
# container start.
CMD ["/bin/sh", "-c", "exec streamlit run app.py --server.port=${APP_PORT} --server.address=${STREAMLIT_SERVER_ADDRESS:-0.0.0.0} --server.headless=${STREAMLIT_SERVER_HEADLESS:-true}"]