Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 9 additions & 3 deletions snpcheck/id_snpcaller_vcfmerge.sh
Original file line number Diff line number Diff line change
Expand Up @@ -192,20 +192,25 @@ tumor_col_awk=$((tumor_col + 1))
na_col_awk=$((na_col + 1))
ref_col_awk=$((ref_col + 1))

## Step 1: Copy snp genotype (na) col to tumor if tumor col no variant call
awk -v tum="$tumor_col_awk" -v na="$na_col_awk" 'BEGIN { OFS="\t" }
## Step 1: Copy snp genotype (na) col to tumor if no variant call in tumor col
## + replace remaining ./.'s with default 0/1 (for germline variants)
awk -v tum="$tumor_col_awk" -v na="$na_col_awk" -v ref="$ref_col_awk" 'BEGIN { OFS="\t" }
/^##/ { print; next }
/^#CHROM/ { print; next }
{
if ($tum == "./." && $na != "./." && $na != "") {
$tum = $na
}
else if (index($tum, "./.") > 0) {
gsub(/\.\/\./, "0/1", $tum)
}

print
}
' "$HOME/${MERGED_OUTPUT_VCF}" > "$HOME/temp_with_na_corrected.vcf"

## Step 2: Remove ref and NA cols
awk -v ref="$ref_col_awk" -v na="$na_col_awk" 'BEGIN { OFS="\t" }
awk -v ref="$ref_col_awk" -v na="$na_col_awk" -v filter="$filter_col_awk" 'BEGIN { OFS="\t" }
/^##/ { print; next }
/^#CHROM/ {
for (i = 1; i <= NF; i++) {
Expand All @@ -216,6 +221,7 @@ awk -v ref="$ref_col_awk" -v na="$na_col_awk" 'BEGIN { OFS="\t" }
next
}
{
$filter="PASS"
for (i = 1; i <= NF; i++) {
if (i != ref && i != na) {
printf "%s%s", $i, (i == NF || (i+1 == ref || i+1 == na) ? ORS : OFS)
Expand Down
145 changes: 145 additions & 0 deletions snpcheck/id_snpcaller_vcfmerge_GATK.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
#!/bin/bash
#
# Merge tumor ID-SNP genotypes with reported somatic/germline variants.
# Stage 2 of the snpcheck flow: downloads the inputs staged in GCS by
# id_snpcaller_vcfmerge_prep.sh, re-genotypes the ID SNPs on the tumor BAM
# with GATK3, merges everything into one VCF and uploads the result.
#
# Usage: id_snpcaller_vcfmerge_GATK.sh <setname>

set -euo pipefail

LOCAL_INPUT_DIR="$HOME/input_files"
mkdir -p "$LOCAL_INPUT_DIR"
# All output from here on goes to a log that is uploaded at the end.
exec > "$LOCAL_INPUT_DIR/snpcaller_merge.log" 2>&1

setname="$1"
INPUT_DIR="gs://wgs-combined-snps-vcfs/${setname}/inputfiles"
SAMPLE_BARCODE=$(gsutil cat "$INPUT_DIR/sample_barcode.txt")
CONVERTED_REPORTING_ID=$(gsutil cat "$INPUT_DIR/converted_reporting_id.txt")
OUTPUT_BUCKET_NAME=$(gsutil cat "$INPUT_DIR/output_bucket.txt")

# define BAM_TUM path
# BUG FIX: was ${converted_reporting_id} (lowercase), which is never defined
# and aborts the script under `set -u`; the variable is CONVERTED_REPORTING_ID.
BAM_TUM="gs://diagnostic-pipeline-output-prod-1/${setname}/${CONVERTED_REPORTING_ID}/aligner/${CONVERTED_REPORTING_ID}.bam"
REFERENCE="gs://common-resources/reference_genome/37/Homo_sapiens.GRCh37.GATK.illumina.fasta"

# Copy input files to local (gsutil expands the * wildcards itself).
gsutil cp "$INPUT_DIR/id_snps_intervals.hg37.bed" "$LOCAL_INPUT_DIR/id_snps_intervals.hg37.bed"
gsutil cp "$INPUT_DIR/hartwig_snpfile_tum.vcf" "$LOCAL_INPUT_DIR/hartwig_snpfile_tum.vcf"
gsutil cp "$INPUT_DIR/*.purple.germline.vcf.gz" "$LOCAL_INPUT_DIR/"
gsutil cp "$INPUT_DIR/*.purple.somatic.vcf.gz" "$LOCAL_INPUT_DIR/"
gsutil cp "$INPUT_DIR/GenomeAnalysisTK.jar" "$LOCAL_INPUT_DIR/GenomeAnalysisTK.jar"

# Create input file vars
INTERVALS="$LOCAL_INPUT_DIR/id_snps_intervals.hg37.bed"
SNP_VCF_TUM="$LOCAL_INPUT_DIR/hartwig_snpfile_tum.vcf"
GERMLINE_VCF=$(ls "$LOCAL_INPUT_DIR"/*.purple.germline.vcf.gz | head -n 1)
SOMATIC_VCF=$(ls "$LOCAL_INPUT_DIR"/*.purple.somatic.vcf.gz | head -n 1)
GATK="$LOCAL_INPUT_DIR/GenomeAnalysisTK.jar"
# BUG FIX: was `JAVA=#location java`, which assigns the EMPTY string (everything
# after # is a comment), so every `$JAVA ...` invocation silently ran nothing.
# Default to `java` on PATH, overridable via the environment.
JAVA="${JAVA:-java}"

# Create output file name vars
SNP_OUTPUT_VCF="${LOCAL_INPUT_DIR}/${SAMPLE_BARCODE}_snp_genotype_output.vcf"
MERGED_OUTPUT_VCF="${LOCAL_INPUT_DIR}/${SAMPLE_BARCODE}_merged.vcf"
FINAL_OUTPUT_VCF="${LOCAL_INPUT_DIR}/${CONVERTED_REPORTING_ID}.reported.variants.and.snps.vcf"

# Genotype the ID-SNP intervals on the tumor BAM with GATK3 UnifiedGenotyper.
# (The old comment said "HaplotypeCaller"; the tool actually invoked is
# UnifiedGenotyper.) EMIT_ALL_SITES forces a record at every interval
# position, including hom-ref sites, so every ID SNP appears in the output.
# NOTE(review): "$BAM_TUM" is a gs:// URL — confirm GATK3 can read it here
# (e.g. via a mounted bucket or localization); it normally expects a local path.
$JAVA -Xmx20G -jar "$GATK" \
-T UnifiedGenotyper \
-nct $(nproc) \
--input_file "$BAM_TUM" \
-o "$SNP_OUTPUT_VCF" \
-L "$INTERVALS" \
--reference_sequence "$REFERENCE" \
--output_mode EMIT_ALL_SITES

# Filter somatic/germline VCFs: keep all header lines plus only the records
# containing "REPORTED" (the reported variants).
zcat "$SOMATIC_VCF" | grep -E '^#|REPORTED' > "${LOCAL_INPUT_DIR}/${SAMPLE_BARCODE}_somatic_filtered.vcf"
zcat "$GERMLINE_VCF" | grep -E '^#|REPORTED' > "${LOCAL_INPUT_DIR}/${SAMPLE_BARCODE}_germline_filtered.vcf"

# CombineVariants: merge the reported somatic/germline variants with both
# SNP-genotype VCFs into one multi-sample VCF (UNSORTED merge of genotypes —
# no priority ordering between the input call sets).
$JAVA -jar "$GATK" \
-T CombineVariants \
-R "$REFERENCE" \
--variant "${LOCAL_INPUT_DIR}/${SAMPLE_BARCODE}_somatic_filtered.vcf" \
--variant "${LOCAL_INPUT_DIR}/${SAMPLE_BARCODE}_germline_filtered.vcf" \
--variant "$SNP_OUTPUT_VCF" \
--variant "$SNP_VCF_TUM" \
-o "$MERGED_OUTPUT_VCF" \
-genotypeMergeOptions UNSORTED

# Copy NA column to tumor if missing, and clean up VCF

# Parse the #CHROM header line into an array of column names (0-based).
header_line=$(grep -m 1 '^#CHROM' "$MERGED_OUTPUT_VCF")
IFS=$'\t' read -r -a headers <<< "$header_line"

tumor_col=-1
na_col=-1
ref_col=-1
filter_col=-1

# Identify columns:
#   NA         -> SNP-genotype sample column
#   *-ref      -> reference (normal) sample column
#   FILTER     -> standard VCF FILTER column
#   tumor      -> any remaining column that is not a fixed VCF field or FORMAT;
#                 if several candidates exist, the last one wins.
for i in "${!headers[@]}"; do
col="${headers[$i]}"
[[ "$col" == "NA" ]] && na_col=$i
[[ "$col" == *"-ref" ]] && ref_col=$i
[[ "$col" == "FILTER" ]] && filter_col=$i
[[ "$col" != "FORMAT" && ! "$col" =~ ^#?(CHROM|POS|ID|REF|ALT|QUAL|FILTER|INFO)$ && "$col" != "NA" && "$col" != *"-ref" ]] && tumor_col=$i
done

# Validate the two columns Step 1 cannot work without.
if [[ $tumor_col -eq -1 || $na_col -eq -1 ]]; then
echo "Error: Could not find both tumor and NA columns in the merged VCF."
exit 1
fi

# Convert to 1-based for awk.
# BUG FIX: the previous `(( var = expr ))` form returns exit status 1 when the
# computed value is 0 (e.g. ref_col == -1 when no "-ref" sample is present),
# which silently kills the script under `set -e`. Plain assignments with
# $(( )) never fail; this also matches id_snpcaller_vcfmerge.sh.
tumor_col_awk=$((tumor_col + 1))
na_col_awk=$((na_col + 1))
ref_col_awk=$((ref_col + 1))
filter_col_awk=$((filter_col + 1))

# Step 1: Backfill the tumor genotype column. When the tumor call is missing
# ("./.") take the genotype from the NA (SNP-genotype) column; any missing
# calls that remain after that default to "0/1".
awk -v tum="$tumor_col_awk" -v na="$na_col_awk" '
BEGIN { OFS = "\t" }
/^##/    { print; next }   # meta-information lines pass through untouched
/^#CHROM/ { print; next }  # header line passes through untouched
{
    na_usable = ($na != "./." && $na != "")
    if ($tum == "./." && na_usable)
        $tum = $na
    else if ($tum ~ /\.\/\./)
        gsub(/\.\/\./, "0/1", $tum)
    print
}
' "$MERGED_OUTPUT_VCF" > "${LOCAL_INPUT_DIR}/temp_with_na_corrected.vcf"

# Step 2: Remove ref and NA columns, set FILTER to PASS.
# BUG FIX: this step previously read "temp_with_na_corrected.vcf" and wrote
# "final.vcf" relative to the current directory, while Step 1 writes into
# $LOCAL_INPUT_DIR and Step 3 uploads "$FINAL_OUTPUT_VCF" — so the final VCF
# was never produced. Use the real paths. Also actually set FILTER to PASS:
# filter_col_awk was computed but never used, although the comment (and the
# sibling id_snpcaller_vcfmerge.sh) promise it.
awk -v ref="$ref_col_awk" -v na="$na_col_awk" -v filter="$filter_col_awk" '
BEGIN { OFS="\t" }

/^##/ { print; next }

/^#CHROM/ {
    out=""
    for (i=1; i<=NF; i++)
        if (i!=ref && i!=na)
            out = out (out ? OFS : "") $i
    print out
    next
}

{
    # Guard: filter==0 would mean "no FILTER column found"; assigning $0
    # would clobber the whole record.
    if (filter > 0)
        $filter = "PASS"
    out=""
    for (i=1; i<=NF; i++)
        if (i!=ref && i!=na)
            out = out (out ? OFS : "") $i
    print out
}
' "${LOCAL_INPUT_DIR}/temp_with_na_corrected.vcf" > "$FINAL_OUTPUT_VCF"

# Step 3: Upload final VCF.
# BUG FIX: the destination object name previously embedded the FULL local path
# ($FINAL_OUTPUT_VCF includes $LOCAL_INPUT_DIR), producing a nonsense GCS
# path; upload under the file's basename instead.
FINAL_BASENAME="${FINAL_OUTPUT_VCF##*/}"
gsutil cp "${FINAL_OUTPUT_VCF}" "gs://${OUTPUT_BUCKET_NAME}/${setname}/${FINAL_BASENAME}" || {
echo "Error: Failed to upload VCF to output bucket"
exit 1
}

gsutil cp "$LOCAL_INPUT_DIR/snpcaller_merge.log" "gs://${OUTPUT_BUCKET_NAME}/${setname}/"

# Remove the staged inputs in GCS and the local working directory.
gsutil rm -r "${INPUT_DIR}"
# BUG FIX: LOCAL_INPUT_DIR is a local directory; `gsutil rm` only operates on
# gs:// URLs and would fail here — use plain rm. (This also discards the log
# that stdout still points at, as before.)
rm -rf -- "${LOCAL_INPUT_DIR}"

echo "Final VCF copied to output bucket: gs://${OUTPUT_BUCKET_NAME}/${setname}/${FINAL_BASENAME}"

84 changes: 84 additions & 0 deletions snpcheck/id_snpcaller_vcfmerge_prep.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
#!/bin/bash
#
# Stage inputs for the snpcheck merge: collect SNP-genotype and purple VCFs
# plus metadata for a set, and upload them to the output bucket for
# id_snpcaller_vcfmerge_GATK.sh to consume.

# Usage check, only permit 1 or 3 arguments
if [ "$#" -lt 1 ] || [ "$#" -gt 3 ] || [ "$#" -eq 2 ]; then
echo "Usage: <setname> <input-bucket-name> <output-bucket-name>"
echo "Ex: 123456_HMFregCORE_FS12345678_CORE0100000 research-pipeline-output-prod-1 example-output-bucket"
echo "Input and output bucket-names are optional, defaults to diagnostic-pipeline-output-prod-1 and wgs-combined-snps-vcfs (used in production process)"
echo "NOTE: When specifying a bucket, both input and output buckets must be provided."
exit 1
else
# Redirect output to log only after usage validated
INPUT_DIR="$HOME/inputfiles"
# BUG FIX: was `mkdir INPUT_DIR` (missing $), which creates a literal
# directory named "INPUT_DIR" in the CWD and leaves $INPUT_DIR missing, so
# the exec redirection below fails. Create the real directory, idempotently.
mkdir -p "$INPUT_DIR"
exec > "$INPUT_DIR/prepare_inputs.log" 2>&1
set -euo pipefail
fi

setname=$1
DEFAULT_BUCKET="diagnostic-pipeline-output-prod-1"
BUCKET_NAME=${2:-$DEFAULT_BUCKET}

DEFAULT_OUTPUT_BUCKET="wgs-combined-snps-vcfs"
OUTPUT_BUCKET_NAME=${3:-$DEFAULT_OUTPUT_BUCKET}

MOUNT_POINT_BAM="$HOME/testdir/"
mkdir -p "$INPUT_DIR"

# Get barcodes.
# IMPROVEMENT: fetch the patient-reporter record ONCE and extract all three
# fields from it with jq, instead of calling the LAMA service three times.
# `jq -r` replaces the `jq | tr -d '"'` quote-stripping.
ISOLATION_BARCODE=$(echo "$setname" | cut -d'_' -f4)
PATIENT_REPORTER_DATA=$(lama_get_patient_reporter_data "${ISOLATION_BARCODE}")
SAMPLE_BARCODE=$(jq -r .tumorSampleBarcode <<< "$PATIENT_REPORTER_DATA")
HOSPITAL_SAMPLE_LABEL=$(jq -r .hospitalSampleLabel <<< "$PATIENT_REPORTER_DATA")
REPORTING_ID=$(jq -r .reportingId <<< "$PATIENT_REPORTER_DATA")

# Suffix the reporting id with the hospital sample label when one is set.
if [[ -n "$HOSPITAL_SAMPLE_LABEL" && "$HOSPITAL_SAMPLE_LABEL" != "null" ]]; then
CONVERTED_REPORTING_ID="${REPORTING_ID}-${HOSPITAL_SAMPLE_LABEL}"
else
CONVERTED_REPORTING_ID="${REPORTING_ID}"
fi

# Mount buckets
mkdir -p "${MOUNT_POINT_BAM}"

# Drop a stale mount left by a previous (failed) run, if any.
fusermount -u "${MOUNT_POINT_BAM}" 2>/dev/null || true

gcsfuse --implicit-dirs "${BUCKET_NAME}" "${MOUNT_POINT_BAM}"

# SNP intervals
cp "/data/resources/reporting-resources/snps/id_snps_intervals.hg37.bed" "$INPUT_DIR/id_snps_intervals.hg37.bed"

# Copy hartwig SNP VCF.
# BUG FIX: the array was built from an unquoted $(find ...) substitution,
# which word-splits and globs any path containing whitespace. Read the
# results NUL-delimited instead.
ALL_SNP_VCFS=()
while IFS= read -r -d '' snp_vcf; do
ALL_SNP_VCFS+=("$snp_vcf")
done < <(find "${MOUNT_POINT_BAM}${setname}" -type f -path "*/snp_genotype/*output.vcf" -print0)

if [ ${#ALL_SNP_VCFS[@]} -eq 0 ]; then
echo "Warning: Geen SNP VCF bestanden gevonden in ${MOUNT_POINT_BAM}${setname}" >&2
fi

# Keep only the tumor-sample SNP VCF; skip any "-ref" (normal sample) file.
for SNP_VCF_PATH in "${ALL_SNP_VCFS[@]}"; do
if [[ "$SNP_VCF_PATH" != *-ref* ]]; then
cp "$SNP_VCF_PATH" "$INPUT_DIR/hartwig_snpfile_tum.vcf"
fi
done

# Purple VCFs
cp "${MOUNT_POINT_BAM}${setname}/purple/"*.purple.germline.vcf.gz "$INPUT_DIR/"
cp "${MOUNT_POINT_BAM}${setname}/purple/"*.purple.somatic.vcf.gz "$INPUT_DIR/"

# Save metadata: the merge script (id_snpcaller_vcfmerge_GATK.sh) reads these
# three files from the staging directory to learn the sample barcode,
# reporting id and destination bucket.
echo "$SAMPLE_BARCODE" > "$INPUT_DIR/sample_barcode.txt"
echo "$CONVERTED_REPORTING_ID" > "$INPUT_DIR/converted_reporting_id.txt"
echo "$OUTPUT_BUCKET_NAME" > "$INPUT_DIR/output_bucket.txt"
# Ship the GATK3 jar with the inputs so the downstream VM needs no tool install.
cp "/data/tools/gatk/3.8.0/GenomeAnalysisTK.jar" "$INPUT_DIR/GenomeAnalysisTK.jar"

# Upload the whole staging directory; since INPUT_DIR is named "inputfiles",
# this lands at gs://<bucket>/<setname>/inputfiles/, where the merge script
# expects it.
gsutil cp -r "${INPUT_DIR}" "gs://${OUTPUT_BUCKET_NAME}/${setname}/"
gsutil cp "$INPUT_DIR/prepare_inputs.log" "gs://${OUTPUT_BUCKET_NAME}/${setname}/"

# Unmount & cleanup
fusermount -u "${MOUNT_POINT_BAM}"
rm -r "${MOUNT_POINT_BAM}"
rm -r "${INPUT_DIR}"

# NOTE(review): stdout still points at "$INPUT_DIR/prepare_inputs.log", which
# was just deleted — this final message is lost; consider sending it to stderr.
echo "Done. Files prepared in: gs://${OUTPUT_BUCKET_NAME}/${setname}/"