gatk-3.8/shell/firehose/pipelineTsvToYaml.sh

168 lines
5.3 KiB
Bash
Executable File

#!/bin/sh
# Uses awk to generate a YAML file from a TSV.
#
# Usage: pipelineTsvToYaml.sh <Set_Name>.tsv
#   The YAML is written to the same path with .tsv replaced by .yaml.
#   Exits with status 1 when the argument is missing, the file cannot be
#   read, or the file does not hold a header line plus at least one data line.
#
# In the awk script and templates:
# - Variables starting with a '$' are columns in the TSV
# - Variables without a '$' are pre-calculated from the first row of data

# TSV file to read
PIPELINE_TSV_FILE=$1
if [ -z "$PIPELINE_TSV_FILE" ]; then
  echo "Usage: $0 <Set_Name>.tsv" >&2
  exit 1
fi

# Fail early with a precise message instead of letting wc report the problem
# and then claiming that the header is missing.
if [ ! -f "$PIPELINE_TSV_FILE" ] || [ ! -r "$PIPELINE_TSV_FILE" ]; then
  echo "Cannot read tsv: $PIPELINE_TSV_FILE" >&2
  exit 1
fi

# Reading from stdin makes wc print only the number (no file name), so no
# array or word splitting is needed and paths with spaces are safe.
ROW_COUNT=$(wc -l < "$PIPELINE_TSV_FILE") || {
  echo "Cannot count lines in tsv: $PIPELINE_TSV_FILE" >&2
  exit 1
}
if [ "$ROW_COUNT" -lt 2 ]; then
  echo "Header plus data not found in tsv: $PIPELINE_TSV_FILE" >&2
  exit 1
fi
# YAML file to write: the input path with a trailing ".tsv" removed and
# ".yaml" appended (a path without that suffix simply gets ".yaml" appended).
PIPELINE_YAML_FILE=${PIPELINE_TSV_FILE%.tsv}.yaml
# YAML templates
# Each template is not plain text: it is the argument list of an awk printf
# statement (a double-quoted format string followed by the values for its %s
# slots) and is pasted verbatim into the awk program later in this script.
# Inside a template, a name with a leading '$' is an awk field reference
# through a column-index variable, and a bare name is an awk variable.
# Every line of the format string ends with a backslash-n escape followed by
# a backslash-newline.
# NOTE(review): presumably the backslash-newline is what lets the awk string
# literal span several source lines - confirm against the awk in use.
# Project YAML template, once per file.
PROJECT_YAML_TEMPLATE='"\n\
project: {\n\
name: %s,\n\
referenceFile: %s,\n\
genotypeDbsnp: %s,\n\
evalDbsnp: %s,\n\
refseqTable: %s,\n\
intervalList: %s\n\
},", projectName, $referenceFile, genotypeDbsnp, evalDbsnp, refseq, $intervalList'
# Sample YAML template, once per sample.
# Emits one flow mapping with the sample id, its cleaned BAM file and two tags.
SAMPLE_YAML_TEMPLATE='"\n\
{\n\
id: %s,\n\
bamFiles: { cleaned: %s },\n\
tags: {\n\
SQUIDProject: %s,\n\
CollaboratorID: %s\n\
}\n\
}", $sampleId, $bamFile, $squidProject, $collaboratorId'
# Probe how a literal backslash-n survives the echo | awk pipeline.
# awk prints its input unchanged, so wc -c reports 3 bytes (backslash, n,
# newline) when the sequence stays literal and 2 bytes (two newlines) when it
# has been converted into a real newline.
# NOTE(review): awk only copies its input here, so the outcome appears to be
# decided by the echo of the running shell rather than by awk - confirm that
# this is the intended probe.
TEST_AWK_COUNT=`echo '\n' | awk '{print $0}' | wc -c`
if [ "$TEST_AWK_COUNT" -eq 2 ]; then
# Strip the extra \n from the lines if awk of \n is
# a newline and not the two characters slash-n (on mac)
# The substitutions below delete the backslash-n escapes from both templates.
# NOTE(review): the ${var//pattern/} form is a bash extension and is not
# available in every /bin/sh, which is the interpreter named in the shebang.
PROJECT_YAML_TEMPLATE="${PROJECT_YAML_TEMPLATE//\\\n/}"
SAMPLE_YAML_TEMPLATE="${SAMPLE_YAML_TEMPLATE//\\\n/}"
fi
# Generate yaml from tsv
#
# pipeline_tsv_to_yaml TSV_FILE YAML_FILE
#   Reads the tab-separated TSV_FILE (line 1: column headers, every further
#   line: one sample) and writes the YAML document to YAML_FILE.
#   Globals: PROJECT_YAML_TEMPLATE and SAMPLE_YAML_TEMPLATE (read); both are
#            spliced into the awk program as the arguments of a printf.
#   Outputs: errors and warnings on stderr.
#   Returns: the exit status of awk; 1 when a required column header is
#            missing, in which case nothing is written to YAML_FILE.
pipeline_tsv_to_yaml() {
  awk '
  BEGIN {
    # Split on tabs only, so that empty cells keep their position and values
    # may contain spaces.
    FS = "\t"
  }

  # Record that a required column header is absent.
  function missingColumn(label) {
    print "ERROR: Column header " label " missing from " tsvFile > "/dev/stderr"
    exitWithError = 1
  }

  # Return the index of the column with the given lower-case header name,
  # or "" (after reporting the error) when there is no such column.
  function requiredColumn(name,    col) {
    col = (name in columnFields) ? columnFields[name] : ""
    if (col == "")
      missingColumn(name)
    return col
  }

  # Header line: work out where each column lives.
  NR == 1 {
    # FILENAME is the path awk was given, so nothing from the shell has to be
    # interpolated into the program text.
    tsvFile = FILENAME

    # Set the project name to the TSV file minus the directory and the .tsv
    projectName = tsvFile
    sub(/^.*\//, "", projectName)
    sub(/\.tsv$/, "", projectName)

    # Read the column headers and figure out the index of each column name.
    for (i = 1; i <= NF; i++)
      columnFields[tolower($i)] = i

    referenceFile = requiredColumn("reference_file")
    intervalList = requiredColumn("interval_list")
    sampleId = requiredColumn("sample_id")
    squidProject = requiredColumn("squid_project")
    collaboratorId = requiredColumn("collaborator_id")

    # Any header containing bam_file is accepted as the BAM column; if several
    # match, the one visited last by the array scan is used.
    bamFile = ""
    for (key in columnFields)
      if (key ~ /bam_file/)
        bamFile = columnFields[key]
    if (bamFile == "")
      missingColumn("*bam_file*")

    if (exitWithError)
      exit 1

    refseqDir = "/humgen/gsa-hpprojects/GATK/data/Annotations/refseq/"
    dbsnpDir = "/humgen/gsa-hpprojects/GATK/data/"

    # add hg18 specific files to awk associative arrays
    genotypeDbsnps["Homo_sapiens_assembly18.fasta"] = dbsnpDir "dbsnp_129_hg18.rod"
    evalDbsnps["Homo_sapiens_assembly18.fasta"] = dbsnpDir "dbsnp_129_hg18.rod"
    refseqs["Homo_sapiens_assembly18.fasta"] = refseqDir "refGene-big-table-hg18.txt"

    # add hg19 specific files to awk associative arrays
    genotypeDbsnps["Homo_sapiens_assembly19.fasta"] = dbsnpDir "dbsnp_132_b37.vcf"
    evalDbsnps["Homo_sapiens_assembly19.fasta"] = dbsnpDir "dbsnp_129_b37.rod"
    refseqs["Homo_sapiens_assembly19.fasta"] = refseqDir "refGene-big-table-hg19.txt"

    printf "{"
    next
  }

  # Data lines: one sample each.
  {
    if ($referenceFile == "" || $intervalList == "" || $sampleId == "" ||
        $squidProject == "" || $collaboratorId == "" || $bamFile == "") {
      print "WARNING: Skipping row which does not have all values: " $0 > "/dev/stderr"
      next
    }

    # Key the project header on the first sample actually emitted, not on the
    # line number, so a skipped first data row cannot suppress the header.
    if (sampleCount == 0) {
      # Based on the reference of the first sample, specify the dbsnps and refseq tables.
      referencePartCount = split($referenceFile, referenceParts, "/")
      referenceName = referenceParts[referencePartCount]
      genotypeDbsnp = genotypeDbsnps[referenceName]
      evalDbsnp = evalDbsnps[referenceName]
      refseq = refseqs[referenceName]
      printf '"$PROJECT_YAML_TEMPLATE"'
      printf "\n samples: ["
    } else {
      printf ","
    }
    sampleCount++
    printf '"$SAMPLE_YAML_TEMPLATE"'
  }

  END {
    # exit in the header rule still runs END; do not emit closing brackets
    # for a document that was never started.
    if (exitWithError)
      exit 1
    # Close the sample list only if it was opened.
    if (sampleCount > 0)
      printf "\n ]"
    print "\n}"
  }' "$1" > "$2"
}

pipeline_tsv_to_yaml "$PIPELINE_TSV_FILE" "$PIPELINE_YAML_FILE"