157 lines
4.8 KiB
Bash
Executable File
157 lines
4.8 KiB
Bash
Executable File
#!/bin/sh
|
|
|
|
# Uses awk to generate a YAML file from a TSV.
|
|
|
|
# In the awk script and templates:
|
|
# - Variables starting with a '$' are columns in the TSV
|
|
# - Variables without a '$' are pre-calculated from the first row of data
|
|
|
|
|
|
# TSV file to read (first positional argument).
PIPELINE_TSV_FILE=$1

if [ -z "$PIPELINE_TSV_FILE" ]; then
  printf 'Usage: %s <Set_Name>.tsv\n' "$0" >&2
  exit 1
fi

# Count the lines of the TSV. Reading the file on stdin makes wc print only
# the number (no file name), and quoting keeps paths with spaces or glob
# characters intact. Some wc implementations pad the number with blanks, so
# those are stripped. If the file cannot be opened the count is empty and is
# treated as 0 below, which reports the same error as a too-short file.
ROW_COUNT=$(wc -l < "$PIPELINE_TSV_FILE" | tr -d '[:space:]')
if [ "${ROW_COUNT:-0}" -lt 2 ]; then
  printf 'Header plus data not found in tsv: %s\n' "$PIPELINE_TSV_FILE" >&2
  exit 1
fi

# YAML file to write: the TSV path with a trailing .tsv replaced by .yaml
# (a path without the .tsv suffix simply gets .yaml appended).
PIPELINE_YAML_FILE="${PIPELINE_TSV_FILE%.tsv}.yaml"
|
|
|
|
# YAML templates
#
# Each template is a fragment of awk source that gets spliced directly after
# a `printf` keyword in the awk program: a double-quoted awk format string
# followed by the comma-separated printf arguments. In the argument list,
# names starting with '$' are TSV columns (field references through a column
# index variable); names without '$' are plain awk variables.
#
# Every template is assembled as ONE physical line from adjacent single-quoted
# pieces joined by shell line continuations. Line breaks in the generated YAML
# therefore come only from the "\n" escape inside the awk string, which every
# awk interprets the same way. Nothing here relies on how a particular awk
# treats a backslash-newline inside a string constant, so no per-platform
# detection or post-processing of the templates is needed.

# Project YAML template, once per file.
PROJECT_YAML_TEMPLATE='"\n'\
'project: {\n'\
'name: %s,\n'\
'referenceFile: %s,\n'\
'genotypeDbsnp: %s,\n'\
'evalDbsnp: %s,\n'\
'refseqTable: %s,\n'\
'intervalList: %s\n'\
'},", projectName, $referenceFile, genotypeDbsnp, evalDbsnp, refseq, $intervalList'

# Sample YAML template, once per sample.
SAMPLE_YAML_TEMPLATE='"\n'\
'{\n'\
'id: %s,\n'\
'bamFiles: { cleaned: %s },\n'\
'tags: {\n'\
'SQUIDProject: %s,\n'\
'CollaboratorID: %s\n'\
'}\n'\
'}", $sampleId, $bamFile, $squidProject, $collaboratorId'
|
|
|
|
# Generate yaml from tsv
#
# Row 1 of the TSV is the header: it is used to locate the required columns
# by (case-insensitive) name. Row 2 is the first sample: its reference file
# decides which dbsnp/refseq files go into the project section. Every row
# from 2 onwards is emitted as one entry of the samples list.
#
# PROJECT_YAML_TEMPLATE and SAMPLE_YAML_TEMPLATE are spliced into the awk
# source as the arguments of printf. They hold awk code (format string plus
# argument list), so they cannot be passed in with -v.
#
# NOTE(review): fields are split with the default awk separator (runs of
# blanks and tabs), so a value containing a space or an empty cell shifts the
# columns. Consider -F '\t' once all inputs are confirmed to be strict TSV.
#
# On any awk failure the partial YAML file is removed and the script exits 1.
awk '
# Look up a mandatory header column. Returns its 1-based field index; when
# the header is absent it reports that on stderr and flags the run as failed.
function requiredColumn(name,    idx) {
  idx = columnFields[name]
  if (idx == "") {
    print "Column header " name " missing from " tsvFile > "/dev/stderr"
    exitWithError = 1
  }
  return idx
}

# Header row: resolve column indexes and set up the per-reference tables.
NR == 1 {
  # FILENAME is the path exactly as given on the command line; using it
  # avoids pasting the shell variable into the program text.
  tsvFile = FILENAME

  # Set the project name to the TSV file minus the directory and the .tsv
  projectName = tsvFile
  sub(/^.*\//, "", projectName)
  sub(/\.tsv$/, "", projectName)

  # Read the column headers and figure out the index of each column name.
  for (i = 1; i <= NF; i++)
    columnFields[tolower($i)] = i

  referenceFile  = requiredColumn("reference_file")
  intervalList   = requiredColumn("interval_list")
  sampleId       = requiredColumn("sample_id")
  squidProject   = requiredColumn("squid_project")
  collaboratorId = requiredColumn("collaborator_id")

  # Any header containing "bam_file" qualifies. If several headers match,
  # the one picked depends on the array iteration order.
  for (key in columnFields)
    if (key ~ /bam_file/)
      bamFile = columnFields[key]

  if (bamFile == "") {
    print "Column header *bam_file* missing from " tsvFile > "/dev/stderr"
    exitWithError = 1
  }

  # All missing headers have been reported; stop before writing any output.
  if (exitWithError)
    exit 1

  refseqDir = "/humgen/gsa-hpprojects/GATK/data/Annotations/refseq/"
  dbsnpDir = "/humgen/gsa-hpprojects/GATK/data/"

  # add hg18 specific files to awk associative arrays
  genotypeDbsnps["Homo_sapiens_assembly18.fasta"] = dbsnpDir "dbsnp_129_hg18.rod"
  evalDbsnps["Homo_sapiens_assembly18.fasta"] = dbsnpDir "dbsnp_129_hg18.rod"
  refseqs["Homo_sapiens_assembly18.fasta"] = refseqDir "refGene-big-table-hg18.txt"

  # add hg19 specific files to awk associative arrays
  genotypeDbsnps["Homo_sapiens_assembly19.fasta"] = dbsnpDir "dbsnp_132_b37.vcf"
  evalDbsnps["Homo_sapiens_assembly19.fasta"] = dbsnpDir "dbsnp_129_b37.rod"
  refseqs["Homo_sapiens_assembly19.fasta"] = refseqDir "refGene-big-table-hg19.txt"

  printf "{"
  next
}

# First sample row: emit the project section and open the samples list.
NR == 2 {
  # Based on the reference of the first sample, specify the dbsnps and
  # refseq tables. Only the file name of the reference path is used as key.
  referencePartCount = split($referenceFile, referenceParts, "/")
  referenceName = referenceParts[referencePartCount]

  if (!(referenceName in refseqs))
    print "Warning: no dbsnp/refseq files known for reference " referenceName \
          " in " tsvFile "; those project values will be empty" > "/dev/stderr"

  genotypeDbsnp = genotypeDbsnps[referenceName]
  evalDbsnp = evalDbsnps[referenceName]
  refseq = refseqs[referenceName]

  printf '"$PROJECT_YAML_TEMPLATE"'
  printf "\n samples: ["
}

# Later sample rows: separate from the previous list entry.
NR > 2 {
  printf ","
}

# Every data row (the header row never gets here): one sample entry.
{
  printf '"$SAMPLE_YAML_TEMPLATE"'
}

END {
  # END also runs after an exit in a main rule; keep the output empty and
  # the failure status intact in that case.
  if (exitWithError)
    exit 1

  # Close the samples list only if a data row opened it.
  if (NR > 1)
    printf "\n ]"
  print "\n}"
}' "$PIPELINE_TSV_FILE" > "$PIPELINE_YAML_FILE" || {
  # Do not leave an empty or truncated YAML file behind on failure.
  rm -f -- "$PIPELINE_YAML_FILE"
  exit 1
}
|