gatk-3.8/shell/firehose/pipelineTsvToYaml.sh

#!/bin/sh

# Uses awk to generate a YAML file from a TSV.

# In the awk script and templates:
#   - Variables starting with a '$' are columns in the TSV
#   - Variables without a '$' are pre-calculated from the first row of data


# TSV file to read
PIPELINE_TSV_FILE=$1

# The TSV path is mandatory: print usage to stderr and fail when it is absent.
if [ -z "$PIPELINE_TSV_FILE" ]; then
    echo "Usage: $0 <Set_Name>.tsv" >&2
    exit 1
fi

# Count records with awk instead of `wc -l`: wc counts newline characters, so
# a final data row without a trailing newline would not be counted.  The file
# is fed through a quoted redirection so that paths containing spaces (or an
# '=' sign, which awk would treat as a variable assignment operand) work.
# An unreadable or missing file leaves ROW_COUNT empty, which is treated as 0.
ROW_COUNT=$(awk 'END { print NR }' < "$PIPELINE_TSV_FILE")
if [ "${ROW_COUNT:-0}" -lt 2 ]; then
    echo "Header plus data not found in tsv: $PIPELINE_TSV_FILE" >&2
    exit 1
fi

# YAML file to write: same path as the TSV with a trailing .tsv replaced by
# .yaml (when there is no .tsv suffix, .yaml is simply appended).
PIPELINE_YAML_FILE="${PIPELINE_TSV_FILE%.tsv}.yaml"

# YAML templates

# Each template holds the argument list of an awk `printf`: a double-quoted
# awk format string followed by the expressions substituted into it.  The
# text is spliced verbatim into the awk program that generates the YAML.
#
# The format string is assembled by the shell into ONE physical line in which
# every line break is spelled as the awk escape sequence \n.  This avoids
# backslash-newline continuations inside an awk string literal, whose handling
# differs between awk implementations, so no runtime probing of the platform
# is needed.
#
# NOTE: the continuation lines below must start in column 0.  The trailing
# backslash joins them to the previous line, and any whitespace before the
# opening quote would end the assignment.

# Project YAML template, once per file.
PROJECT_YAML_TEMPLATE='"\n'\
'  project: {\n'\
'    name: %s,\n'\
'    referenceFile: %s,\n'\
'    genotypeDbsnp: %s,\n'\
'    evalDbsnp: %s,\n'\
'    refseqTable: %s,\n'\
'    intervalList: %s\n'\
'  },", projectName, $referenceFile, genotypeDbsnp, evalDbsnp, refseq, $intervalList'

# Sample YAML template, once per sample.
SAMPLE_YAML_TEMPLATE='"\n'\
'    {\n'\
'      id: %s,\n'\
'      bamFiles: { cleaned: %s },\n'\
'      tags: {\n'\
'        SQUIDProject: %s,\n'\
'        CollaboratorID: %s\n'\
'      }\n'\
'    }", $sampleId, $bamFile, $squidProject, $collaboratorId'

# Generate yaml from tsv
#
# The awk program is written inside shell single quotes; the two YAML
# templates are spliced in by briefly closing and reopening those quotes.
# Keep apostrophes out of comments inside the program for that reason.
#
# Row 1 (the header) maps column names to field indexes, row 2 additionally
# emits the project section, and every data row emits one sample entry.
# On any failure the partially written YAML file is removed and the script
# exits with status 1.
awk '
{
    if (NR == 1) {
        # FILENAME is the TSV path passed as the operand at the end of this
        # command; using it avoids pasting the shell variable into the
        # program text, which broke on paths with spaces or quotes.
        tsvFile = FILENAME

        # Set the project name to the TSV file minus the directory and the .tsv
        projectName = tsvFile
        sub(/.*\//, "", projectName)
        sub(/\.tsv$/, "", projectName)

        # Read the column headers and figure out the index of each column name.
        for (i=1; i<=NF; i++)
            columnFields[tolower($i)] = i

        referenceFile = columnFields["reference_file"]
        intervalList = columnFields["interval_list"]
        sampleId = columnFields["sample_id"]
        squidProject = columnFields["squid_project"]
        collaboratorId = columnFields["collaborator_id"]

        # Any header containing bam_file is accepted as the BAM column.
        # If several match, which one wins depends on awk array ordering.
        for (key in columnFields)
            if (key ~ "bam_file")
                bamFile = columnFields[key]

        # Report every missing column before giving up, not just the first.
        if (referenceFile == "") {
            print "Column header reference_file missing from " tsvFile > "/dev/stderr"
            exitWithError = 1
        }

        if (intervalList == "") {
            print "Column header interval_list missing from " tsvFile > "/dev/stderr"
            exitWithError = 1
        }

        if (sampleId == "") {
            print "Column header sample_id missing from " tsvFile > "/dev/stderr"
            exitWithError = 1
        }

        if (squidProject == "") {
            print "Column header squid_project missing from " tsvFile > "/dev/stderr"
            exitWithError = 1
        }

        if (collaboratorId == "") {
            print "Column header collaborator_id missing from " tsvFile > "/dev/stderr"
            exitWithError = 1
        }

        if (bamFile == "") {
            print "Column header *bam_file* missing from " tsvFile > "/dev/stderr"
            exitWithError = 1
        }

        if (exitWithError) {
            exit 1
        }


        refseqDir = "/humgen/gsa-hpprojects/GATK/data/Annotations/refseq/"
        dbsnpDir = "/humgen/gsa-hpprojects/GATK/data/"

        # add hg18 specific files to awk associative arrays
        genotypeDbsnps["Homo_sapiens_assembly18.fasta"] = dbsnpDir "dbsnp_129_hg18.rod"
        evalDbsnps["Homo_sapiens_assembly18.fasta"] = dbsnpDir "dbsnp_129_hg18.rod"
        refseqs["Homo_sapiens_assembly18.fasta"] = refseqDir "refGene-big-table-hg18.txt"

        # add hg19 specific files to awk associative arrays
        genotypeDbsnps["Homo_sapiens_assembly19.fasta"] = dbsnpDir "dbsnp_132_b37.vcf"
        evalDbsnps["Homo_sapiens_assembly19.fasta"] = dbsnpDir "dbsnp_129_b37.rod"
        refseqs["Homo_sapiens_assembly19.fasta"] = refseqDir "refGene-big-table-hg19.txt"

        printf "{"
    } else {
        if (NR == 2) {
            # Based on the reference of the first sample, specify the dbsnps and refseq tables.
            # Only the file name of the reference path is used for the lookup;
            # an unrecognised reference yields empty values in the YAML.

            referencePartCount = split($referenceFile, referenceParts, "/")
            referenceName = referenceParts[referencePartCount]

            genotypeDbsnp = genotypeDbsnps[referenceName]
            evalDbsnp = evalDbsnps[referenceName]
            refseq = refseqs[referenceName]

            printf '"$PROJECT_YAML_TEMPLATE"'
            printf "\n  samples: ["
        } else {
            # Separator between consecutive sample entries.
            printf ","
        }
        printf '"$SAMPLE_YAML_TEMPLATE"'
    }
}
END {
    # An exit in the main rule still runs END, so stop here on a header
    # error instead of writing closing brackets into the output.
    if (exitWithError)
        exit 1
    # The samples list is only opened once a data row (NR == 2) was seen.
    if (NR > 1)
        printf "\n  ]"
    print "\n}"
}' "$PIPELINE_TSV_FILE" > "$PIPELINE_YAML_FILE" || {
    # Do not leave an empty or truncated YAML file behind on failure.
    rm -f -- "$PIPELINE_YAML_FILE"
    exit 1
}