From 24ef2be02d0bb35a207e5e80855ff8a51055a4eb Mon Sep 17 00:00:00 2001 From: kshakir Date: Fri, 25 Feb 2011 00:23:05 +0000 Subject: [PATCH] Updated firehose pulldown shell scripts: - a LOT more error reporting to stderr and exit codes - split the firehose pull down into TSV generators and a TSV to YAML converter - YAML converter is compatible with the TSVs generated by the front end website and will grab only the appropriate columns - deprecated getFirehosePipelineYaml.sh mode with a single Sample_Set name which uses the Firehose test harness - new getFirehosePipelineYaml.sh mode using web services API and requires an additional parameter, a password config file with "-u :" which has been tested on problematic Sample_Sets git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5313 348d0f76-0448-11de-a6fe-93d51630548a --- shell/firehose/getFirehoseCurlTsv.sh | 61 +++++++++ shell/firehose/getFirehosePipelineYaml.sh | 10 ++ shell/firehose/getFirehoseTestTsv.sh | 68 ++++++++++ shell/firehose/pipelineTsvToYaml.sh | 156 ++++++++++++++++++++++ shell/getFirehosePipelineYaml.sh | 138 ------------------- 5 files changed, 295 insertions(+), 138 deletions(-) create mode 100755 shell/firehose/getFirehoseCurlTsv.sh create mode 100755 shell/firehose/getFirehosePipelineYaml.sh create mode 100755 shell/firehose/getFirehoseTestTsv.sh create mode 100755 shell/firehose/pipelineTsvToYaml.sh delete mode 100755 shell/getFirehosePipelineYaml.sh diff --git a/shell/firehose/getFirehoseCurlTsv.sh b/shell/firehose/getFirehoseCurlTsv.sh new file mode 100755 index 000000000..bd0acec26 --- /dev/null +++ b/shell/firehose/getFirehoseCurlTsv.sh @@ -0,0 +1,61 @@ +#!/bin/sh + +# Downloads a set of samples from Firehose using the Firehose API and generates a TSV file with the outputs. 
+# see: http://iwww.broadinstitute.org/cancer/cga/wiki/index.php/GetAnnotations + +ENTITY_SET_ID=$1 +ENTITY_SET_TYPE=Sample_Set +ENTITY_TYPE=Sample +PASSWORD_FILE=$2 + +if [ "$ENTITY_SET_ID" == "" ]; then + EXIT_USAGE=1 +fi + +if [ "$PASSWORD_FILE" == "" ]; then + echo 'Missing password file with the contents: "-u :"' >&2 + EXIT_USAGE=1 +fi + +if [ $EXIT_USAGE ]; then + echo "Usage: $0 " >&2 + exit 1 +fi + +# Firehose variables + +FIREHOSE_HOST=firehose +FIREHOSE_PORT=8080 +FIREHOSE_DOMAIN=gsa +FIREHOSE_WORKSPACE=trunk + +# TSV file to write + +PIPELINE_TSV_FILE=$ENTITY_SET_ID.tsv + +# Annotations to pull down from Firehose + +FIREHOSE_ANNOTATIONS=(reference_file interval_list recalibrated_bam_file squid_project collaborator_id) + +index=0 +count=${#FIREHOSE_ANNOTATIONS[@]} +FIREHOSE_VARIABLES="" + +# Build the tab separated list of firehose arguments + +while [ "$index" -lt "$count" ]; do + FIREHOSE_VARIABLES=$FIREHOSE_VARIABLES'&annotationTypes='${FIREHOSE_ANNOTATIONS[$index]} + let "index = $index + 1" +done + +curl --fail -sL -K "$PASSWORD_FILE" -o "$PIPELINE_TSV_FILE" \ + "http://$FIREHOSE_HOST:$FIREHOSE_PORT/$FIREHOSE_DOMAIN/ws/entity/getAnnotations/$ENTITY_TYPE?entityNames=$ENTITY_SET_ID&filterSetType=$ENTITY_SET_TYPE&workspaceName=$FIREHOSE_WORKSPACE$FIREHOSE_VARIABLES" + +EXIT_CODE=$? + +if [[ $EXIT_CODE -ne 0 ]]; then + echo "curl failed with exit code:" $EXIT_CODE >&2 + echo 'Check the name of your Sample_Set and that your password file '$PASSWORD_FILE' is setup correctly with: "-u :"' >&2 + echo "If that doesn't work make sure you can login to the firehose website: http://$FIREHOSE_HOST:$FIREHOSE_PORT/$FIREHOSE_DOMAIN" >&2 + exit $EXIT_CODE +fi diff --git a/shell/firehose/getFirehosePipelineYaml.sh b/shell/firehose/getFirehosePipelineYaml.sh new file mode 100755 index 000000000..604affcd3 --- /dev/null +++ b/shell/firehose/getFirehosePipelineYaml.sh @@ -0,0 +1,10 @@ +#!/bin/sh + +# Downloads a set of samples from Firehose and generates a YAML file. 
+ +DIR=`dirname $0` +if [ "$2" == "" ]; then + $DIR/getFirehoseTestTsv.sh $1 && $DIR/pipelineTsvToYaml.sh $1.tsv +else + $DIR/getFirehoseCurlTsv.sh $1 $2 && $DIR/pipelineTsvToYaml.sh $1.tsv +fi diff --git a/shell/firehose/getFirehoseTestTsv.sh b/shell/firehose/getFirehoseTestTsv.sh new file mode 100755 index 000000000..1fdb4335a --- /dev/null +++ b/shell/firehose/getFirehoseTestTsv.sh @@ -0,0 +1,68 @@ +#!/bin/sh + +# Downloads a set of samples from Firehose using the obsolete Firehose Test Harness and generate a TSV file with the outputs. + +ENTITY_SET_ID=$1 +ENTITY_SET_TYPE=Sample_Set +ENTITY_TYPE=Sample + +if [ "$ENTITY_SET_ID" == "" ]; then + echo "Usage: $0 " >&2 + exit 1 +fi + +# Firehose variables + +FIREHOSE_SOURCE_HOME=/humgen/gsa-firehose/firehose/source +CGA_HOME=$FIREHOSE_SOURCE_HOME/CancerGenomeAnalysis +FIREHOSE_TEST_HARNESS="python $CGA_HOME/analysis_pipeline/scripts/firehose_test_harness.py" +FIREHOSE_HOST=firehose +FIREHOSE_PORT=8080 +FIREHOSE_DOMAIN=gsa +FIREHOSE_WORKSPACE=trunk + +# TSV file to write + +PIPELINE_TSV_FILE=$ENTITY_SET_ID.tsv + +# Annotations to pull down from Firehose + +FIREHOSE_ANNOTATIONS=(reference_file interval_list sample_id recalibrated_bam_file squid_project collaborator_id) + +index=0 +count=${#FIREHOSE_ANNOTATIONS[@]} +TSV_HEADER="" +FIREHOSE_VARIABLES="" +TAB=' ' + +# Build the tab separated list of firehose arguments + +while [ "$index" -lt "$count" ]; do + if [ "$FIREHOSE_VARIABLES" != "" ]; then + FIREHOSE_VARIABLES=$FIREHOSE_VARIABLES$TAB + TSV_HEADER=$TSV_HEADER$TAB + fi + FIREHOSE_VARIABLES=$FIREHOSE_VARIABLES'${'${FIREHOSE_ANNOTATIONS[$index]}'}' + TSV_HEADER=$TSV_HEADER${FIREHOSE_ANNOTATIONS[$index]} + let "index = $index + 1" +done + +# Retrieve all the required variables via the test harness. 
+$FIREHOSE_TEST_HARNESS \ + -d $FIREHOSE_DOMAIN -w $FIREHOSE_WORKSPACE \ + -t $ENTITY_TYPE -f $ENTITY_SET_ID -y $ENTITY_SET_TYPE \ + "echo '$FIREHOSE_VARIABLES'" && \ +\ +# Generate tsv header +echo "$TSV_HEADER" > $PIPELINE_TSV_FILE && \ +# Generate tsv from firehose output +. firehose-populated-commands.sh >> $PIPELINE_TSV_FILE + +EXIT_CODE=$? + +if [[ $EXIT_CODE -ne 0 ]]; then + echo "" >&2 + echo "The Firehose test harness failed with exit code:" $EXIT_CODE >&2 + echo 'Check the name of your Sample_Set or try using the newer getFirehoseCurlTsv.sh' >&2 + exit $EXIT_CODE +fi diff --git a/shell/firehose/pipelineTsvToYaml.sh b/shell/firehose/pipelineTsvToYaml.sh new file mode 100755 index 000000000..8582ec822 --- /dev/null +++ b/shell/firehose/pipelineTsvToYaml.sh @@ -0,0 +1,156 @@ +#!/bin/sh + +# Uses awk to generate a YAML file from a TSV. + +# In the awk script and templates: +# - Variables starting with a '$' are columns in the TSV +# - Variables without a '$' are pre-calculated from the first row of data + + +# TSV file to read +PIPELINE_TSV_FILE=$1 + +if [ "$PIPELINE_TSV_FILE" == "" ]; then + echo "Usage: $0 .tsv" >&2 + exit 1 +fi + +ROW_COUNT=(`wc -l $PIPELINE_TSV_FILE`) +if [[ ${ROW_COUNT[0]} -lt 2 ]]; then + echo "Header plus data not found in tsv: $PIPELINE_TSV_FILE" >&2 + exit 1 +fi + +# YAML file to write +PIPELINE_YAML_FILE=${PIPELINE_TSV_FILE%.tsv}.yaml + +# YAML templates + +# Project YAML template, once per file. +PROJECT_YAML_TEMPLATE='"\n\ + project: {\n\ + name: %s,\n\ + referenceFile: %s,\n\ + genotypeDbsnpFile: %s,\n\ + evalDbsnpFile: %s,\n\ + refseqTable: %s,\n\ + intervalList: %s\n\ + },", projectName, $referenceFile, genotypeDbsnp, evalDbsnp, refseq, $intervalList' + +# Sample YAML template, once per sample. 
+SAMPLE_YAML_TEMPLATE='"\n\ + {\n\ + id: %s,\n\ + bamFiles: { cleaned: %s },\n\ + tags: {\n\ + SQUIDProject: %s,\n\ + CollaboratorID: %s\n\ + }\n\ + }", $sampleId, $bamFile, $squidProject, $collaboratorId' + +TEST_AWK_COUNT=`echo '\n' | awk '{print $0}' | wc -c` +if [ "$TEST_AWK_COUNT" -eq 2 ]; then + # Strip the extra \n from the lines if awk of \n is + # a newline and not the two characters slash-n (on mac) + PROJECT_YAML_TEMPLATE="${PROJECT_YAML_TEMPLATE//\\\n/}" + SAMPLE_YAML_TEMPLATE="${SAMPLE_YAML_TEMPLATE//\\\n/}" +fi + +# Generate yaml from tsv +awk ' +{ + if (NR == 1) { + tsvFile = "'$PIPELINE_TSV_FILE'" + + # Set the project name to the TSV file minus the directory and the .tsv + projectName = tsvFile + sub(/\/.*\//, "", projectName) + sub(/\.tsv/, "", projectName) + + # Read the column headers and figure out the index of each column name. + for (i=1; i<=NF; i++) + columnFields[tolower($i)] = i + + referenceFile = columnFields["reference_file"] + intervalList = columnFields["interval_list"] + sampleId = columnFields["sample_id"] + squidProject = columnFields["squid_project"] + collaboratorId = columnFields["collaborator_id"] + + for (key in columnFields) + if (key ~ "bam_file") + bamFile = columnFields[key] + + if (referenceFile == "") { + print "Column header reference_file missing from " tsvFile > "/dev/stderr" + exitWithError = 1 + } + + if (intervalList == "") { + print "Column header interval_list missing from " tsvFile > "/dev/stderr" + exitWithError = 1 + } + + if (sampleId == "") { + print "Column header sample_id missing from " tsvFile > "/dev/stderr" + exitWithError = 1 + } + + if (squidProject == "") { + print "Column header squid_project missing from " tsvFile > "/dev/stderr" + exitWithError = 1 + } + + if (collaboratorId == "") { + print "Column header collaborator_id missing from " tsvFile > "/dev/stderr" + exitWithError = 1 + } + + if (bamFile == "") { + print "Column header *bam_file* missing from " tsvFile > "/dev/stderr" + exitWithError 
= 1 + } + + if (exitWithError) { + exit 1 + } + + + refseqDir = "/humgen/gsa-hpprojects/GATK/data/Annotations/refseq/" + dbsnpDir = "/humgen/gsa-hpprojects/GATK/data/" + + # add hg18 specific files to awk associative arrays + genotypeDbsnps["Homo_sapiens_assembly18.fasta"] = dbsnpDir "dbsnp_129_hg18.rod" + evalDbsnps["Homo_sapiens_assembly18.fasta"] = dbsnpDir "dbsnp_129_hg18.rod" + refseqs["Homo_sapiens_assembly18.fasta"] = refseqDir "refGene-big-table-hg18.txt" + + # add hg19 specific files to awk associative arrays + genotypeDbsnps["Homo_sapiens_assembly19.fasta"] = dbsnpDir "dbsnp_132_b37.vcf" + evalDbsnps["Homo_sapiens_assembly19.fasta"] = dbsnpDir "dbsnp_129_b37.rod" + refseqs["Homo_sapiens_assembly19.fasta"] = refseqDir "refGene-big-table-hg19.txt" + + printf "{" + } else { + if (NR == 2) { + # Based on the reference of the first sample, specify the dbsnps and refseq tables. + + referencePartCount = split($referenceFile, referenceParts, "/") + referenceName = referenceParts[referencePartCount] + + genotypeDbsnp = genotypeDbsnps[referenceName] + evalDbsnp = evalDbsnps[referenceName] + refseq = refseqs[referenceName] + + printf '"$PROJECT_YAML_TEMPLATE"' + printf "\n samples: [" + } else { + printf "," + } + printf '"$SAMPLE_YAML_TEMPLATE"' + } +} +END { + if (NR > 0) + printf "\n ]" + print "\n}" +}' "$PIPELINE_TSV_FILE" > "$PIPELINE_YAML_FILE" diff --git a/shell/getFirehosePipelineYaml.sh b/shell/getFirehosePipelineYaml.sh deleted file mode 100755 index 8f03b1e4b..000000000 --- a/shell/getFirehosePipelineYaml.sh +++ /dev/null @@ -1,138 +0,0 @@ -#!/bin/sh - -# Downloads a set of samples from Firehose using the Firehose Test Harness and awk to generate a YAML file. 
- -ENTITY_SET_ID=$1 -ENTITY_SET_TYPE=Sample_Set -ENTITY_TYPE=Sample - -if [ "$ENTITY_SET_ID" == "" ]; then - echo "Usage: $0 " >&2 - exit 1 -fi - -# Firehose variables - -FIREHOSE_SOURCE_HOME=/humgen/gsa-firehose/firehose/source -CGA_HOME=$FIREHOSE_SOURCE_HOME/CancerGenomeAnalysis -FIREHOSE_TEST_HARNESS="python $CGA_HOME/analysis_pipeline/scripts/firehose_test_harness.py" -FIREHOSE_HOST=firehose -FIREHOSE_PORT=8080 -FIREHOSE_DOMAIN=gsa -FIREHOSE_WORKSPACE=trunk - -# YAML file to write - -PIPELINE_YAML_FILE=$ENTITY_SET_ID.yaml - -# Annotations to pull down from Firehose - -FIREHOSE_ANNOTATIONS=(reference_file interval_list \ - sample_id recalibrated_bam_file squid_project collaborator_id) - -# YAML templates - -# Project YAML template, once per file. -PROJECT_YAML_TEMPLATE='"\n\ - project: {\n\ - name: '"$ENTITY_SET_ID"',\n\ - referenceFile: %s,\n\ - genotypeDbsnpFile: %s,\n\ - evalDbsnpFile: %s,\n\ - refseqTable: %s,\n\ - intervalList: %s\n\ - },", $1, genotypeDbsnp, evalDbsnp, refseq, $2' - -# Project YAML template, once per sample. -SAMPLE_YAML_TEMPLATE='"\n\ - {\n\ - id: %s,\n\ - bamFiles: { cleaned: %s },\n\ - tags: {\n\ - SQUIDProject: %s,\n\ - CollaboratorID: %s\n\ - }\n\ - }", $3, $4, $5, $6' - -TEST_AWK_COUNT=`echo '\n' | awk '{print $0}' | wc -c` -if [ "$TEST_AWK_COUNT" -eq 2 ]; then - # Strip the extra \n from the lines if awk of \n is - # a newline and not the two characters slash-n (on mac) - PROJECT_YAML_TEMPLATE="${PROJECT_YAML_TEMPLATE//\\\n/}" - SAMPLE_YAML_TEMPLATE="${SAMPLE_YAML_TEMPLATE//\\\n/}" -fi - -index=0 -count=${#FIREHOSE_ANNOTATIONS[@]} -FIREHOSE_VARIABLES="" -TAB=' ' - -# Build the tab separated list of firehose arguments - -while [ "$index" -lt "$count" ]; do - if [ "$FIREHOSE_VARIABLES" != "" ]; then - FIREHOSE_VARIABLES=$FIREHOSE_VARIABLES$TAB - fi - FIREHOSE_VARIABLES=$FIREHOSE_VARIABLES'${'${FIREHOSE_ANNOTATIONS[$index]}'}' - let "index = $index + 1" -done - -# Retrieve all the required variables and run the pipeline in Queue. 
-$FIREHOSE_TEST_HARNESS \ - -d $FIREHOSE_DOMAIN -w $FIREHOSE_WORKSPACE \ - -t $ENTITY_TYPE -f $ENTITY_SET_ID -y $ENTITY_SET_TYPE \ - "echo '$FIREHOSE_VARIABLES'" && \ -\ -# Generate yaml from firehose output -. firehose-populated-commands.sh | awk ' -BEGIN { - refseq_dir = "/humgen/gsa-hpprojects/GATK/data/Annotations/refseq/"; - dbsnp_dir = "/humgen/gsa-hpprojects/GATK/data/"; - - # add hg18 specific files to awk associative arrays - genotypeDbsnps["Homo_sapiens_assembly18.fasta"] = dbsnp_dir "dbsnp_129_hg18.rod"; - evalDbsnps["Homo_sapiens_assembly18.fasta"] = dbsnp_dir "dbsnp_129_hg18.rod"; - refseqs["Homo_sapiens_assembly18.fasta"] = refseq_dir "refGene-big-table-hg18.txt"; - - # add hg19 specific files to awk associative arrays - genotypeDbsnps["Homo_sapiens_assembly19.fasta"] = dbsnp_dir "dbsnp_132_b37.vcf"; - evalDbsnps["Homo_sapiens_assembly19.fasta"] = dbsnp_dir "dbsnp_129_b37.rod"; - refseqs["Homo_sapiens_assembly19.fasta"] = refseq_dir "refGene-big-table-hg19.txt"; - - printf "{" -} -{ - if (NR == 1) { - # Based on the reference of the first sample, specify the dbsnps and refseq tables. - - reference_part_count = split($1, reference_parts, "/") - reference_name = reference_parts[reference_part_count]; - - genotypeDbsnp = genotypeDbsnps[reference_name]; - evalDbsnp = evalDbsnps[reference_name]; - refseq = refseqs[reference_name]; - - printf '"$PROJECT_YAML_TEMPLATE"' - printf "\n samples: [" - } else { - printf "," - } - printf '"$SAMPLE_YAML_TEMPLATE"' -} -END { - if (NR > 0) - printf "\n ]" - print "\n}" -}' > $PIPELINE_YAML_FILE - -#hg19=`grep "assembly19" -c $PIPELINE_YAML_FILE` - -# NOTE: DBSNP's are populated via AWK's BEGIN block above. -#if [ "$hg19" -ne 0 ]; then -# sed 's/\/humgen.*rod/\/humgen\/gsa-hpprojects\/GATK\/data\/dbsnp_132_b37.vcf/' $PIPELINE_YAML_FILE > yaml2 -# mv yaml2 $PIPELINE_YAML_FILE -#fi - -# NOTE: Renamed "recalibrated" to "cleaned" in SAMPLE_YAML_TEMPLATE above. 
-#sed 's/recalibrat/clean/' $PIPELINE_YAML_FILE > yaml2 -#mv yaml2 $PIPELINE_YAML_FILE