From 24ef2be02d0bb35a207e5e80855ff8a51055a4eb Mon Sep 17 00:00:00 2001 From: kshakir Date: Fri, 25 Feb 2011 00:23:05 +0000 Subject: [PATCH] Updated firehose pulldown shell scripts: - a LOT more error reporting to stderr and exit codes - split the firehose pull down into TSV generators and a TSV to YAML converter - YAML converter is compatible with the TSVs generated by the front end website and will grab only the appropriate columns - deprecated getFirehosePipelineYaml.sh mode with a single Sample_Set name which uses the Firehose test harness - new getFirehosePipelineYaml.sh mode using web services API and requires an additional parameter, a password config file with "-u :" which has been tested on problematic Sample_Sets git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5313 348d0f76-0448-11de-a6fe-93d51630548a --- shell/firehose/getFirehoseCurlTsv.sh | 61 +++++++++ shell/firehose/getFirehosePipelineYaml.sh | 10 ++ shell/firehose/getFirehoseTestTsv.sh | 68 ++++++++++ shell/firehose/pipelineTsvToYaml.sh | 156 ++++++++++++++++++++++ shell/getFirehosePipelineYaml.sh | 138 ------------------- 5 files changed, 295 insertions(+), 138 deletions(-) create mode 100755 shell/firehose/getFirehoseCurlTsv.sh create mode 100755 shell/firehose/getFirehosePipelineYaml.sh create mode 100755 shell/firehose/getFirehoseTestTsv.sh create mode 100755 shell/firehose/pipelineTsvToYaml.sh delete mode 100755 shell/getFirehosePipelineYaml.sh diff --git a/shell/firehose/getFirehoseCurlTsv.sh b/shell/firehose/getFirehoseCurlTsv.sh new file mode 100755 index 000000000..bd0acec26 --- /dev/null +++ b/shell/firehose/getFirehoseCurlTsv.sh @@ -0,0 +1,61 @@ +#!/bin/sh + +# Downloads a set of samples from Firehose using the Firehose API and generates a TSV file with the outputs. 
+# see: http://iwww.broadinstitute.org/cancer/cga/wiki/index.php/GetAnnotations + +ENTITY_SET_ID=$1 +ENTITY_SET_TYPE=Sample_Set +ENTITY_TYPE=Sample +PASSWORD_FILE=$2 + +if [ "$ENTITY_SET_ID" == "" ]; then + EXIT_USAGE=1 +fi + +if [ "$PASSWORD_FILE" == "" ]; then + echo 'Missing password file with the contents: "-u :"' >&2 + EXIT_USAGE=1 +fi + +if [ $EXIT_USAGE ]; then + echo "Usage: $0 " >&2 + exit 1 +fi + +# Firehose variables + +FIREHOSE_HOST=firehose +FIREHOSE_PORT=8080 +FIREHOSE_DOMAIN=gsa +FIREHOSE_WORKSPACE=trunk + +# TSV file to write + +PIPELINE_TSV_FILE=$ENTITY_SET_ID.tsv + +# Annotations to pull down from Firehose + +FIREHOSE_ANNOTATIONS=(reference_file interval_list recalibrated_bam_file squid_project collaborator_id) + +index=0 +count=${#FIREHOSE_ANNOTATIONS[@]} +FIREHOSE_VARIABLES="" + +# Build the tab separated list of firehose arguments + +while [ "$index" -lt "$count" ]; do + FIREHOSE_VARIABLES=$FIREHOSE_VARIABLES'&annotationTypes='${FIREHOSE_ANNOTATIONS[$index]} + let "index = $index + 1" +done + +curl --fail -sL -K "$PASSWORD_FILE" -o "$PIPELINE_TSV_FILE" \ + "http://$FIREHOSE_HOST:$FIREHOSE_PORT/$FIREHOSE_DOMAIN/ws/entity/getAnnotations/$ENTITY_TYPE?entityNames=$ENTITY_SET_ID&filterSetType=$ENTITY_SET_TYPE&workspaceName=$FIREHOSE_WORKSPACE$FIREHOSE_VARIABLES" + +EXIT_CODE=$? + +if [[ $EXIT_CODE -ne 0 ]]; then + echo "curl failed with exit code:" $EXIT_CODE >&2 + echo 'Check the name of your Sample_Set and that your password file '$PASSWORD_FILE' is setup correctly with: "-u :"' >&2 + echo "If that doesn't work make sure you can login to the firehose website: http://$FIREHOSE_HOST:$FIREHOSE_PORT/$FIREHOSE_DOMAIN" >&2 + exit $EXIT_CODE +fi diff --git a/shell/firehose/getFirehosePipelineYaml.sh b/shell/firehose/getFirehosePipelineYaml.sh new file mode 100755 index 000000000..604affcd3 --- /dev/null +++ b/shell/firehose/getFirehosePipelineYaml.sh @@ -0,0 +1,10 @@ +#!/bin/sh + +# Downloads a set of samples from Firehose and generates a YAML file. 
+ +DIR=`dirname $0` +if [ "$2" == "" ]; then + $DIR/getFirehoseTestTsv.sh $1 && $DIR/pipelineTsvToYaml.sh $1.tsv +else + $DIR/getFirehoseCurlTsv.sh $1 $2 && $DIR/pipelineTsvToYaml.sh $1.tsv +fi diff --git a/shell/firehose/getFirehoseTestTsv.sh b/shell/firehose/getFirehoseTestTsv.sh new file mode 100755 index 000000000..1fdb4335a --- /dev/null +++ b/shell/firehose/getFirehoseTestTsv.sh @@ -0,0 +1,68 @@ +#!/bin/sh + +# Downloads a set of samples from Firehose using the obsolete Firehose Test Harness and generate a TSV file with the outputs. + +ENTITY_SET_ID=$1 +ENTITY_SET_TYPE=Sample_Set +ENTITY_TYPE=Sample + +if [ "$ENTITY_SET_ID" == "" ]; then + echo "Usage: $0 " >&2 + exit 1 +fi + +# Firehose variables + +FIREHOSE_SOURCE_HOME=/humgen/gsa-firehose/firehose/source +CGA_HOME=$FIREHOSE_SOURCE_HOME/CancerGenomeAnalysis +FIREHOSE_TEST_HARNESS="python $CGA_HOME/analysis_pipeline/scripts/firehose_test_harness.py" +FIREHOSE_HOST=firehose +FIREHOSE_PORT=8080 +FIREHOSE_DOMAIN=gsa +FIREHOSE_WORKSPACE=trunk + +# TSV file to write + +PIPELINE_TSV_FILE=$ENTITY_SET_ID.tsv + +# Annotations to pull down from Firehose + +FIREHOSE_ANNOTATIONS=(reference_file interval_list sample_id recalibrated_bam_file squid_project collaborator_id) + +index=0 +count=${#FIREHOSE_ANNOTATIONS[@]} +TSV_HEADER="" +FIREHOSE_VARIABLES="" +TAB=' ' + +# Build the tab separated list of firehose arguments + +while [ "$index" -lt "$count" ]; do + if [ "$FIREHOSE_VARIABLES" != "" ]; then + FIREHOSE_VARIABLES=$FIREHOSE_VARIABLES$TAB + TSV_HEADER=$TSV_HEADER$TAB + fi + FIREHOSE_VARIABLES=$FIREHOSE_VARIABLES'${'${FIREHOSE_ANNOTATIONS[$index]}'}' + TSV_HEADER=$TSV_HEADER${FIREHOSE_ANNOTATIONS[$index]} + let "index = $index + 1" +done + +# Retrieve all the required variables via the test harness. 
+$FIREHOSE_TEST_HARNESS \ + -d $FIREHOSE_DOMAIN -w $FIREHOSE_WORKSPACE \ + -t $ENTITY_TYPE -f $ENTITY_SET_ID -y $ENTITY_SET_TYPE \ + "echo '$FIREHOSE_VARIABLES'" && \ +\ +# Generate tsv header +echo "$TSV_HEADER" > $PIPELINE_TSV_FILE && \ +# Generate tsv from firehose output +. firehose-populated-commands.sh >> $PIPELINE_TSV_FILE + +EXIT_CODE=$? + +if [[ $EXIT_CODE -ne 0 ]]; then + echo "" >&2 + echo "The Firehose test harness failed with exit code:" $EXIT_CODE >&2 + echo 'Check the name of your Sample_Set or try using the newer getFirehoseCurlTsv.sh' >&2 + exit $EXIT_CODE +fi diff --git a/shell/firehose/pipelineTsvToYaml.sh b/shell/firehose/pipelineTsvToYaml.sh new file mode 100755 index 000000000..8582ec822 --- /dev/null +++ b/shell/firehose/pipelineTsvToYaml.sh @@ -0,0 +1,156 @@ +#!/bin/sh + +# Uses awk to generate a YAML file from a TSV. + +# In the awk script and templates: +# - Variables starting with a '$' are columns in the TSV +# - Variables without a '$' are pre-calculated from the first row of data + + +# TSV file to read +PIPELINE_TSV_FILE=$1 + +if [ "$PIPELINE_TSV_FILE" == "" ]; then + echo "Usage: $0 .tsv" >&2 + exit 1 +fi + +ROW_COUNT=(`wc -l $PIPELINE_TSV_FILE`) +if [[ ${ROW_COUNT[0]} -lt 2 ]]; then + echo "Header plus data not found in tsv: $PIPELINE_TSV_FILE" >&2 + exit 1 +fi + +# YAML file to write +PIPELINE_YAML_FILE=${PIPELINE_TSV_FILE%.tsv}.yaml + +# YAML templates + +# Project YAML template, once per file. +PROJECT_YAML_TEMPLATE='"\n\ + project: {\n\ + name: %s,\n\ + referenceFile: %s,\n\ + genotypeDbsnpFile: %s,\n\ + evalDbsnpFile: %s,\n\ + refseqTable: %s,\n\ + intervalList: %s\n\ + },", projectName, $referenceFile, genotypeDbsnp, evalDbsnp, refseq, $intervalList' + +# Sample YAML template, once per sample. 
+SAMPLE_YAML_TEMPLATE='"\n\ + {\n\ + id: %s,\n\ + bamFiles: { cleaned: %s },\n\ + tags: {\n\ + SQUIDProject: %s,\n\ + CollaboratorID: %s\n\ + }\n\ + }", $sampleId, $bamFile, $squidProject, $collaboratorId' + +TEST_AWK_COUNT=`echo '\n' | awk '{print $0}' | wc -c` +if [ "$TEST_AWK_COUNT" -eq 2 ]; then + # Strip the extra \n from the lines if awk of \n is + # a newline and not the two characters slash-n (on mac) + PROJECT_YAML_TEMPLATE="${PROJECT_YAML_TEMPLATE//\\\n/}" + SAMPLE_YAML_TEMPLATE="${SAMPLE_YAML_TEMPLATE//\\\n/}" +fi + +# Generate yaml from tsv +awk ' +{ + if (NR == 1) { + tsvFile = "'$PIPELINE_TSV_FILE'" + + # Set the project name to the TSV file minus the directory and the .tsv + projectName = tsvFile + sub(/\/.*\//, "", projectName) + sub(/\.tsv/, "", projectName) + + # Read the column headers and figure out the index of each column name. + for (i=1; i<=NF; i++) + columnFields[tolower($i)] = i + + referenceFile = columnFields["reference_file"] + intervalList = columnFields["interval_list"] + sampleId = columnFields["sample_id"] + squidProject = columnFields["squid_project"] + collaboratorId = columnFields["collaborator_id"] + + for (key in columnFields) + if (key ~ "bam_file") + bamFile = columnFields[key] + + if (referenceFile == "") { + print "Column header reference_file missing from " tsvFile > "/dev/stderr" + exitWithError = 1 + } + + if (intervalList == "") { + print "Column header interval_list missing from " tsvFile > "/dev/stderr" + exitWithError = 1 + } + + if (sampleId == "") { + print "Column header sample_id missing from " tsvFile > "/dev/stderr" + exitWithError = 1 + } + + if (squidProject == "") { + print "Column header squid_project missing from " tsvFile > "/dev/stderr" + exitWithError = 1 + } + + if (collaboratorId == "") { + print "Column header collaborator_id missing from " tsvFile > "/dev/stderr" + exitWithError = 1 + } + + if (bamFile == "") { + print "Column header *bam_file* missing from " tsvFile > "/dev/stderr" + exitWithError 
= 1 + } + + if (exitWithError) { + exit 1 + } + + + refseqDir = "/humgen/gsa-hpprojects/GATK/data/Annotations/refseq/" + dbsnpDir = "/humgen/gsa-hpprojects/GATK/data/" + + # add hg18 specific files to awk associative arrays + genotypeDbsnps["Homo_sapiens_assembly18.fasta"] = dbsnpDir "dbsnp_129_hg18.rod" + evalDbsnps["Homo_sapiens_assembly18.fasta"] = dbsnpDir "dbsnp_129_hg18.rod" + refseqs["Homo_sapiens_assembly18.fasta"] = refseqDir "refGene-big-table-hg18.txt" + + # add hg19 specific files to awk associative arrays + genotypeDbsnps["Homo_sapiens_assembly19.fasta"] = dbsnpDir "dbsnp_132_b37.vcf" + evalDbsnps["Homo_sapiens_assembly19.fasta"] = dbsnpDir "dbsnp_129_b37.rod" + refseqs["Homo_sapiens_assembly19.fasta"] = refseqDir "refGene-big-table-hg19.txt" + + printf "{" + } else { + if (NR == 2) { + # Based on the reference of the first sample, specify the dbsnps and refseq tables. + + referencePartCount = split($referenceFile, referenceParts, "/") + referenceName = referenceParts[referencePartCount] + + genotypeDbsnp = genotypeDbsnps[referenceName] + evalDbsnp = evalDbsnps[referenceName] + refseq = refseqs[referenceName] + + printf '"$PROJECT_YAML_TEMPLATE"' + printf "\n samples: [" + } else { + printf "," + } + printf '"$SAMPLE_YAML_TEMPLATE"' + } +} +END { + if (NR > 0) + printf "\n ]" + print "\n}" +}' "$PIPELINE_TSV_FILE" > "$PIPELINE_YAML_FILE" diff --git a/shell/getFirehosePipelineYaml.sh b/shell/getFirehosePipelineYaml.sh deleted file mode 100755 index 8f03b1e4b..000000000 --- a/shell/getFirehosePipelineYaml.sh +++ /dev/null @@ -1,138 +0,0 @@ -#!/bin/sh - -# Downloads a set of samples from Firehose using the Firehose Test Harness and awk to generate a YAML file. 
- -ENTITY_SET_ID=$1 -ENTITY_SET_TYPE=Sample_Set -ENTITY_TYPE=Sample - -if [ "$ENTITY_SET_ID" == "" ]; then - echo "Usage: $0 " >&2 - exit 1 -fi - -# Firehose variables - -FIREHOSE_SOURCE_HOME=/humgen/gsa-firehose/firehose/source -CGA_HOME=$FIREHOSE_SOURCE_HOME/CancerGenomeAnalysis -FIREHOSE_TEST_HARNESS="python $CGA_HOME/analysis_pipeline/scripts/firehose_test_harness.py" -FIREHOSE_HOST=firehose -FIREHOSE_PORT=8080 -FIREHOSE_DOMAIN=gsa -FIREHOSE_WORKSPACE=trunk - -# YAML file to write - -PIPELINE_YAML_FILE=$ENTITY_SET_ID.yaml - -# Annotations to pull down from Firehose - -FIREHOSE_ANNOTATIONS=(reference_file interval_list \ - sample_id recalibrated_bam_file squid_project collaborator_id) - -# YAML templates - -# Project YAML template, once per file. -PROJECT_YAML_TEMPLATE='"\n\ - project: {\n\ - name: '"$ENTITY_SET_ID"',\n\ - referenceFile: %s,\n\ - genotypeDbsnpFile: %s,\n\ - evalDbsnpFile: %s,\n\ - refseqTable: %s,\n\ - intervalList: %s\n\ - },", $1, genotypeDbsnp, evalDbsnp, refseq, $2' - -# Project YAML template, once per sample. -SAMPLE_YAML_TEMPLATE='"\n\ - {\n\ - id: %s,\n\ - bamFiles: { cleaned: %s },\n\ - tags: {\n\ - SQUIDProject: %s,\n\ - CollaboratorID: %s\n\ - }\n\ - }", $3, $4, $5, $6' - -TEST_AWK_COUNT=`echo '\n' | awk '{print $0}' | wc -c` -if [ "$TEST_AWK_COUNT" -eq 2 ]; then - # Strip the extra \n from the lines if awk of \n is - # a newline and not the two characters slash-n (on mac) - PROJECT_YAML_TEMPLATE="${PROJECT_YAML_TEMPLATE//\\\n/}" - SAMPLE_YAML_TEMPLATE="${SAMPLE_YAML_TEMPLATE//\\\n/}" -fi - -index=0 -count=${#FIREHOSE_ANNOTATIONS[@]} -FIREHOSE_VARIABLES="" -TAB=' ' - -# Build the tab separated list of firehose arguments - -while [ "$index" -lt "$count" ]; do - if [ "$FIREHOSE_VARIABLES" != "" ]; then - FIREHOSE_VARIABLES=$FIREHOSE_VARIABLES$TAB - fi - FIREHOSE_VARIABLES=$FIREHOSE_VARIABLES'${'${FIREHOSE_ANNOTATIONS[$index]}'}' - let "index = $index + 1" -done - -# Retrieve all the required variables and run the pipeline in Queue. 
-$FIREHOSE_TEST_HARNESS \ - -d $FIREHOSE_DOMAIN -w $FIREHOSE_WORKSPACE \ - -t $ENTITY_TYPE -f $ENTITY_SET_ID -y $ENTITY_SET_TYPE \ - "echo '$FIREHOSE_VARIABLES'" && \ -\ -# Generate yaml from firehose output -. firehose-populated-commands.sh | awk ' -BEGIN { - refseq_dir = "/humgen/gsa-hpprojects/GATK/data/Annotations/refseq/"; - dbsnp_dir = "/humgen/gsa-hpprojects/GATK/data/"; - - # add hg18 specific files to awk associative arrays - genotypeDbsnps["Homo_sapiens_assembly18.fasta"] = dbsnp_dir "dbsnp_129_hg18.rod"; - evalDbsnps["Homo_sapiens_assembly18.fasta"] = dbsnp_dir "dbsnp_129_hg18.rod"; - refseqs["Homo_sapiens_assembly18.fasta"] = refseq_dir "refGene-big-table-hg18.txt"; - - # add hg19 specific files to awk associative arrays - genotypeDbsnps["Homo_sapiens_assembly19.fasta"] = dbsnp_dir "dbsnp_132_b37.vcf"; - evalDbsnps["Homo_sapiens_assembly19.fasta"] = dbsnp_dir "dbsnp_129_b37.rod"; - refseqs["Homo_sapiens_assembly19.fasta"] = refseq_dir "refGene-big-table-hg19.txt"; - - printf "{" -} -{ - if (NR == 1) { - # Based on the reference of the first sample, specify the dbsnps and refseq tables. - - reference_part_count = split($1, reference_parts, "/") - reference_name = reference_parts[reference_part_count]; - - genotypeDbsnp = genotypeDbsnps[reference_name]; - evalDbsnp = evalDbsnps[reference_name]; - refseq = refseqs[reference_name]; - - printf '"$PROJECT_YAML_TEMPLATE"' - printf "\n samples: [" - } else { - printf "," - } - printf '"$SAMPLE_YAML_TEMPLATE"' -} -END { - if (NR > 0) - printf "\n ]" - print "\n}" -}' > $PIPELINE_YAML_FILE - -#hg19=`grep "assembly19" -c $PIPELINE_YAML_FILE` - -# NOTE: DBSNP's are populated via AWK's BEGIN block above. -#if [ "$hg19" -ne 0 ]; then -# sed 's/\/humgen.*rod/\/humgen\/gsa-hpprojects\/GATK\/data\/dbsnp_132_b37.vcf/' $PIPELINE_YAML_FILE > yaml2 -# mv yaml2 $PIPELINE_YAML_FILE -#fi - -# NOTE: Renamed "recalibrated" to "cleaned" in SAMPLE_YAML_TEMPLATE above. 
-#sed 's/recalibrat/clean/' $PIPELINE_YAML_FILE > yaml2 -#mv yaml2 $PIPELINE_YAML_FILE