Updated firehose pulldown shell scripts:

- a LOT more error reporting to stderr and exit codes
- split the firehose pulldown into a TSV generator and a TSV to YAML converter
- YAML converter is compatible with the TSVs generated by the front end website and will grab only the appropriate columns
- deprecated getFirehosePipelineYaml.sh mode with a single Sample_Set name which uses the Firehose test harness
- new getFirehosePipelineYaml.sh mode using the web services API; it requires an additional parameter, a password config file containing "-u <user>:<pass>", and has been tested on problematic Sample_Sets



git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5313 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
kshakir 2011-02-25 00:23:05 +00:00
parent cba88a8861
commit 24ef2be02d
5 changed files with 295 additions and 138 deletions

View File

@ -0,0 +1,61 @@
#!/bin/bash
# Downloads a set of samples from Firehose using the Firehose API and generates
# a TSV file with the outputs.
# Usage: getFirehoseCurlTsv.sh <Sample_Set_Name> <Curl_Password_File>
#   <Curl_Password_File> is a curl config file containing: -u <user>:<pass>
# see: http://iwww.broadinstitute.org/cancer/cga/wiki/index.php/GetAnnotations
# NOTE: shebang is bash (not sh) because this script uses bash arrays.
ENTITY_SET_ID=$1
ENTITY_SET_TYPE=Sample_Set
ENTITY_TYPE=Sample
PASSWORD_FILE=$2
EXIT_USAGE=0
if [ "$ENTITY_SET_ID" == "" ]; then
  EXIT_USAGE=1
fi
if [ "$PASSWORD_FILE" == "" ]; then
  echo 'Missing password file with the contents: "-u <user>:<pass>"' >&2
  EXIT_USAGE=1
fi
if [ "$EXIT_USAGE" -ne 0 ]; then
  echo "Usage: $0 <Sample_Set_Name> <Curl_Password_File>" >&2
  exit 1
fi
# Firehose variables
FIREHOSE_HOST=firehose
FIREHOSE_PORT=8080
FIREHOSE_DOMAIN=gsa
FIREHOSE_WORKSPACE=trunk
# TSV file to write
PIPELINE_TSV_FILE=$ENTITY_SET_ID.tsv
# Annotations to pull down from Firehose
FIREHOSE_ANNOTATIONS=(reference_file interval_list recalibrated_bam_file squid_project collaborator_id)
# Build the '&annotationTypes=<name>' query string arguments, one per annotation.
FIREHOSE_VARIABLES=""
for annotation in "${FIREHOSE_ANNOTATIONS[@]}"; do
  FIREHOSE_VARIABLES="$FIREHOSE_VARIABLES&annotationTypes=$annotation"
done
# --fail makes curl return non-zero on HTTP errors; -K reads the credentials
# from the password file so they never appear on the command line.
EXIT_CODE=0
curl --fail -sL -K "$PASSWORD_FILE" -o "$PIPELINE_TSV_FILE" \
  "http://$FIREHOSE_HOST:$FIREHOSE_PORT/$FIREHOSE_DOMAIN/ws/entity/getAnnotations/$ENTITY_TYPE?entityNames=$ENTITY_SET_ID&filterSetType=$ENTITY_SET_TYPE&workspaceName=$FIREHOSE_WORKSPACE$FIREHOSE_VARIABLES" || \
  EXIT_CODE=$?
if [ "$EXIT_CODE" -ne 0 ]; then
  echo "curl failed with exit code:" $EXIT_CODE >&2
  echo 'Check the name of your Sample_Set and that your password file '$PASSWORD_FILE' is setup correctly with: "-u <user>:<pass>"' >&2
  echo "If that doesn't work make sure you can login to the firehose website: http://$FIREHOSE_HOST:$FIREHOSE_PORT/$FIREHOSE_DOMAIN" >&2
  exit $EXIT_CODE
fi

View File

@ -0,0 +1,10 @@
#!/bin/bash
# Downloads a set of samples from Firehose and generates a YAML file.
# Usage: getFirehosePipelineYaml.sh <Sample_Set_Name> [<Curl_Password_File>]
#   With one argument, uses the deprecated Firehose test harness mode.
#   With a password file, uses the newer web services API via curl.
DIR=$(dirname "$0")
if [ "$1" == "" ]; then
  echo "Usage: $0 <Sample_Set_Name> [<Curl_Password_File>]" >&2
  exit 1
fi
if [ "$2" == "" ]; then
  # Deprecated mode: pull the TSV via the Firehose test harness.
  "$DIR/getFirehoseTestTsv.sh" "$1" && "$DIR/pipelineTsvToYaml.sh" "$1.tsv"
else
  # Preferred mode: pull the TSV via the web services API with curl.
  "$DIR/getFirehoseCurlTsv.sh" "$1" "$2" && "$DIR/pipelineTsvToYaml.sh" "$1.tsv"
fi

View File

@ -0,0 +1,68 @@
#!/bin/bash
# Downloads a set of samples from Firehose using the obsolete Firehose Test
# Harness and generates a TSV file with the outputs.
# Usage: getFirehoseTestTsv.sh <Sample_Set_Name>
# NOTE: deprecated in favor of getFirehoseCurlTsv.sh (web services API).
# NOTE: shebang is bash (not sh) because this script uses bash arrays.
ENTITY_SET_ID=$1
ENTITY_SET_TYPE=Sample_Set
ENTITY_TYPE=Sample
if [ "$ENTITY_SET_ID" == "" ]; then
  echo "Usage: $0 <Sample_Set_Name>" >&2
  exit 1
fi
# Firehose variables
FIREHOSE_SOURCE_HOME=/humgen/gsa-firehose/firehose/source
CGA_HOME=$FIREHOSE_SOURCE_HOME/CancerGenomeAnalysis
FIREHOSE_TEST_HARNESS="python $CGA_HOME/analysis_pipeline/scripts/firehose_test_harness.py"
FIREHOSE_HOST=firehose
FIREHOSE_PORT=8080
FIREHOSE_DOMAIN=gsa
FIREHOSE_WORKSPACE=trunk
# TSV file to write
PIPELINE_TSV_FILE=$ENTITY_SET_ID.tsv
# Annotations to pull down from Firehose
FIREHOSE_ANNOTATIONS=(reference_file interval_list sample_id recalibrated_bam_file squid_project collaborator_id)
TSV_HEADER=""
FIREHOSE_VARIABLES=""
TAB=$'\t'
# Build the tab separated list of '${annotation}' placeholders for the test
# harness, and the matching tab separated TSV header row.
for annotation in "${FIREHOSE_ANNOTATIONS[@]}"; do
  if [ "$FIREHOSE_VARIABLES" != "" ]; then
    FIREHOSE_VARIABLES=$FIREHOSE_VARIABLES$TAB
    TSV_HEADER=$TSV_HEADER$TAB
  fi
  FIREHOSE_VARIABLES=$FIREHOSE_VARIABLES'${'$annotation'}'
  TSV_HEADER=$TSV_HEADER$annotation
done
# Retrieve all the required variables via the test harness. The harness writes
# firehose-populated-commands.sh, which echoes the annotation values per sample.
$FIREHOSE_TEST_HARNESS \
  -d $FIREHOSE_DOMAIN -w $FIREHOSE_WORKSPACE \
  -t $ENTITY_TYPE -f $ENTITY_SET_ID -y $ENTITY_SET_TYPE \
  "echo '$FIREHOSE_VARIABLES'"
EXIT_CODE=$?
# Only generate the TSV if the harness succeeded; previously the generated
# commands were sourced unconditionally, producing a bogus TSV on failure.
if [ "$EXIT_CODE" -eq 0 ]; then
  # Generate tsv header, then the tsv rows from the firehose output
  echo "$TSV_HEADER" > "$PIPELINE_TSV_FILE" && \
    . firehose-populated-commands.sh >> "$PIPELINE_TSV_FILE"
  EXIT_CODE=$?
fi
if [ "$EXIT_CODE" -ne 0 ]; then
  echo "" >&2
  echo "The Firehose test harness failed with exit code:" $EXIT_CODE >&2
  echo 'Check the name of your Sample_Set or try using the newer getFirehoseCurlTsv.sh' >&2
  exit $EXIT_CODE
fi

View File

@ -0,0 +1,156 @@
#!/bin/bash
# Uses awk to generate a YAML file from a TSV.
# Usage: pipelineTsvToYaml.sh <Set_Name>.tsv
# In the awk script and templates:
# - Variables starting with a '$' are columns in the TSV
# - Variables without a '$' are pre-calculated from the first row of data
# The TSV may come from getFirehoseCurlTsv.sh, getFirehoseTestTsv.sh, or the
# front end website; columns are located by header name, not position.
# TSV file to read
PIPELINE_TSV_FILE=$1
if [ "$PIPELINE_TSV_FILE" == "" ]; then
  echo "Usage: $0 <Set_Name>.tsv" >&2
  exit 1
fi
if [ ! -f "$PIPELINE_TSV_FILE" ]; then
  echo "TSV file not found: $PIPELINE_TSV_FILE" >&2
  exit 1
fi
# Require at least a header row plus one data row.
ROW_COUNT=($(wc -l "$PIPELINE_TSV_FILE"))
if [ "${ROW_COUNT[0]}" -lt 2 ]; then
  echo "Header plus data not found in tsv: $PIPELINE_TSV_FILE" >&2
  exit 1
fi
# YAML file to write
PIPELINE_YAML_FILE=${PIPELINE_TSV_FILE%.tsv}.yaml
# YAML templates
# Project YAML template, once per file.
PROJECT_YAML_TEMPLATE='"\n\
  project: {\n\
    name: %s,\n\
    referenceFile: %s,\n\
    genotypeDbsnpFile: %s,\n\
    evalDbsnpFile: %s,\n\
    refseqTable: %s,\n\
    intervalList: %s\n\
  },", projectName, $referenceFile, genotypeDbsnp, evalDbsnp, refseq, $intervalList'
# Sample YAML template, once per sample.
SAMPLE_YAML_TEMPLATE='"\n\
    {\n\
      id: %s,\n\
      bamFiles: { cleaned: %s },\n\
      tags: {\n\
        SQUIDProject: %s,\n\
        CollaboratorID: %s\n\
      }\n\
    }", $sampleId, $bamFile, $squidProject, $collaboratorId'
# Portability probe: on some platforms (e.g. mac) awk turns '\n' in the input
# into a real newline. If so, strip the escaped newlines from the templates so
# the printf format strings stay single-line.
TEST_AWK_COUNT=$(echo '\n' | awk '{print $0}' | wc -c)
if [ "$TEST_AWK_COUNT" -eq 2 ]; then
  PROJECT_YAML_TEMPLATE="${PROJECT_YAML_TEMPLATE//\\\n/}"
  SAMPLE_YAML_TEMPLATE="${SAMPLE_YAML_TEMPLATE//\\\n/}"
fi
# Generate yaml from tsv
awk '
{
  if (NR == 1) {
    tsvFile = "'"$PIPELINE_TSV_FILE"'"
    # Set the project name to the TSV file minus the directory and the .tsv
    projectName = tsvFile
    sub(/\/.*\//, "", projectName)
    sub(/\.tsv/, "", projectName)
    # Read the column headers and figure out the index of each column name.
    for (i=1; i<=NF; i++)
      columnFields[tolower($i)] = i
    referenceFile = columnFields["reference_file"]
    intervalList = columnFields["interval_list"]
    sampleId = columnFields["sample_id"]
    squidProject = columnFields["squid_project"]
    collaboratorId = columnFields["collaborator_id"]
    # The bam column name varies between TSV sources, so match any *bam_file*.
    for (key in columnFields)
      if (key ~ "bam_file")
        bamFile = columnFields[key]
    if (referenceFile == "") {
      print "Column header reference_file missing from " tsvFile > "/dev/stderr"
      exitWithError = 1
    }
    if (intervalList == "") {
      print "Column header interval_list missing from " tsvFile > "/dev/stderr"
      exitWithError = 1
    }
    if (sampleId == "") {
      print "Column header sample_id missing from " tsvFile > "/dev/stderr"
      exitWithError = 1
    }
    if (squidProject == "") {
      print "Column header squid_project missing from " tsvFile > "/dev/stderr"
      exitWithError = 1
    }
    if (collaboratorId == "") {
      print "Column header collaborator_id missing from " tsvFile > "/dev/stderr"
      exitWithError = 1
    }
    if (bamFile == "") {
      print "Column header *bam_file* missing from " tsvFile > "/dev/stderr"
      exitWithError = 1
    }
    if (exitWithError) {
      exit 1
    }
    refseqDir = "/humgen/gsa-hpprojects/GATK/data/Annotations/refseq/"
    dbsnpDir = "/humgen/gsa-hpprojects/GATK/data/"
    # add hg18 specific files to awk associative arrays
    genotypeDbsnps["Homo_sapiens_assembly18.fasta"] = dbsnpDir "dbsnp_129_hg18.rod"
    evalDbsnps["Homo_sapiens_assembly18.fasta"] = dbsnpDir "dbsnp_129_hg18.rod"
    refseqs["Homo_sapiens_assembly18.fasta"] = refseqDir "refGene-big-table-hg18.txt"
    # add hg19 specific files to awk associative arrays
    genotypeDbsnps["Homo_sapiens_assembly19.fasta"] = dbsnpDir "dbsnp_132_b37.vcf"
    evalDbsnps["Homo_sapiens_assembly19.fasta"] = dbsnpDir "dbsnp_129_b37.rod"
    refseqs["Homo_sapiens_assembly19.fasta"] = refseqDir "refGene-big-table-hg19.txt"
    printf "{"
  } else {
    if (NR == 2) {
      # Based on the reference of the first sample, specify the dbsnps and refseq tables.
      referencePartCount = split($referenceFile, referenceParts, "/")
      referenceName = referenceParts[referencePartCount]
      genotypeDbsnp = genotypeDbsnps[referenceName]
      evalDbsnp = evalDbsnps[referenceName]
      refseq = refseqs[referenceName]
      printf '"$PROJECT_YAML_TEMPLATE"'
      printf "\n  samples: ["
    } else {
      printf ","
    }
    printf '"$SAMPLE_YAML_TEMPLATE"'
  }
}
END {
  if (NR > 0)
    printf "\n  ]"
  print "\n}"
}' "$PIPELINE_TSV_FILE" > "$PIPELINE_YAML_FILE"
# Propagate awk errors (e.g. missing column headers); previously a failed
# conversion still exited 0 and left a corrupt YAML file behind.
EXIT_CODE=$?
if [ "$EXIT_CODE" -ne 0 ]; then
  echo "Failed converting $PIPELINE_TSV_FILE to $PIPELINE_YAML_FILE" >&2
  rm -f "$PIPELINE_YAML_FILE"
  exit $EXIT_CODE
fi

View File

@ -1,138 +0,0 @@
#!/bin/sh
# Downloads a set of samples from Firehose using the Firehose Test Harness and awk to generate a YAML file.
# NOTE(review): this monolithic script is removed in this commit, replaced by
# getFirehoseTestTsv.sh / getFirehoseCurlTsv.sh (TSV generation) plus
# pipelineTsvToYaml.sh (TSV to YAML conversion). Kept here for reference only.
# Usage: getFirehosePipelineYaml.sh <Sample_Set_Name>
ENTITY_SET_ID=$1
ENTITY_SET_TYPE=Sample_Set
ENTITY_TYPE=Sample
if [ "$ENTITY_SET_ID" == "" ]; then
echo "Usage: $0 <Sample_Set_Name>" >&2
exit 1
fi
# Firehose variables
FIREHOSE_SOURCE_HOME=/humgen/gsa-firehose/firehose/source
CGA_HOME=$FIREHOSE_SOURCE_HOME/CancerGenomeAnalysis
FIREHOSE_TEST_HARNESS="python $CGA_HOME/analysis_pipeline/scripts/firehose_test_harness.py"
FIREHOSE_HOST=firehose
FIREHOSE_PORT=8080
FIREHOSE_DOMAIN=gsa
FIREHOSE_WORKSPACE=trunk
# YAML file to write
PIPELINE_YAML_FILE=$ENTITY_SET_ID.yaml
# Annotations to pull down from Firehose
FIREHOSE_ANNOTATIONS=(reference_file interval_list \
sample_id recalibrated_bam_file squid_project collaborator_id)
# YAML templates
# Project YAML template, once per file. Unlike the replacement scripts, the
# awk printf arguments here are fixed column positions ($1..$6), not indices
# looked up by header name.
PROJECT_YAML_TEMPLATE='"\n\
project: {\n\
name: '"$ENTITY_SET_ID"',\n\
referenceFile: %s,\n\
genotypeDbsnpFile: %s,\n\
evalDbsnpFile: %s,\n\
refseqTable: %s,\n\
intervalList: %s\n\
},", $1, genotypeDbsnp, evalDbsnp, refseq, $2'
# Project YAML template, once per sample.
SAMPLE_YAML_TEMPLATE='"\n\
{\n\
id: %s,\n\
bamFiles: { cleaned: %s },\n\
tags: {\n\
SQUIDProject: %s,\n\
CollaboratorID: %s\n\
}\n\
}", $3, $4, $5, $6'
# Probe whether this platform's awk converts '\n' input to a real newline.
TEST_AWK_COUNT=`echo '\n' | awk '{print $0}' | wc -c`
if [ "$TEST_AWK_COUNT" -eq 2 ]; then
# Strip the extra \n from the lines if awk of \n is
# a newline and not the two characters slash-n (on mac)
PROJECT_YAML_TEMPLATE="${PROJECT_YAML_TEMPLATE//\\\n/}"
SAMPLE_YAML_TEMPLATE="${SAMPLE_YAML_TEMPLATE//\\\n/}"
fi
index=0
count=${#FIREHOSE_ANNOTATIONS[@]}
FIREHOSE_VARIABLES=""
TAB='	'
# Build the tab separated list of firehose arguments
while [ "$index" -lt "$count" ]; do
if [ "$FIREHOSE_VARIABLES" != "" ]; then
FIREHOSE_VARIABLES=$FIREHOSE_VARIABLES$TAB
fi
FIREHOSE_VARIABLES=$FIREHOSE_VARIABLES'${'${FIREHOSE_ANNOTATIONS[$index]}'}'
let "index = $index + 1"
done
# Retrieve all the required variables and run the pipeline in Queue.
$FIREHOSE_TEST_HARNESS \
-d $FIREHOSE_DOMAIN -w $FIREHOSE_WORKSPACE \
-t $ENTITY_TYPE -f $ENTITY_SET_ID -y $ENTITY_SET_TYPE \
"echo '$FIREHOSE_VARIABLES'" && \
\
# Generate yaml from firehose output
. firehose-populated-commands.sh | awk '
BEGIN {
refseq_dir = "/humgen/gsa-hpprojects/GATK/data/Annotations/refseq/";
dbsnp_dir = "/humgen/gsa-hpprojects/GATK/data/";
# add hg18 specific files to awk associative arrays
genotypeDbsnps["Homo_sapiens_assembly18.fasta"] = dbsnp_dir "dbsnp_129_hg18.rod";
evalDbsnps["Homo_sapiens_assembly18.fasta"] = dbsnp_dir "dbsnp_129_hg18.rod";
refseqs["Homo_sapiens_assembly18.fasta"] = refseq_dir "refGene-big-table-hg18.txt";
# add hg19 specific files to awk associative arrays
genotypeDbsnps["Homo_sapiens_assembly19.fasta"] = dbsnp_dir "dbsnp_132_b37.vcf";
evalDbsnps["Homo_sapiens_assembly19.fasta"] = dbsnp_dir "dbsnp_129_b37.rod";
refseqs["Homo_sapiens_assembly19.fasta"] = refseq_dir "refGene-big-table-hg19.txt";
printf "{"
}
{
if (NR == 1) {
# Based on the reference of the first sample, specify the dbsnps and refseq tables.
reference_part_count = split($1, reference_parts, "/")
reference_name = reference_parts[reference_part_count];
genotypeDbsnp = genotypeDbsnps[reference_name];
evalDbsnp = evalDbsnps[reference_name];
refseq = refseqs[reference_name];
printf '"$PROJECT_YAML_TEMPLATE"'
printf "\n samples: ["
} else {
printf ","
}
printf '"$SAMPLE_YAML_TEMPLATE"'
}
END {
if (NR > 0)
printf "\n ]"
print "\n}"
}' > $PIPELINE_YAML_FILE
#hg19=`grep "assembly19" -c $PIPELINE_YAML_FILE`
# NOTE: DBSNP's are populated via AWK's BEGIN block above.
#if [ "$hg19" -ne 0 ]; then
# sed 's/\/humgen.*rod/\/humgen\/gsa-hpprojects\/GATK\/data\/dbsnp_132_b37.vcf/' $PIPELINE_YAML_FILE > yaml2
# mv yaml2 $PIPELINE_YAML_FILE
#fi
# NOTE: Renamed "recalibrated" to "cleaned" in SAMPLE_YAML_TEMPLATE above.
#sed 's/recalibrat/clean/' $PIPELINE_YAML_FILE > yaml2
#mv yaml2 $PIPELINE_YAML_FILE