Updated firehose pulldown shell scripts:
- a LOT more error reporting to stderr and exit codes - split the firehose pulldown into a TSV generator and a TSV-to-YAML converter - the YAML converter is compatible with the TSVs generated by the front-end website and will grab only the appropriate columns - deprecated the getFirehosePipelineYaml.sh mode with a single Sample_Set name, which uses the Firehose test harness - new getFirehosePipelineYaml.sh mode using the web services API, which requires an additional parameter: a password config file containing "-u <user>:<pass>"; this has been tested on problematic Sample_Sets git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5313 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
cba88a8861
commit
24ef2be02d
|
|
@ -0,0 +1,61 @@
|
|||
#!/bin/bash

# Downloads a set of samples from Firehose using the Firehose web services
# API (via curl) and generates a TSV file with the requested annotations.
#
# Usage: getFirehoseCurlTsv.sh <Sample_Set_Name> <Curl_Password_File>
#   <Curl_Password_File> is a curl config file with the contents:
#   -u <user>:<pass>
#
# see: http://iwww.broadinstitute.org/cancer/cga/wiki/index.php/GetAnnotations
#
# NOTE: shebang changed to bash -- this script uses bash arrays and [[ ]],
# which are not guaranteed under /bin/sh.

ENTITY_SET_ID=$1
ENTITY_SET_TYPE=Sample_Set
ENTITY_TYPE=Sample
PASSWORD_FILE=$2

# Validate arguments, reporting every problem before exiting with usage.
if [ "$ENTITY_SET_ID" == "" ]; then
    EXIT_USAGE=1
fi

if [ "$PASSWORD_FILE" == "" ]; then
    echo 'Missing password file with the contents: "-u <user>:<pass>"' >&2
    EXIT_USAGE=1
fi

if [ $EXIT_USAGE ]; then
    echo "Usage: $0 <Sample_Set_Name> <Curl_Password_File>" >&2
    exit 1
fi

# Firehose server location.
FIREHOSE_HOST=firehose
FIREHOSE_PORT=8080
FIREHOSE_DOMAIN=gsa
FIREHOSE_WORKSPACE=trunk

# TSV file to write.
PIPELINE_TSV_FILE=$ENTITY_SET_ID.tsv

# Annotations to pull down from Firehose.
FIREHOSE_ANNOTATIONS=(reference_file interval_list recalibrated_bam_file squid_project collaborator_id)

# Build the '&annotationTypes=...' query-string suffix, one entry per
# requested annotation.
FIREHOSE_VARIABLES=""
for annotation in "${FIREHOSE_ANNOTATIONS[@]}"; do
    FIREHOSE_VARIABLES=$FIREHOSE_VARIABLES'&annotationTypes='$annotation
done

# --fail makes curl return non-zero on HTTP errors (bad credentials or an
# unknown Sample_Set) instead of silently saving the error page as the TSV.
# -K reads the "-u <user>:<pass>" credentials from the config file so they
# never appear on the command line.
curl --fail -sL -K "$PASSWORD_FILE" -o "$PIPELINE_TSV_FILE" \
    "http://$FIREHOSE_HOST:$FIREHOSE_PORT/$FIREHOSE_DOMAIN/ws/entity/getAnnotations/$ENTITY_TYPE?entityNames=$ENTITY_SET_ID&filterSetType=$ENTITY_SET_TYPE&workspaceName=$FIREHOSE_WORKSPACE$FIREHOSE_VARIABLES"

# Capture curl's status unconditionally.  The previous
# 'curl ... || EXIT_CODE=$?' form left EXIT_CODE unset on success and only
# worked because [[ ]] coerces an empty string to 0.
EXIT_CODE=$?

if [[ $EXIT_CODE -ne 0 ]]; then
    echo "curl failed with exit code:" $EXIT_CODE >&2
    echo 'Check the name of your Sample_Set and that your password file '$PASSWORD_FILE' is setup correctly with: "-u <user>:<pass>"' >&2
    echo "If that doesn't work make sure you can login to the firehose website: http://$FIREHOSE_HOST:$FIREHOSE_PORT/$FIREHOSE_DOMAIN" >&2
    exit $EXIT_CODE
fi
|
||||
|
|
@ -0,0 +1,10 @@
|
|||
#!/bin/sh

# Downloads a set of samples from Firehose and generates a YAML file.
#
# Usage: getFirehosePipelineYaml.sh <Sample_Set_Name> [<Curl_Password_File>]
#   Without a password file, the deprecated Firehose test harness is used.
#   With a password file (contents: -u <user>:<pass>), the newer web
#   services API is used instead.
#
# Fixes: quoted all path/argument expansions (they previously broke on
# paths containing spaces) and replaced the '==' bashism with POSIX '='
# to match the /bin/sh shebang.

DIR=$(dirname "$0")
if [ "$2" = "" ]; then
    # Deprecated path: pull the TSV via the Firehose test harness.
    "$DIR/getFirehoseTestTsv.sh" "$1" && "$DIR/pipelineTsvToYaml.sh" "$1.tsv"
else
    # Preferred path: pull the TSV via the web services API using curl.
    "$DIR/getFirehoseCurlTsv.sh" "$1" "$2" && "$DIR/pipelineTsvToYaml.sh" "$1.tsv"
fi
|
||||
|
|
@ -0,0 +1,68 @@
|
|||
#!/bin/bash

# Downloads a set of samples from Firehose using the obsolete Firehose Test
# Harness and generates a TSV file with the outputs.
#
# Usage: getFirehoseTestTsv.sh <Sample_Set_Name>
#
# NOTE: deprecated -- prefer getFirehoseCurlTsv.sh, which uses the web
# services API instead of the test harness.
# NOTE: shebang changed to bash -- this script uses bash arrays, [[ ]],
# and $'\t', which are not guaranteed under /bin/sh.

ENTITY_SET_ID=$1
ENTITY_SET_TYPE=Sample_Set
ENTITY_TYPE=Sample

if [ "$ENTITY_SET_ID" == "" ]; then
    echo "Usage: $0 <Sample_Set_Name>" >&2
    exit 1
fi

# Firehose variables
FIREHOSE_SOURCE_HOME=/humgen/gsa-firehose/firehose/source
CGA_HOME=$FIREHOSE_SOURCE_HOME/CancerGenomeAnalysis
FIREHOSE_TEST_HARNESS="python $CGA_HOME/analysis_pipeline/scripts/firehose_test_harness.py"
FIREHOSE_HOST=firehose
FIREHOSE_PORT=8080
FIREHOSE_DOMAIN=gsa
FIREHOSE_WORKSPACE=trunk

# TSV file to write
PIPELINE_TSV_FILE=$ENTITY_SET_ID.tsv

# Annotations to pull down from Firehose
FIREHOSE_ANNOTATIONS=(reference_file interval_list sample_id recalibrated_bam_file squid_project collaborator_id)

index=0
count=${#FIREHOSE_ANNOTATIONS[@]}
TSV_HEADER=""
FIREHOSE_VARIABLES=""
# An explicit tab: a literal tab character here is too easy to lose to
# editors/copy-paste that convert tabs to spaces.
TAB=$'\t'

# Build the tab-separated list of '${annotation}' placeholders for the test
# harness to substitute, plus the matching TSV header row.
while [ "$index" -lt "$count" ]; do
    if [ "$FIREHOSE_VARIABLES" != "" ]; then
        FIREHOSE_VARIABLES=$FIREHOSE_VARIABLES$TAB
        TSV_HEADER=$TSV_HEADER$TAB
    fi
    FIREHOSE_VARIABLES=$FIREHOSE_VARIABLES'${'${FIREHOSE_ANNOTATIONS[$index]}'}'
    TSV_HEADER=$TSV_HEADER${FIREHOSE_ANNOTATIONS[$index]}
    index=$((index + 1))
done

# Retrieve all the required variables via the test harness (which
# presumably writes firehose-populated-commands.sh with the placeholders
# filled in -- TODO confirm), write the TSV header, then append the
# populated values.  All three steps are chained with '&&' so EXIT_CODE
# below reflects the first failing step; previously comment lines spliced
# into the backslash continuations broke the chain and the append ran even
# after a harness failure.
$FIREHOSE_TEST_HARNESS \
    -d $FIREHOSE_DOMAIN -w $FIREHOSE_WORKSPACE \
    -t $ENTITY_TYPE -f $ENTITY_SET_ID -y $ENTITY_SET_TYPE \
    "echo '$FIREHOSE_VARIABLES'" && \
echo "$TSV_HEADER" > "$PIPELINE_TSV_FILE" && \
. firehose-populated-commands.sh >> "$PIPELINE_TSV_FILE"

EXIT_CODE=$?

if [[ $EXIT_CODE -ne 0 ]]; then
    echo "" >&2
    echo "The Firehose test harness failed with exit code:" $EXIT_CODE >&2
    echo 'Check the name of your Sample_Set or try using the newer getFirehoseCurlTsv.sh' >&2
    exit $EXIT_CODE
fi
|
||||
|
|
@ -0,0 +1,156 @@
|
|||
#!/bin/sh

# Uses awk to generate a YAML file from a TSV.

# In the awk script and templates:
# - Variables starting with a '$' are columns in the TSV
# - Variables without a '$' are pre-calculated from the first row of data
#
# NOTE(review): this script uses bash arrays, [[ ]], and ${var//...}
# substitution under a /bin/sh shebang -- it only works where sh is bash.

# TSV file to read
PIPELINE_TSV_FILE=$1

if [ "$PIPELINE_TSV_FILE" == "" ]; then
    echo "Usage: $0 <Set_Name>.tsv" >&2
    exit 1
fi

# 'wc -l file' prints "<count> <file>"; capturing into an array lets us
# read just the line count as ${ROW_COUNT[0]}.  A valid TSV needs at least
# a header row plus one data row.
ROW_COUNT=(`wc -l $PIPELINE_TSV_FILE`)
if [[ ${ROW_COUNT[0]} -lt 2 ]]; then
    echo "Header plus data not found in tsv: $PIPELINE_TSV_FILE" >&2
    exit 1
fi

# YAML file to write: the input path with .tsv swapped for .yaml.
PIPELINE_YAML_FILE=${PIPELINE_TSV_FILE%.tsv}.yaml

# YAML templates
#
# Each template is a printf FORMAT string plus its argument list, spliced
# verbatim into the single-quoted awk program below.  Inside awk,
# $referenceFile etc. are field references whose column indices are
# resolved from the TSV header row (NR == 1 branch).

# Project YAML template, once per file.
PROJECT_YAML_TEMPLATE='"\n\
project: {\n\
name: %s,\n\
referenceFile: %s,\n\
genotypeDbsnpFile: %s,\n\
evalDbsnpFile: %s,\n\
refseqTable: %s,\n\
intervalList: %s\n\
},", projectName, $referenceFile, genotypeDbsnp, evalDbsnp, refseq, $intervalList'

# Project YAML template, once per sample.
SAMPLE_YAML_TEMPLATE='"\n\
{\n\
id: %s,\n\
bamFiles: { cleaned: %s },\n\
tags: {\n\
SQUIDProject: %s,\n\
CollaboratorID: %s\n\
}\n\
}", $sampleId, $bamFile, $squidProject, $collaboratorId'

# Probe how this platform's echo/awk treat a literal '\n': if the result is
# only 2 bytes the backslash-n was collapsed to a real newline, so the
# templates' embedded '\n' escapes would double up.
TEST_AWK_COUNT=`echo '\n' | awk '{print $0}' | wc -c`
if [ "$TEST_AWK_COUNT" -eq 2 ]; then
    # Strip the extra \n from the lines if awk of \n is
    # a newline and not the two characters slash-n (on mac)
    PROJECT_YAML_TEMPLATE="${PROJECT_YAML_TEMPLATE//\\\n/}"
    SAMPLE_YAML_TEMPLATE="${SAMPLE_YAML_TEMPLATE//\\\n/}"
fi

# Generate yaml from tsv
awk '
{
    if (NR == 1) {
        # First row: the TSV header.  Learn the column layout here.
        tsvFile = "'$PIPELINE_TSV_FILE'"

        # Set the project name to the TSV file minus the directory and the .tsv
        projectName = tsvFile
        sub(/\/.*\//, "", projectName)
        sub(/\.tsv/, "", projectName)

        # Read the column headers and figure out the index of each column name.
        for (i=1; i<=NF; i++)
            columnFields[tolower($i)] = i

        referenceFile = columnFields["reference_file"]
        intervalList = columnFields["interval_list"]
        sampleId = columnFields["sample_id"]
        squidProject = columnFields["squid_project"]
        collaboratorId = columnFields["collaborator_id"]

        # The bam column name varies (e.g. recalibrated_bam_file), so match
        # any header containing "bam_file".
        for (key in columnFields)
            if (key ~ "bam_file")
                bamFile = columnFields[key]

        # Missing-column lookups return "" -- report every missing header
        # before giving up.
        if (referenceFile == "") {
            print "Column header reference_file missing from " tsvFile > "/dev/stderr"
            exitWithError = 1
        }

        if (intervalList == "") {
            print "Column header interval_list missing from " tsvFile > "/dev/stderr"
            exitWithError = 1
        }

        if (sampleId == "") {
            print "Column header sample_id missing from " tsvFile > "/dev/stderr"
            exitWithError = 1
        }

        if (squidProject == "") {
            print "Column header squid_project missing from " tsvFile > "/dev/stderr"
            exitWithError = 1
        }

        if (collaboratorId == "") {
            print "Column header collaborator_id missing from " tsvFile > "/dev/stderr"
            exitWithError = 1
        }

        if (bamFile == "") {
            print "Column header *bam_file* missing from " tsvFile > "/dev/stderr"
            exitWithError = 1
        }

        # NOTE(review): awk "exit 1" still runs the END block, so a partial
        # "]" / "}" is written to the YAML even on error, and the shell does
        # not check awk''s exit status -- callers should verify the output.
        if (exitWithError) {
            exit 1
        }


        refseqDir = "/humgen/gsa-hpprojects/GATK/data/Annotations/refseq/"
        dbsnpDir = "/humgen/gsa-hpprojects/GATK/data/"

        # add hg18 specific files to awk associative arrays
        genotypeDbsnps["Homo_sapiens_assembly18.fasta"] = dbsnpDir "dbsnp_129_hg18.rod"
        evalDbsnps["Homo_sapiens_assembly18.fasta"] = dbsnpDir "dbsnp_129_hg18.rod"
        refseqs["Homo_sapiens_assembly18.fasta"] = refseqDir "refGene-big-table-hg18.txt"

        # add hg19 specific files to awk associative arrays
        genotypeDbsnps["Homo_sapiens_assembly19.fasta"] = dbsnpDir "dbsnp_132_b37.vcf"
        evalDbsnps["Homo_sapiens_assembly19.fasta"] = dbsnpDir "dbsnp_129_b37.rod"
        refseqs["Homo_sapiens_assembly19.fasta"] = refseqDir "refGene-big-table-hg19.txt"

        printf "{"
    } else {
        if (NR == 2) {
            # Based on the reference of the first sample, specify the dbsnps and refseq tables.

            referencePartCount = split($referenceFile, referenceParts, "/")
            referenceName = referenceParts[referencePartCount]

            genotypeDbsnp = genotypeDbsnps[referenceName]
            evalDbsnp = evalDbsnps[referenceName]
            refseq = refseqs[referenceName]

            printf '"$PROJECT_YAML_TEMPLATE"'
            printf "\n samples: ["
        } else {
            printf ","
        }
        printf '"$SAMPLE_YAML_TEMPLATE"'
    }
}
END {
    if (NR > 0)
        printf "\n ]"
    print "\n}"
}' "$PIPELINE_TSV_FILE" > "$PIPELINE_YAML_FILE"
|
||||
|
|
@ -1,138 +0,0 @@
|
|||
#!/bin/sh

# Downloads a set of samples from Firehose using the Firehose Test Harness and awk to generate a YAML file.
#
# NOTE(review): legacy combined script; this commit deletes it in favor of
# the split TSV generators (getFirehose*Tsv.sh) + pipelineTsvToYaml.sh.
# Unlike the new converter, it uses fixed column positions ($1..$6) rather
# than resolving columns from the TSV header.

ENTITY_SET_ID=$1
ENTITY_SET_TYPE=Sample_Set
ENTITY_TYPE=Sample

if [ "$ENTITY_SET_ID" == "" ]; then
    echo "Usage: $0 <Sample_Set_Name>" >&2
    exit 1
fi

# Firehose variables

FIREHOSE_SOURCE_HOME=/humgen/gsa-firehose/firehose/source
CGA_HOME=$FIREHOSE_SOURCE_HOME/CancerGenomeAnalysis
FIREHOSE_TEST_HARNESS="python $CGA_HOME/analysis_pipeline/scripts/firehose_test_harness.py"
FIREHOSE_HOST=firehose
FIREHOSE_PORT=8080
FIREHOSE_DOMAIN=gsa
FIREHOSE_WORKSPACE=trunk

# YAML file to write

PIPELINE_YAML_FILE=$ENTITY_SET_ID.yaml

# Annotations to pull down from Firehose
# The positional order here fixes the meaning of $1..$6 in the templates.

FIREHOSE_ANNOTATIONS=(reference_file interval_list \
sample_id recalibrated_bam_file squid_project collaborator_id)

# YAML templates
# Each template is a printf FORMAT string plus its argument list, spliced
# verbatim into the single-quoted awk program below.

# Project YAML template, once per file.
PROJECT_YAML_TEMPLATE='"\n\
project: {\n\
name: '"$ENTITY_SET_ID"',\n\
referenceFile: %s,\n\
genotypeDbsnpFile: %s,\n\
evalDbsnpFile: %s,\n\
refseqTable: %s,\n\
intervalList: %s\n\
},", $1, genotypeDbsnp, evalDbsnp, refseq, $2'

# Project YAML template, once per sample.
SAMPLE_YAML_TEMPLATE='"\n\
{\n\
id: %s,\n\
bamFiles: { cleaned: %s },\n\
tags: {\n\
SQUIDProject: %s,\n\
CollaboratorID: %s\n\
}\n\
}", $3, $4, $5, $6'

# Probe how this platform treats a literal '\n' (2 bytes means it was
# collapsed to a real newline).
TEST_AWK_COUNT=`echo '\n' | awk '{print $0}' | wc -c`
if [ "$TEST_AWK_COUNT" -eq 2 ]; then
    # Strip the extra \n from the lines if awk of \n is
    # a newline and not the two characters slash-n (on mac)
    PROJECT_YAML_TEMPLATE="${PROJECT_YAML_TEMPLATE//\\\n/}"
    SAMPLE_YAML_TEMPLATE="${SAMPLE_YAML_TEMPLATE//\\\n/}"
fi

index=0
count=${#FIREHOSE_ANNOTATIONS[@]}
FIREHOSE_VARIABLES=""
TAB='	'

# Build the tab separated list of firehose arguments

while [ "$index" -lt "$count" ]; do
    if [ "$FIREHOSE_VARIABLES" != "" ]; then
        FIREHOSE_VARIABLES=$FIREHOSE_VARIABLES$TAB
    fi
    FIREHOSE_VARIABLES=$FIREHOSE_VARIABLES'${'${FIREHOSE_ANNOTATIONS[$index]}'}'
    let "index = $index + 1"
done

# Retrieve all the required variables and run the pipeline in Queue.
$FIREHOSE_TEST_HARNESS \
    -d $FIREHOSE_DOMAIN -w $FIREHOSE_WORKSPACE \
    -t $ENTITY_TYPE -f $ENTITY_SET_ID -y $ENTITY_SET_TYPE \
    "echo '$FIREHOSE_VARIABLES'" && \
\
# Generate yaml from firehose output
. firehose-populated-commands.sh | awk '
BEGIN {
    refseq_dir = "/humgen/gsa-hpprojects/GATK/data/Annotations/refseq/";
    dbsnp_dir = "/humgen/gsa-hpprojects/GATK/data/";

    # add hg18 specific files to awk associative arrays
    genotypeDbsnps["Homo_sapiens_assembly18.fasta"] = dbsnp_dir "dbsnp_129_hg18.rod";
    evalDbsnps["Homo_sapiens_assembly18.fasta"] = dbsnp_dir "dbsnp_129_hg18.rod";
    refseqs["Homo_sapiens_assembly18.fasta"] = refseq_dir "refGene-big-table-hg18.txt";

    # add hg19 specific files to awk associative arrays
    genotypeDbsnps["Homo_sapiens_assembly19.fasta"] = dbsnp_dir "dbsnp_132_b37.vcf";
    evalDbsnps["Homo_sapiens_assembly19.fasta"] = dbsnp_dir "dbsnp_129_b37.rod";
    refseqs["Homo_sapiens_assembly19.fasta"] = refseq_dir "refGene-big-table-hg19.txt";

    printf "{"
}
{
    if (NR == 1) {
        # Based on the reference of the first sample, specify the dbsnps and refseq tables.

        reference_part_count = split($1, reference_parts, "/")
        reference_name = reference_parts[reference_part_count];

        genotypeDbsnp = genotypeDbsnps[reference_name];
        evalDbsnp = evalDbsnps[reference_name];
        refseq = refseqs[reference_name];

        printf '"$PROJECT_YAML_TEMPLATE"'
        printf "\n samples: ["
    } else {
        printf ","
    }
    printf '"$SAMPLE_YAML_TEMPLATE"'
}
END {
    if (NR > 0)
        printf "\n ]"
    print "\n}"
}' > $PIPELINE_YAML_FILE

#hg19=`grep "assembly19" -c $PIPELINE_YAML_FILE`

# NOTE: DBSNP's are populated via AWK's BEGIN block above.
#if [ "$hg19" -ne 0 ]; then
# sed 's/\/humgen.*rod/\/humgen\/gsa-hpprojects\/GATK\/data\/dbsnp_132_b37.vcf/' $PIPELINE_YAML_FILE > yaml2
# mv yaml2 $PIPELINE_YAML_FILE
#fi

# NOTE: Renamed "recalibrated" to "cleaned" in SAMPLE_YAML_TEMPLATE above.
#sed 's/recalibrat/clean/' $PIPELINE_YAML_FILE > yaml2
#mv yaml2 $PIPELINE_YAML_FILE
|
||||
Loading…
Reference in New Issue