Renamed the pipeline yaml dbsnpFile to genotypeDbsnp, and added an evalDbsnp.

Added a genotypeDbsnpType and evalDbsnpType to check the extensions for .vcf or .rod.
Moved renaming of "recalibrated" bams to "cleaned" from sed to yaml generation template (see diff for more info).
Renamed fCP.q to FCP.q.
Though it's still disabled until VariantEval is updated, added changes above to the FCPTest.
Removed refseq table from the queue.sh wrapper script. Only specified in the yaml.


git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5213 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
kshakir 2011-02-07 22:01:09 +00:00
parent bceb2a9460
commit 8040998c15
5 changed files with 77 additions and 45 deletions

View File

@ -35,7 +35,8 @@ public class PipelineProject {
private String name; private String name;
private File referenceFile; private File referenceFile;
private File intervalList; private File intervalList;
private File dbsnpFile; private File genotypeDbsnp;
private File evalDbsnp;
private File refseqTable; private File refseqTable;
private Map<String, String> tags = new TreeMap<String, String>(); private Map<String, String> tags = new TreeMap<String, String>();
@ -63,12 +64,40 @@ public class PipelineProject {
this.referenceFile = referenceFile; this.referenceFile = referenceFile;
} }
/** use genotype dbsnp file */
@Deprecated
public File getDbsnpFile() { public File getDbsnpFile() {
return dbsnpFile; return getGenotypeDbsnp();
} }
/** use genotype dbsnp file */
@Deprecated
public void setDbsnpFile(File dbsnpFile) { public void setDbsnpFile(File dbsnpFile) {
this.dbsnpFile = dbsnpFile; this.setGenotypeDbsnp(dbsnpFile);
}
public File getGenotypeDbsnp() {
return genotypeDbsnp;
}
public void setGenotypeDbsnp(File genotypeDbsnp) {
this.genotypeDbsnp = genotypeDbsnp;
}
public String getGenotypeDbsnpType() {
return getDbsnpType(genotypeDbsnp);
}
public File getEvalDbsnp() {
return evalDbsnp;
}
public void setEvalDbsnp(File evalDbsnp) {
this.evalDbsnp = evalDbsnp;
}
public String getEvalDbsnpType() {
return getDbsnpType(genotypeDbsnp);
} }
public File getRefseqTable() { public File getRefseqTable() {
@ -86,4 +115,13 @@ public class PipelineProject {
public void setTags(Map<String, String> tags) { public void setTags(Map<String, String> tags) {
this.tags = tags; this.tags = tags;
} }
private String getDbsnpType(File file) {
if (file == null)
return null;
else if (file.getName().toLowerCase().endsWith(".vcf"))
return "vcf";
else
return "dbsnp";
}
} }

View File

@ -10,7 +10,7 @@ import org.broadinstitute.sting.queue.QScript
import collection.JavaConversions._ import collection.JavaConversions._
import org.broadinstitute.sting.utils.yaml.YamlUtils import org.broadinstitute.sting.utils.yaml.YamlUtils
class fullCallingPipeline extends QScript { class FullCallingPipeline extends QScript {
qscript => qscript =>
@Argument(doc="the YAML file specifying inputs, interval lists, reference sequence, etc.", shortName="Y") @Argument(doc="the YAML file specifying inputs, interval lists, reference sequence, etc.", shortName="Y")
@ -64,8 +64,6 @@ class fullCallingPipeline extends QScript {
private var pipeline: Pipeline = _ private var pipeline: Pipeline = _
private var dbsnpType: String = _
trait CommandLineGATKArgs extends CommandLineGATK { trait CommandLineGATKArgs extends CommandLineGATK {
this.intervals = List(qscript.pipeline.getProject.getIntervalList) this.intervals = List(qscript.pipeline.getProject.getIntervalList)
this.jarFile = qscript.gatkJar this.jarFile = qscript.gatkJar
@ -79,12 +77,6 @@ class fullCallingPipeline extends QScript {
def script = { def script = {
pipeline = YamlUtils.load(classOf[Pipeline], qscript.yamlFile) pipeline = YamlUtils.load(classOf[Pipeline], qscript.yamlFile)
//var dbsnpType: String = _ //figure out how to get it so this is recognized and the whole thing has access to it.
if (qscript.pipeline.getProject.getDbsnpFile.toString.contains("rod")){
dbsnpType = "dbSNP"
} else {
dbsnpType = "VCF"
}
val projectBase: String = qscript.pipeline.getProject.getName val projectBase: String = qscript.pipeline.getProject.getName
// TODO: Fix command lines that pass -refseqTable // TODO: Fix command lines that pass -refseqTable
@ -131,7 +123,7 @@ class fullCallingPipeline extends QScript {
realigner.intervals = Nil realigner.intervals = Nil
realigner.intervalsString = Nil realigner.intervalsString = Nil
realigner.scatterCount = num_cleaner_scatter_jobs realigner.scatterCount = num_cleaner_scatter_jobs
realigner.rodBind :+= RodBind("dbsnp", dbsnpType, qscript.pipeline.getProject.getDbsnpFile) realigner.rodBind :+= RodBind("dbsnp", qscript.pipeline.getProject.getGenotypeDbsnpType, qscript.pipeline.getProject.getGenotypeDbsnp)
realigner.rodBind :+= RodBind("indels", "VCF", swapExt(realigner.reference_sequence.getParentFile, realigner.reference_sequence, "fasta", "1kg_pilot_indels.vcf")) realigner.rodBind :+= RodBind("indels", "VCF", swapExt(realigner.reference_sequence.getParentFile, realigner.reference_sequence, "fasta", "1kg_pilot_indels.vcf"))
// if scatter count is > 1, do standard scatter gather, if not, explicitly set up fix mates // if scatter count is > 1, do standard scatter gather, if not, explicitly set up fix mates
@ -214,7 +206,7 @@ class fullCallingPipeline extends QScript {
snps.group :+= "Standard" snps.group :+= "Standard"
snps.out = new File("SnpCalls", base+".vcf") snps.out = new File("SnpCalls", base+".vcf")
snps.downsample_to_coverage = Some(qscript.downsampling_coverage) snps.downsample_to_coverage = Some(qscript.downsampling_coverage)
snps.rodBind :+= RodBind("dbsnp", dbsnpType, qscript.pipeline.getProject.getDbsnpFile) snps.rodBind :+= RodBind("dbsnp", qscript.pipeline.getProject.getGenotypeDbsnpType, qscript.pipeline.getProject.getGenotypeDbsnp)
snps.memoryLimit = Some(6) snps.memoryLimit = Some(6)
snps.scatterCount = qscript.num_snp_scatter_jobs snps.scatterCount = qscript.num_snp_scatter_jobs
@ -242,7 +234,7 @@ class fullCallingPipeline extends QScript {
indels.group :+= "Standard" indels.group :+= "Standard"
indels.out = new File("IndelCalls", base+".vcf") indels.out = new File("IndelCalls", base+".vcf")
indels.downsample_to_coverage = Some(qscript.downsampling_coverage) indels.downsample_to_coverage = Some(qscript.downsampling_coverage)
indels.rodBind :+= RodBind("dbsnp", dbsnpType, qscript.pipeline.getProject.getDbsnpFile) indels.rodBind :+= RodBind("dbsnp", qscript.pipeline.getProject.getGenotypeDbsnpType, qscript.pipeline.getProject.getGenotypeDbsnp)
indels.memoryLimit = Some(6) indels.memoryLimit = Some(6)
indels.genotype_likelihoods_model = Option(org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel.Model.DINDEL) indels.genotype_likelihoods_model = Option(org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel.Model.DINDEL)
@ -308,12 +300,7 @@ class fullCallingPipeline extends QScript {
eval.reportLocation = new File("SnpCalls", base+".eval") eval.reportLocation = new File("SnpCalls", base+".eval")
eval.reportType = Option(org.broadinstitute.sting.utils.report.VE2ReportFactory.VE2TemplateType.R) eval.reportType = Option(org.broadinstitute.sting.utils.report.VE2ReportFactory.VE2TemplateType.R)
eval.analysisName = base+"_VariantEval" eval.analysisName = base+"_VariantEval"
if(dbsnpType=="VCF"){ eval.rodBind :+= RodBind("dbsnp", qscript.pipeline.getProject.getEvalDbsnpType, qscript.pipeline.getProject.getEvalDbsnp)
eval.DBSNP = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_129_b37.rod")
} else{
eval.rodBind :+= RodBind("dbsnp", dbsnpType, qscript.pipeline.getProject.getDbsnpFile)
}
// 5. Make the bam list // 5. Make the bam list
val listOfBams = new File(base +".BamFiles.list") val listOfBams = new File(base +".BamFiles.list")

View File

@ -23,11 +23,6 @@ class FullCallingPipelineTest {
new K1gBam("C474", "NA19652", 1), new K1gBam("C474", "NA19652", 1),
new K1gBam("C474", "NA19654", 1)) new K1gBam("C474", "NA19654", 1))
// In fullCallingPipeline.q VariantEval is always compared against 129.
// Until the newvarianteval is finalized which will allow java import of the prior results,
// we re-run VariantEval to validate the run, and replicate that behavior here.
private final val variantEvalDbsnpFile = new File(BaseTest.b37dbSNP129)
val k1gChr20Dataset = { val k1gChr20Dataset = {
val dataset = newK1gDataset("Barcoded_1000G_WEx_chr20") val dataset = newK1gDataset("Barcoded_1000G_WEx_chr20")
dataset.pipeline.getProject.setIntervalList(new File(BaseTest.GATKDataLocation + "whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.chr20.interval_list")) dataset.pipeline.getProject.setIntervalList(new File(BaseTest.GATKDataLocation + "whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.chr20.interval_list"))
@ -64,7 +59,8 @@ class FullCallingPipelineTest {
val project = new PipelineProject val project = new PipelineProject
project.setName(projectName) project.setName(projectName)
project.setReferenceFile(new File(BaseTest.hg19Reference)) project.setReferenceFile(new File(BaseTest.hg19Reference))
project.setDbsnpFile(new File(BaseTest.b37dbSNP132)) project.setGenotypeDbsnp(new File(BaseTest.b37dbSNP132))
project.setEvalDbsnp(new File(BaseTest.b37dbSNP129))
project.setRefseqTable(new File(BaseTest.hg19Refseq)) project.setRefseqTable(new File(BaseTest.hg19Refseq))
var samples = List.empty[PipelineSample] var samples = List.empty[PipelineSample]
@ -94,12 +90,12 @@ class FullCallingPipelineTest {
@Test(dataProvider="datasets", enabled=false) @Test(dataProvider="datasets", enabled=false)
def testFullCallingPipeline(dataset: PipelineDataset) = { def testFullCallingPipeline(dataset: PipelineDataset) = {
val projectName = dataset.pipeline.getProject.getName val projectName = dataset.pipeline.getProject.getName
val testName = "fullCallingPipeline-" + projectName val testName = "FullCallingPipeline-" + projectName
val yamlFile = writeYaml(testName, dataset.pipeline) val yamlFile = writeYaml(testName, dataset.pipeline)
var cleanType = "cleaned" var cleanType = "cleaned"
// Run the pipeline with the expected inputs. // Run the pipeline with the expected inputs.
var pipelineCommand = ("-retry 1 -S scala/qscript/playground/fullCallingPipeline.q" + var pipelineCommand = ("-retry 1 -S scala/qscript/playground/FullCallingPipeline.q" +
" -jobProject %s -Y %s" + " -jobProject %s -Y %s" +
" -tearScript %s/R/DataProcessingReport/GetTearsheetStats.R" + " -tearScript %s/R/DataProcessingReport/GetTearsheetStats.R" +
" --gatkjar %s") " --gatkjar %s")
@ -119,7 +115,7 @@ class FullCallingPipelineTest {
pipelineSpec.evalSpec.vcf = new File(PipelineTest.runDir(testName) + "SnpCalls/%s.%s.annotated.handfiltered.vcf".format(projectName, cleanType)) pipelineSpec.evalSpec.vcf = new File(PipelineTest.runDir(testName) + "SnpCalls/%s.%s.annotated.handfiltered.vcf".format(projectName, cleanType))
pipelineSpec.evalSpec.reference = dataset.pipeline.getProject.getReferenceFile pipelineSpec.evalSpec.reference = dataset.pipeline.getProject.getReferenceFile
pipelineSpec.evalSpec.intervals = dataset.pipeline.getProject.getIntervalList pipelineSpec.evalSpec.intervals = dataset.pipeline.getProject.getIntervalList
pipelineSpec.evalSpec.dbsnp = variantEvalDbsnpFile pipelineSpec.evalSpec.dbsnp = dataset.pipeline.getProject.getEvalDbsnp
pipelineSpec.evalSpec.validations = dataset.validations pipelineSpec.evalSpec.validations = dataset.validations
// Run the test, at least checking if the command compiles // Run the test, at least checking if the command compiles

View File

@ -32,19 +32,22 @@ FIREHOSE_ANNOTATIONS=(reference_file interval_list \
# YAML templates # YAML templates
# Project YAML template, once per file.
PROJECT_YAML_TEMPLATE='"\n\ PROJECT_YAML_TEMPLATE='"\n\
project: {\n\ project: {\n\
name: '"$ENTITY_SET_ID"',\n\ name: '"$ENTITY_SET_ID"',\n\
referenceFile: %s,\n\ referenceFile: %s,\n\
dbsnpFile: %s,\n\ genotypeDbsnpFile: %s,\n\
evalDbsnpFile: %s,\n\
refseqTable: %s,\n\ refseqTable: %s,\n\
intervalList: %s\n\ intervalList: %s\n\
},", $1, dbsnp, refseq, $2' },", $1, genotypeDbsnp, evalDbsnp, refseq, $2'
# Project YAML template, once per sample.
SAMPLE_YAML_TEMPLATE='"\n\ SAMPLE_YAML_TEMPLATE='"\n\
{\n\ {\n\
id: %s,\n\ id: %s,\n\
bamFiles: { recalibrated: %s },\n\ bamFiles: { cleaned: %s },\n\
tags: {\n\ tags: {\n\
SQUIDProject: %s,\n\ SQUIDProject: %s,\n\
CollaboratorID: %s\n\ CollaboratorID: %s\n\
@ -86,20 +89,27 @@ BEGIN {
refseq_dir = "/humgen/gsa-hpprojects/GATK/data/Annotations/refseq/"; refseq_dir = "/humgen/gsa-hpprojects/GATK/data/Annotations/refseq/";
dbsnp_dir = "/humgen/gsa-hpprojects/GATK/data/"; dbsnp_dir = "/humgen/gsa-hpprojects/GATK/data/";
dbsnps["Homo_sapiens_assembly18.fasta"] = dbsnp_dir "dbsnp_129_hg18.rod"; # add hg18 specific files to awk associative arrays
genotypeDbsnps["Homo_sapiens_assembly18.fasta"] = dbsnp_dir "dbsnp_129_hg18.rod";
evalDbsnps["Homo_sapiens_assembly18.fasta"] = dbsnp_dir "dbsnp_129_hg18.rod";
refseqs["Homo_sapiens_assembly18.fasta"] = refseq_dir "refGene-big-table-hg18.txt"; refseqs["Homo_sapiens_assembly18.fasta"] = refseq_dir "refGene-big-table-hg18.txt";
dbsnps["Homo_sapiens_assembly19.fasta"] = dbsnp_dir "dbsnp_132_b37.vcf"; # add hg19 specific files to awk associative arrays
genotypeDbsnps["Homo_sapiens_assembly19.fasta"] = dbsnp_dir "dbsnp_132_b37.vcf";
evalDbsnps["Homo_sapiens_assembly19.fasta"] = dbsnp_dir "dbsnp_129_b37.rod";
refseqs["Homo_sapiens_assembly19.fasta"] = refseq_dir "refGene-big-table-hg19.txt"; refseqs["Homo_sapiens_assembly19.fasta"] = refseq_dir "refGene-big-table-hg19.txt";
printf "{" printf "{"
} }
{ {
if (NR == 1) { if (NR == 1) {
# Based on the reference of the first sample, specify the dbsnps and refseq tables.
reference_part_count = split($1, reference_parts, "/") reference_part_count = split($1, reference_parts, "/")
reference_name = reference_parts[reference_part_count]; reference_name = reference_parts[reference_part_count];
dbsnp = dbsnps[reference_name]; genotypeDbsnp = genotypeDbsnps[reference_name];
evalDbsnp = evalDbsnps[reference_name];
refseq = refseqs[reference_name]; refseq = refseqs[reference_name];
printf '"$PROJECT_YAML_TEMPLATE"' printf '"$PROJECT_YAML_TEMPLATE"'
@ -115,12 +125,14 @@ END {
print "\n}" print "\n}"
}' > $PIPELINE_YAML_FILE }' > $PIPELINE_YAML_FILE
hg19=`grep "assembly19" -c $PIPELINE_YAML_FILE` #hg19=`grep "assembly19" -c $PIPELINE_YAML_FILE`
if [ "$hg19" -ne 0 ]; then # NOTE: DBSNP's are populated via AWK's BEGIN block above.
sed 's/\/humgen.*rod/\/humgen\/gsa-hpprojects\/GATK\/data\/dbsnp_132_b37.vcf/' $PIPELINE_YAML_FILE > yaml2 #if [ "$hg19" -ne 0 ]; then
mv yaml2 $PIPELINE_YAML_FILE # sed 's/\/humgen.*rod/\/humgen\/gsa-hpprojects\/GATK\/data\/dbsnp_132_b37.vcf/' $PIPELINE_YAML_FILE > yaml2
fi # mv yaml2 $PIPELINE_YAML_FILE
#fi
sed 's/recalibrat/clean/' $PIPELINE_YAML_FILE > yaml2 # NOTE: Renamed "recalibrated" to "cleaned" in SAMPLE_YAML_TEMPLATE above.
mv yaml2 $PIPELINE_YAML_FILE #sed 's/recalibrat/clean/' $PIPELINE_YAML_FILE > yaml2
#mv yaml2 $PIPELINE_YAML_FILE

View File

@ -2,7 +2,6 @@
SAMPLE_SET=$1 SAMPLE_SET=$1
STING_HOME=/humgen/gsa-pipeline/.repository STING_HOME=/humgen/gsa-pipeline/.repository
REFSEQ_TABLE=/humgen/gsa-hpprojects/GATK/data/Annotations/refseq/refGene-big-table-hg19.txt
JOB_QUEUE=week JOB_QUEUE=week
SHORT_QUEUE=hour SHORT_QUEUE=hour
TMP_DIR=$PWD/tmp TMP_DIR=$PWD/tmp
@ -14,4 +13,4 @@ use R-2.10
use Oracle-full-client use Oracle-full-client
use .cx-oracle-5.0.2-python-2.6.5-oracle-full-client-11.1 use .cx-oracle-5.0.2-python-2.6.5-oracle-full-client-11.1
java -Djava.io.tmpdir="$TMP_DIR" -jar "$STING_HOME"/dist/Queue.jar -jobQueue "$JOB_QUEUE" -shortJobQueue "$SHORT_QUEUE" -jobProject "$SAMPLE_SET" -jobPrefix "$SAMPLE_SET" -tearScript "$STING_HOME"/R/DataProcessingReport/GetTearsheetStats.R -S "$STING_HOME"/scala/qscript/playground/fullCallingPipeline.q -Y "$SAMPLE_SET".yaml -refseqTable "$REFSEQ_TABLE" --gatkjar "$STING_HOME"/dist/GenomeAnalysisTK.jar -log queue_log.txt -statusTo corin -bsub $2 java -Djava.io.tmpdir="$TMP_DIR" -jar "$STING_HOME"/dist/Queue.jar -jobQueue "$JOB_QUEUE" -shortJobQueue "$SHORT_QUEUE" -jobProject "$SAMPLE_SET" -jobPrefix "$SAMPLE_SET" -tearScript "$STING_HOME"/R/DataProcessingReport/GetTearsheetStats.R -S "$STING_HOME"/scala/qscript/playground/FullCallingPipeline.q -Y "$SAMPLE_SET".yaml --gatkjar "$STING_HOME"/dist/GenomeAnalysisTK.jar -log queue_log.txt -statusTo corin -bsub $2