Renamed the pipeline yaml dbsnpFile to genotypeDbsnp, and added an evalDbsnp.

Added a genotypeDbsnpType and evalDbsnpType to check the extensions for .vcf or .rod.
Moved renaming of "recalibrated" bams to "cleaned" from sed to yaml generation template (see diff for more info).
Renamed fullCallingPipeline.q to FullCallingPipeline.q.
Applied the changes above to the FullCallingPipelineTest, though the test remains disabled until VariantEval is updated.
Removed refseq table from the queue.sh wrapper script. Only specified in the yaml.


git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5213 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
kshakir 2011-02-07 22:01:09 +00:00
parent bceb2a9460
commit 8040998c15
5 changed files with 77 additions and 45 deletions

View File

@ -35,7 +35,8 @@ public class PipelineProject {
private String name;
private File referenceFile;
private File intervalList;
private File dbsnpFile;
private File genotypeDbsnp;
private File evalDbsnp;
private File refseqTable;
private Map<String, String> tags = new TreeMap<String, String>();
@ -63,12 +64,40 @@ public class PipelineProject {
this.referenceFile = referenceFile;
}
/** use genotype dbsnp file */
@Deprecated
public File getDbsnpFile() {
return dbsnpFile;
return getGenotypeDbsnp();
}
/** use genotype dbsnp file */
@Deprecated
public void setDbsnpFile(File dbsnpFile) {
this.dbsnpFile = dbsnpFile;
this.setGenotypeDbsnp(dbsnpFile);
}
public File getGenotypeDbsnp() {
return genotypeDbsnp;
}
public void setGenotypeDbsnp(File genotypeDbsnp) {
this.genotypeDbsnp = genotypeDbsnp;
}
public String getGenotypeDbsnpType() {
return getDbsnpType(genotypeDbsnp);
}
public File getEvalDbsnp() {
return evalDbsnp;
}
public void setEvalDbsnp(File evalDbsnp) {
this.evalDbsnp = evalDbsnp;
}
/**
 * Returns the ROD type ("vcf" or "dbsnp") of the eval dbsnp file,
 * or null when no eval dbsnp has been set.
 * Fixes a copy-paste bug: this previously derived the type from
 * genotypeDbsnp, so the eval type was wrong whenever the two files
 * had different extensions.
 */
public String getEvalDbsnpType() {
return getDbsnpType(evalDbsnp);
}
public File getRefseqTable() {
@ -86,4 +115,13 @@ public class PipelineProject {
public void setTags(Map<String, String> tags) {
this.tags = tags;
}
/**
 * Derives the ROD type string for a dbsnp file from its extension.
 * A name ending in ".vcf" (case-insensitive) maps to "vcf"; every
 * other name maps to "dbsnp". A null file yields null.
 */
private String getDbsnpType(File file) {
if (file == null) {
return null;
}
String lowerName = file.getName().toLowerCase();
return lowerName.endsWith(".vcf") ? "vcf" : "dbsnp";
}
}

View File

@ -10,7 +10,7 @@ import org.broadinstitute.sting.queue.QScript
import collection.JavaConversions._
import org.broadinstitute.sting.utils.yaml.YamlUtils
class fullCallingPipeline extends QScript {
class FullCallingPipeline extends QScript {
qscript =>
@Argument(doc="the YAML file specifying inputs, interval lists, reference sequence, etc.", shortName="Y")
@ -64,8 +64,6 @@ class fullCallingPipeline extends QScript {
private var pipeline: Pipeline = _
private var dbsnpType: String = _
trait CommandLineGATKArgs extends CommandLineGATK {
this.intervals = List(qscript.pipeline.getProject.getIntervalList)
this.jarFile = qscript.gatkJar
@ -79,12 +77,6 @@ class fullCallingPipeline extends QScript {
def script = {
pipeline = YamlUtils.load(classOf[Pipeline], qscript.yamlFile)
//var dbsnpType: String = _ //figure out how to get it so this is recognized and the whole thing has access to it.
if (qscript.pipeline.getProject.getDbsnpFile.toString.contains("rod")){
dbsnpType = "dbSNP"
} else {
dbsnpType = "VCF"
}
val projectBase: String = qscript.pipeline.getProject.getName
// TODO: Fix command lines that pass -refseqTable
@ -131,7 +123,7 @@ class fullCallingPipeline extends QScript {
realigner.intervals = Nil
realigner.intervalsString = Nil
realigner.scatterCount = num_cleaner_scatter_jobs
realigner.rodBind :+= RodBind("dbsnp", dbsnpType, qscript.pipeline.getProject.getDbsnpFile)
realigner.rodBind :+= RodBind("dbsnp", qscript.pipeline.getProject.getGenotypeDbsnpType, qscript.pipeline.getProject.getGenotypeDbsnp)
realigner.rodBind :+= RodBind("indels", "VCF", swapExt(realigner.reference_sequence.getParentFile, realigner.reference_sequence, "fasta", "1kg_pilot_indels.vcf"))
// if scatter count is > 1, do standard scatter gather, if not, explicitly set up fix mates
@ -214,7 +206,7 @@ class fullCallingPipeline extends QScript {
snps.group :+= "Standard"
snps.out = new File("SnpCalls", base+".vcf")
snps.downsample_to_coverage = Some(qscript.downsampling_coverage)
snps.rodBind :+= RodBind("dbsnp", dbsnpType, qscript.pipeline.getProject.getDbsnpFile)
snps.rodBind :+= RodBind("dbsnp", qscript.pipeline.getProject.getGenotypeDbsnpType, qscript.pipeline.getProject.getGenotypeDbsnp)
snps.memoryLimit = Some(6)
snps.scatterCount = qscript.num_snp_scatter_jobs
@ -242,7 +234,7 @@ class fullCallingPipeline extends QScript {
indels.group :+= "Standard"
indels.out = new File("IndelCalls", base+".vcf")
indels.downsample_to_coverage = Some(qscript.downsampling_coverage)
indels.rodBind :+= RodBind("dbsnp", dbsnpType, qscript.pipeline.getProject.getDbsnpFile)
indels.rodBind :+= RodBind("dbsnp", qscript.pipeline.getProject.getGenotypeDbsnpType, qscript.pipeline.getProject.getGenotypeDbsnp)
indels.memoryLimit = Some(6)
indels.genotype_likelihoods_model = Option(org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel.Model.DINDEL)
@ -308,12 +300,7 @@ class fullCallingPipeline extends QScript {
eval.reportLocation = new File("SnpCalls", base+".eval")
eval.reportType = Option(org.broadinstitute.sting.utils.report.VE2ReportFactory.VE2TemplateType.R)
eval.analysisName = base+"_VariantEval"
if(dbsnpType=="VCF"){
eval.DBSNP = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_129_b37.rod")
} else{
eval.rodBind :+= RodBind("dbsnp", dbsnpType, qscript.pipeline.getProject.getDbsnpFile)
}
eval.rodBind :+= RodBind("dbsnp", qscript.pipeline.getProject.getEvalDbsnpType, qscript.pipeline.getProject.getEvalDbsnp)
// 5. Make the bam list
val listOfBams = new File(base +".BamFiles.list")

View File

@ -23,11 +23,6 @@ class FullCallingPipelineTest {
new K1gBam("C474", "NA19652", 1),
new K1gBam("C474", "NA19654", 1))
// In fullCallingPipeline.q VariantEval is always compared against 129.
// Until the newvarianteval is finalized which will allow java import of the prior results,
// we re-run VariantEval to validate the run, and replicate that behavior here.
private final val variantEvalDbsnpFile = new File(BaseTest.b37dbSNP129)
val k1gChr20Dataset = {
val dataset = newK1gDataset("Barcoded_1000G_WEx_chr20")
dataset.pipeline.getProject.setIntervalList(new File(BaseTest.GATKDataLocation + "whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.chr20.interval_list"))
@ -64,7 +59,8 @@ class FullCallingPipelineTest {
val project = new PipelineProject
project.setName(projectName)
project.setReferenceFile(new File(BaseTest.hg19Reference))
project.setDbsnpFile(new File(BaseTest.b37dbSNP132))
project.setGenotypeDbsnp(new File(BaseTest.b37dbSNP132))
project.setEvalDbsnp(new File(BaseTest.b37dbSNP129))
project.setRefseqTable(new File(BaseTest.hg19Refseq))
var samples = List.empty[PipelineSample]
@ -94,12 +90,12 @@ class FullCallingPipelineTest {
@Test(dataProvider="datasets", enabled=false)
def testFullCallingPipeline(dataset: PipelineDataset) = {
val projectName = dataset.pipeline.getProject.getName
val testName = "fullCallingPipeline-" + projectName
val testName = "FullCallingPipeline-" + projectName
val yamlFile = writeYaml(testName, dataset.pipeline)
var cleanType = "cleaned"
// Run the pipeline with the expected inputs.
var pipelineCommand = ("-retry 1 -S scala/qscript/playground/fullCallingPipeline.q" +
var pipelineCommand = ("-retry 1 -S scala/qscript/playground/FullCallingPipeline.q" +
" -jobProject %s -Y %s" +
" -tearScript %s/R/DataProcessingReport/GetTearsheetStats.R" +
" --gatkjar %s")
@ -119,7 +115,7 @@ class FullCallingPipelineTest {
pipelineSpec.evalSpec.vcf = new File(PipelineTest.runDir(testName) + "SnpCalls/%s.%s.annotated.handfiltered.vcf".format(projectName, cleanType))
pipelineSpec.evalSpec.reference = dataset.pipeline.getProject.getReferenceFile
pipelineSpec.evalSpec.intervals = dataset.pipeline.getProject.getIntervalList
pipelineSpec.evalSpec.dbsnp = variantEvalDbsnpFile
pipelineSpec.evalSpec.dbsnp = dataset.pipeline.getProject.getEvalDbsnp
pipelineSpec.evalSpec.validations = dataset.validations
// Run the test, at least checking if the command compiles

View File

@ -32,19 +32,22 @@ FIREHOSE_ANNOTATIONS=(reference_file interval_list \
# YAML templates
# Project YAML template, once per file.
PROJECT_YAML_TEMPLATE='"\n\
project: {\n\
name: '"$ENTITY_SET_ID"',\n\
referenceFile: %s,\n\
dbsnpFile: %s,\n\
genotypeDbsnpFile: %s,\n\
evalDbsnpFile: %s,\n\
refseqTable: %s,\n\
intervalList: %s\n\
},", $1, dbsnp, refseq, $2'
},", $1, genotypeDbsnp, evalDbsnp, refseq, $2'
# Project YAML template, once per sample.
SAMPLE_YAML_TEMPLATE='"\n\
{\n\
id: %s,\n\
bamFiles: { recalibrated: %s },\n\
bamFiles: { cleaned: %s },\n\
tags: {\n\
SQUIDProject: %s,\n\
CollaboratorID: %s\n\
@ -86,20 +89,27 @@ BEGIN {
refseq_dir = "/humgen/gsa-hpprojects/GATK/data/Annotations/refseq/";
dbsnp_dir = "/humgen/gsa-hpprojects/GATK/data/";
dbsnps["Homo_sapiens_assembly18.fasta"] = dbsnp_dir "dbsnp_129_hg18.rod";
# add hg18 specific files to awk associative arrays
genotypeDbsnps["Homo_sapiens_assembly18.fasta"] = dbsnp_dir "dbsnp_129_hg18.rod";
evalDbsnps["Homo_sapiens_assembly18.fasta"] = dbsnp_dir "dbsnp_129_hg18.rod";
refseqs["Homo_sapiens_assembly18.fasta"] = refseq_dir "refGene-big-table-hg18.txt";
dbsnps["Homo_sapiens_assembly19.fasta"] = dbsnp_dir "dbsnp_132_b37.vcf";
# add hg19 specific files to awk associative arrays
genotypeDbsnps["Homo_sapiens_assembly19.fasta"] = dbsnp_dir "dbsnp_132_b37.vcf";
evalDbsnps["Homo_sapiens_assembly19.fasta"] = dbsnp_dir "dbsnp_129_b37.rod";
refseqs["Homo_sapiens_assembly19.fasta"] = refseq_dir "refGene-big-table-hg19.txt";
printf "{"
}
{
if (NR == 1) {
# Based on the reference of the first sample, specify the dbsnps and refseq tables.
reference_part_count = split($1, reference_parts, "/")
reference_name = reference_parts[reference_part_count];
dbsnp = dbsnps[reference_name];
genotypeDbsnp = genotypeDbsnps[reference_name];
evalDbsnp = evalDbsnps[reference_name];
refseq = refseqs[reference_name];
printf '"$PROJECT_YAML_TEMPLATE"'
@ -115,12 +125,14 @@ END {
print "\n}"
}' > $PIPELINE_YAML_FILE
hg19=`grep "assembly19" -c $PIPELINE_YAML_FILE`
#hg19=`grep "assembly19" -c $PIPELINE_YAML_FILE`
if [ "$hg19" -ne 0 ]; then
sed 's/\/humgen.*rod/\/humgen\/gsa-hpprojects\/GATK\/data\/dbsnp_132_b37.vcf/' $PIPELINE_YAML_FILE > yaml2
mv yaml2 $PIPELINE_YAML_FILE
fi
# NOTE: DBSNP's are populated via AWK's BEGIN block above.
#if [ "$hg19" -ne 0 ]; then
# sed 's/\/humgen.*rod/\/humgen\/gsa-hpprojects\/GATK\/data\/dbsnp_132_b37.vcf/' $PIPELINE_YAML_FILE > yaml2
# mv yaml2 $PIPELINE_YAML_FILE
#fi
sed 's/recalibrat/clean/' $PIPELINE_YAML_FILE > yaml2
mv yaml2 $PIPELINE_YAML_FILE
# NOTE: Renamed "recalibrated" to "cleaned" in SAMPLE_YAML_TEMPLATE above.
#sed 's/recalibrat/clean/' $PIPELINE_YAML_FILE > yaml2
#mv yaml2 $PIPELINE_YAML_FILE

View File

@ -2,7 +2,6 @@
SAMPLE_SET=$1
STING_HOME=/humgen/gsa-pipeline/.repository
REFSEQ_TABLE=/humgen/gsa-hpprojects/GATK/data/Annotations/refseq/refGene-big-table-hg19.txt
JOB_QUEUE=week
SHORT_QUEUE=hour
TMP_DIR=$PWD/tmp
@ -14,4 +13,4 @@ use R-2.10
use Oracle-full-client
use .cx-oracle-5.0.2-python-2.6.5-oracle-full-client-11.1
java -Djava.io.tmpdir="$TMP_DIR" -jar "$STING_HOME"/dist/Queue.jar -jobQueue "$JOB_QUEUE" -shortJobQueue "$SHORT_QUEUE" -jobProject "$SAMPLE_SET" -jobPrefix "$SAMPLE_SET" -tearScript "$STING_HOME"/R/DataProcessingReport/GetTearsheetStats.R -S "$STING_HOME"/scala/qscript/playground/fullCallingPipeline.q -Y "$SAMPLE_SET".yaml -refseqTable "$REFSEQ_TABLE" --gatkjar "$STING_HOME"/dist/GenomeAnalysisTK.jar -log queue_log.txt -statusTo corin -bsub $2
java -Djava.io.tmpdir="$TMP_DIR" -jar "$STING_HOME"/dist/Queue.jar -jobQueue "$JOB_QUEUE" -shortJobQueue "$SHORT_QUEUE" -jobProject "$SAMPLE_SET" -jobPrefix "$SAMPLE_SET" -tearScript "$STING_HOME"/R/DataProcessingReport/GetTearsheetStats.R -S "$STING_HOME"/scala/qscript/playground/FullCallingPipeline.q -Y "$SAMPLE_SET".yaml --gatkjar "$STING_HOME"/dist/GenomeAnalysisTK.jar -log queue_log.txt -statusTo corin -bsub $2