diff --git a/public/R/queueJobReport.R b/public/R/queueJobReport.R index 18d33054e..a24d269c9 100644 --- a/public/R/queueJobReport.R +++ b/public/R/queueJobReport.R @@ -12,7 +12,9 @@ if ( onCMDLine ) { inputFileName = args[1] outputPDF = args[2] } else { - inputFileName = "~/Desktop/broadLocal/GATK/unstable/report.txt" + #inputFileName = "~/Desktop/broadLocal/GATK/unstable/report.txt" + inputFileName = "/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/Q-25718@node1149.jobreport.txt" + #inputFileName = "/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/rodPerformanceGoals/history/report.082711.txt" outputPDF = NA } @@ -113,11 +115,22 @@ plotGroup <- function(groupTable) { textplot(as.data.frame(sum), show.rownames=F) title(paste("Job summary for", name, "itemizing each iteration"), cex=3) + # histogram of job times by groupAnnotations + if ( length(groupAnnotations) == 1 && dim(sub)[1] > 1 ) { + # todo -- how do we group by annotations? + p <- ggplot(data=sub, aes(x=runtime)) + geom_histogram() + p <- p + xlab("runtime in seconds") + ylab("No. of jobs") + p <- p + opts(title=paste("Job runtime histogram for", name)) + print(p) + } + # as above, but averaging over all iterations groupAnnotationsNoIteration = setdiff(groupAnnotations, "iteration") - sum = cast(melt(sub, id.vars=groupAnnotationsNoIteration, measure.vars=c("runtime")), ... ~ ., fun.aggregate=c(mean, sd)) - textplot(as.data.frame(sum), show.rownames=F) - title(paste("Job summary for", name, "averaging over all iterations"), cex=3) + if ( dim(sub)[1] > 1 ) { + sum = cast(melt(sub, id.vars=groupAnnotationsNoIteration, measure.vars=c("runtime")), ... ~ ., fun.aggregate=c(mean, sd)) + textplot(as.data.frame(sum), show.rownames=F) + title(paste("Job summary for", name, "averaging over all iterations"), cex=3) + } } # print out some useful basic information @@ -146,7 +159,7 @@ plotJobsGantt(gatkReportData, T) plotJobsGantt(gatkReportData, F) plotProgressByTime(gatkReportData) for ( group in gatkReportData ) { - plotGroup(group) + plotGroup(group) } if ( ! is.na(outputPDF) ) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java index ae419a5c4..7b8045581 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java @@ -52,6 +52,11 @@ public class UnifiedArgumentCollection { @Argument(fullName = "heterozygosity", shortName = "hets", doc = "Heterozygosity value used to compute prior likelihoods for any locus", required = false) public Double heterozygosity = DiploidSNPGenotypePriors.HUMAN_HETEROZYGOSITY; + /** + * The PCR error rate is independent of the sequencing error rate, which is necessary because we cannot necessarily + * distinguish between PCR errors vs. sequencing errors. The practical implication for this value is that it + * effectively acts as a cap on the base qualities. + */ @Argument(fullName = "pcr_error_rate", shortName = "pcr_error", doc = "The PCR error rate to be used for computing fragment-based likelihoods", required = false) public Double PCR_error = DiploidSNPGenotypeLikelihoods.DEFAULT_PCR_ERROR_RATE; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java index ceafb0cf5..dba80d44d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java @@ -214,7 +214,7 @@ public class SelectVariants extends RodWalker { @Argument(fullName="sample_expressions", shortName="se", doc="Regular expression to select many samples from the ROD tracks provided. Can be specified multiple times", required=false) public Set sampleExpressions ; - @Argument(fullName="sample_file", shortName="sf", doc="File containing a list of samples (one per line) to include. Can be specified multiple times", required=false) + @Input(fullName="sample_file", shortName="sf", doc="File containing a list of samples (one per line) to include. Can be specified multiple times", required=false) public Set sampleFiles; /** @@ -226,7 +226,7 @@ public class SelectVariants extends RodWalker { /** * Note that sample exclusion takes precedence over inclusion, so that if a sample is in both lists it will be excluded. */ - @Argument(fullName="exclude_sample_file", shortName="xl_sf", doc="File containing a list of samples (one per line) to exclude. Can be specified multiple times", required=false) + @Input(fullName="exclude_sample_file", shortName="xl_sf", doc="File containing a list of samples (one per line) to exclude. Can be specified multiple times", required=false) public Set XLsampleFiles = new HashSet(0); /** diff --git a/public/java/src/org/broadinstitute/sting/utils/clipreads/ClippingOp.java b/public/java/src/org/broadinstitute/sting/utils/clipreads/ClippingOp.java index faa3cf34e..bc200372f 100644 --- a/public/java/src/org/broadinstitute/sting/utils/clipreads/ClippingOp.java +++ b/public/java/src/org/broadinstitute/sting/utils/clipreads/ClippingOp.java @@ -252,7 +252,8 @@ public class ClippingOp { if (start == 0 && stop == read.getReadLength() -1) return new SAMRecord(read.getHeader()); - CigarShift cigarShift = hardClipCigar(read.getCigar(), start, stop); + // If the read is unmapped there is no Cigar string and neither should we create a new cigar string + CigarShift cigarShift = (read.getReadUnmappedFlag()) ? new CigarShift(new Cigar(), 0, 0) : hardClipCigar(read.getCigar(), start, stop); // the cigar may force a shift left or right (or both) in case we are left with insertions // starting or ending the read after applying the hard clip on start/stop. diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java b/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java index 66e1afecb..12899e898 100755 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java @@ -82,7 +82,7 @@ public class PileupElement { // -------------------------------------------------------------------------- private Integer getReducedReadQualityTagValue() { - return (Integer)getRead().getAttribute(ReadUtils.REDUCED_READ_QUALITY_TAG); + return getRead().getIntegerAttribute(ReadUtils.REDUCED_READ_QUALITY_TAG); } public boolean isReducedRead() { diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java index 517f9f75d..c55a462f1 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java @@ -41,6 +41,10 @@ public class GATKSAMRecord extends SAMRecord { // because some values can be null, we don't want to duplicate effort private boolean retrievedReadGroup = false; + /** A private cache for the reduced read quality. Null indicates the value hasn't be fetched yet or isn't available */ + private boolean lookedUpReducedReadQuality = false; + private Integer reducedReadQuality; + // These temporary attributes were added here to make life easier for // certain algorithms by providing a way to label or attach arbitrary data to // individual GATKSAMRecords. @@ -338,7 +342,17 @@ public class GATKSAMRecord extends SAMRecord { public Object getAttribute(final String tag) { return mRecord.getAttribute(tag); } - public Integer getIntegerAttribute(final String tag) { return mRecord.getIntegerAttribute(tag); } + public Integer getIntegerAttribute(final String tag) { + if ( tag == ReadUtils.REDUCED_READ_QUALITY_TAG ) { + if ( ! lookedUpReducedReadQuality ) { + lookedUpReducedReadQuality = true; + reducedReadQuality = mRecord.getIntegerAttribute(tag); + } + return reducedReadQuality; + } else { + return mRecord.getIntegerAttribute(tag); + } + } public Short getShortAttribute(final String tag) { return mRecord.getShortAttribute(tag); } diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala index 2417e5620..2a135496d 100755 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala @@ -11,7 +11,7 @@ import net.sf.samtools.SAMFileReader import net.sf.samtools.SAMFileHeader.SortOrder import org.broadinstitute.sting.queue.util.QScriptUtils -import org.broadinstitute.sting.queue.function.{CommandLineFunction, ListWriterFunction} +import org.broadinstitute.sting.queue.function.ListWriterFunction class DataProcessingPipeline extends QScript { qscript => @@ -31,7 +31,7 @@ class DataProcessingPipeline extends QScript { var reference: File = _ @Input(doc="dbsnp ROD to use (must be in VCF format)", fullName="dbsnp", shortName="D", required=true) - var dbSNP: File = _ + var dbSNP: List[File] = List() /**************************************************************************** * Optional Parameters @@ -43,7 +43,7 @@ class DataProcessingPipeline extends QScript { // @Input(doc="extra VCF files to use as reference indels for Indel Realignment", fullName="extra_indels", shortName="indels", required=false) - var indels: File = _ + var indels: List[File] = List() @Input(doc="The path to the binary of bwa (usually BAM files have already been mapped - but if you want to remap this is the option)", fullName="path_to_bwa", shortName="bwa", required=false) var bwaPath: File = _ @@ -159,7 +159,7 @@ class DataProcessingPipeline extends QScript { for (rg <- readGroups) { val intermediateInBam: File = if (index == readGroups.length) { inBam } else { swapExt(outBam, ".bam", index+1 + "-rg.bam") } val intermediateOutBam: File = if (index > 1) {swapExt(outBam, ".bam", index + "-rg.bam") } else { outBam} - val readGroup = new ReadGroup(rg.getReadGroupId, rg.getPlatform, rg.getLibrary, rg.getPlatformUnit, rg.getSample, rg.getSequencingCenter, rg.getDescription) + val readGroup = new ReadGroup(rg.getReadGroupId, rg.getLibrary, rg.getPlatform, rg.getPlatformUnit, rg.getSample, rg.getSequencingCenter, rg.getDescription) add(addReadGroup(intermediateInBam, intermediateOutBam, readGroup)) index = index - 1 } @@ -321,9 +321,9 @@ class DataProcessingPipeline extends QScript { this.input_file = inBams this.out = outIntervals this.mismatchFraction = 0.0 - this.known :+= qscript.dbSNP + this.known ++= qscript.dbSNP if (indels != null) - this.known :+= qscript.indels + this.known ++= qscript.indels this.scatterCount = nContigs this.analysisName = queueLogDir + outIntervals + ".target" this.jobName = queueLogDir + outIntervals + ".target" @@ -333,9 +333,9 @@ class DataProcessingPipeline extends QScript { this.input_file = inBams this.targetIntervals = tIntervals this.out = outBam - this.known :+= qscript.dbSNP + this.known ++= qscript.dbSNP if (qscript.indels != null) - this.known :+= qscript.indels + this.known ++= qscript.indels this.consensusDeterminationModel = cleanModelEnum this.compress = 0 this.scatterCount = nContigs @@ -344,7 +344,7 @@ class DataProcessingPipeline extends QScript { } case class cov (inBam: File, outRecalFile: File) extends CountCovariates with CommandLineGATKArgs { - this.knownSites :+= qscript.dbSNP + this.knownSites ++= qscript.dbSNP this.covariate ++= List("ReadGroupCovariate", "QualityScoreCovariate", "CycleCovariate", "DinucCovariate") this.input_file :+= inBam this.recal_file = outRecalFile diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/MethodsDevelopmentCallingPipeline.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/MethodsDevelopmentCallingPipeline.scala index 3c9a3fbcb..80bfe03d1 100755 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/MethodsDevelopmentCallingPipeline.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/MethodsDevelopmentCallingPipeline.scala @@ -1,6 +1,5 @@ package org.broadinstitute.sting.queue.qscripts -import org.broadinstitute.sting.commandline.Hidden import org.broadinstitute.sting.queue.extensions.gatk._ import org.broadinstitute.sting.queue.QScript import org.broadinstitute.sting.gatk.phonehome.GATKRunReport @@ -70,7 +69,8 @@ class MethodsDevelopmentCallingPipeline extends QScript { val goldStandardClusterFile = new File(goldStandardName + ".clusters") } - val hg19 = new File("/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta") + val b37_decoy = new File("/humgen/1kg/reference/human_g1k_v37_decoy.fasta") + val hg19 = new File("/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta") val hg18 = new File("/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta") val b36 = new File("/humgen/1kg/reference/human_b36_both.fasta") val b37 = new File("/humgen/1kg/reference/human_g1k_v37.fasta") @@ -124,6 +124,14 @@ class MethodsDevelopmentCallingPipeline extends QScript { new File("/humgen/gsa-hpprojects/NA12878Collection/bams/CEUTrio.HiSeq.WGS.bwa.cleaned.recal.bam"), new File("/humgen/gsa-hpprojects/dev/carneiro/trio/analysis/snps/CEUTrio.WEx.filtered.vcf"), // ** THIS GOLD STANDARD NEEDS TO BE CORRECTED ** "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.hg19.intervals", 2.3, 99.0, !lowPass, !exome, 3), + "WExTrioDecoy" -> new Target("CEUTrio.HiSeq.WEx.b37_decoy", b37_decoy, dbSNP_b37, hapmap_b37, indelMask_b37, + new File("/humgen/gsa-hpprojects/NA12878Collection/bams/CEUTrio.HiSeq.WEx.b37_decoy.list"), + new File("/humgen/gsa-hpprojects/dev/carneiro/trio/analysis/snps/CEUTrio.WEx.filtered.vcf"), // ** THIS GOLD STANDARD NEEDS TO BE CORRECTED ** + "/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list", 3.3, 98.0, !lowPass, exome, 3), + "WGSTrioDecoy" -> new Target("CEUTrio.HiSeq.WGS.b37_decoy", b37_decoy, dbSNP_b37, hapmap_b37, indelMask_b37, + new File("/humgen/gsa-hpprojects/NA12878Collection/bams/CEUTrio.HiSeq.WGS.b37_decoy.list"), + new File("/humgen/gsa-hpprojects/dev/carneiro/trio/analysis/snps/CEUTrio.WEx.filtered.vcf"), // ** THIS GOLD STANDARD NEEDS TO BE CORRECTED ** + "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.hg19.intervals", 2.3, 99.0, !lowPass, !exome, 3), "FIN" -> new Target("FIN", b37, dbSNP_b37, hapmap_b37, indelMask_b37, new File("/humgen/1kg/processing/pipeline_test_bams/FIN.79sample.Nov2010.chr20.bam"), new File("/humgen/gsa-hpprojects/dev/data/AugChr20Calls_v4_3state/ALL.august.v4.chr20.filtered.vcf"), // ** THIS GOLD STANDARD NEEDS TO BE CORRECTED **