diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java index 2014801e4..5f6865d04 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java @@ -276,13 +276,11 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable { if ( elt.isReducedRead() ) { // reduced read representation byte qual = elt.getReducedQual(); - for ( int i = 0; i < elt.getReducedCount(); i++ ) { - add(obsBase, qual, (byte)0, (byte)0); - } - return elt.getQual(); + add(obsBase, qual, (byte)0, (byte)0, elt.getReducedCount()); // fast calculation of n identical likelihoods + return elt.getReducedCount(); // we added nObs bases here } else { byte qual = qualToUse(elt, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual); - return qual > 0 ? add(obsBase, qual, (byte)0, (byte)0) : 0; + return qual > 0 ? add(obsBase, qual, (byte)0, (byte)0, 1) : 0; } } @@ -309,9 +307,11 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable { * @param qual1 * @param obsBase2 * @param qual2 can be 0, indicating no second base was observed for this fragment + * @param nObs The number of times this quad of values was seen. Generally 1, but reduced reads + * can have nObs > 1 for synthetic reads * @return */ - private int add(byte obsBase1, byte qual1, byte obsBase2, byte qual2) { + private int add(byte obsBase1, byte qual1, byte obsBase2, byte qual2, int nObs) { // TODO-- Right now we assume that there are at most 2 reads per fragment. This assumption is fine // TODO-- given the current state of next-gen sequencing, but may need to be fixed in the future. // TODO-- However, when that happens, we'll need to be a lot smarter about the caching we do here. @@ -332,19 +332,17 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable { for ( DiploidGenotype g : DiploidGenotype.values() ) { double likelihood = likelihoods[g.ordinal()]; - - //if ( VERBOSE ) { - // System.out.printf(" L(%c | G=%s, Q=%d, S=%s) = %f / %f%n", - // observedBase, g, qualityScore, pow(10,likelihood) * 100, likelihood); - //} - - log10Likelihoods[g.ordinal()] += likelihood; - log10Posteriors[g.ordinal()] += likelihood; + log10Likelihoods[g.ordinal()] += likelihood * nObs; + log10Posteriors[g.ordinal()] += likelihood * nObs; } return 1; } + private int add(byte obsBase1, byte qual1, byte obsBase2, byte qual2) { + return add(obsBase1, qual1, obsBase2, qual2, 1); + } + // ------------------------------------------------------------------------------------- // // Dealing with the cache routines diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/RecalibrateBaseQualities.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/RecalibrateBaseQualities.scala index cbe53db8d..9f3dd9a2c 100755 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/RecalibrateBaseQualities.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/RecalibrateBaseQualities.scala @@ -3,6 +3,10 @@ package org.broadinstitute.sting.queue.qscripts import org.broadinstitute.sting.queue.QScript import org.broadinstitute.sting.queue.extensions.gatk._ import org.broadinstitute.sting.queue.util.QScriptUtils +import net.sf.samtools.SAMFileHeader.SortOrder +import org.broadinstitute.sting.queue.extensions.picard.{SortSam, AddOrReplaceReadGroups} +import org.broadinstitute.sting.utils.exceptions.UserException +import org.broadinstitute.sting.commandline.Hidden /** * Created by IntelliJ IDEA. @@ -14,52 +18,132 @@ import org.broadinstitute.sting.queue.util.QScriptUtils class RecalibrateBaseQualities extends QScript { - @Input(doc="path to GenomeAnalysisTK.jar", shortName="gatk", required=true) - var GATKjar: File = _ - - @Input(doc="input BAM file - or list of BAM files", shortName="i", required=true) + @Input(doc="input FASTA/FASTQ/BAM file - or list of FASTA/FASTQ/BAM files. ", shortName="i", required=true) var input: File = _ @Input(doc="path to R resources folder inside the Sting repository", fullName="path_to_r", shortName="r", required=true) var R: String = _ @Input(doc="Reference fasta file", shortName="R", required=true) - var reference: File = _ // new File("/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta") + var reference: File = _ + + @Input(doc="dbsnp VCF file to use ", shortName="D", required=true) + var dbSNP: File = _ + + @Input(doc="Number of jobs to scatter/gather. Default: 0." , shortName = "sg", required=false) + var threads: Int = 0 + + @Input(doc="Sample Name to fill in the Read Group information (only necessary if using fasta/fastq)" , shortName = "sn", required=false) + var sample: String = "NA" + + @Input(doc="The path to the binary of bwa to align fasta/fastq files", fullName="path_to_bwa", shortName="bwa", required=false) + var bwaPath: File = _ + + @Hidden + @Input(doc="The default base qualities to use before recalibration. Default is Q20 (should be good for every dataset)." , shortName = "dbq", required=false) + var dbq: Int = 20 + + + - @Input(doc="dbsnp ROD to use (VCF)", shortName="D", required=true) - var dbSNP: File = _ // new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_132_b37.leftAligned.vcf") val queueLogDir: String = ".qlog/" - var nContigs: Int = 0 def script = { - val bamList = QScriptUtils.createListFromFile(input) - nContigs = QScriptUtils.getNumberOfContigs(bamList(0)) + val fileList: List[File] = QScriptUtils.createListFromFile(input) - for (bam <- bamList) { + for (file: File <- fileList) { - val recalFile1: File = swapExt(bam, ".bam", ".recal1.csv") - val recalFile2: File = swapExt(bam, ".bam", ".recal2.csv") - val recalBam: File = swapExt(bam, ".bam", ".recal.bam") + var USE_BWA: Boolean = false + + println("DEBUG: processing " + file + "\nDEBUG: name -- " + file.getName) + + if (file.endsWith(".fasta") || file.endsWith(".fq")) { + if (bwaPath == null) { + throw new UserException("You provided a fasta/fastq file but didn't provide the path for BWA"); + } + USE_BWA = true + } + + // FASTA -> BAM steps + val alignedSam: File = file.getName + ".aligned.sam" + val sortedBam: File = swapExt(alignedSam, ".sam", ".bam") + val qualBam: File = swapExt(sortedBam, ".bam", ".q.bam") + val rgBam: File = swapExt(file, ".bam", ".rg.bam") + + val bamBase = if (USE_BWA) {rgBam} else {file} + + // BAM Steps + val recalFile1: File = swapExt(bamBase, ".bam", ".recal1.csv") + val recalFile2: File = swapExt(bamBase, ".bam", ".recal2.csv") + val recalBam: File = swapExt(bamBase, ".bam", ".recal.bam") val path1: String = recalBam + ".before" val path2: String = recalBam + ".after" - add(cov(bam, recalFile1), - recal(bam, recalFile1, recalBam), + + if (USE_BWA) { + add(align(file, alignedSam), + sortSam(alignedSam, sortedBam), + addQuals(sortedBam, qualBam, dbq), + addReadGroup(qualBam, rgBam, sample)) + } + + add(cov(bamBase, recalFile1), + recal(bamBase, recalFile1, recalBam), cov(recalBam, recalFile2), analyzeCovariates(recalFile1, path1), analyzeCovariates(recalFile2, path2)) } } - trait CommandLineGATKArgs extends CommandLineGATK { - this.jarFile = GATKjar - this.reference_sequence = reference + + // General arguments to non-GATK tools + trait ExternalCommonArgs extends CommandLineFunction { this.memoryLimit = 4 this.isIntermediate = true } + trait CommandLineGATKArgs extends CommandLineGATK with ExternalCommonArgs { + this.reference_sequence = reference + } + + + case class align(@Input inFastq: File, @Output outSam: File) extends ExternalCommonArgs { + def commandLine = bwaPath + " bwasw -b5 -q2 -r1 -z10 -t8 " + reference + " " + inFastq + " > " + outSam + this.analysisName = queueLogDir + outSam + ".bwa_sam_se" + this.jobName = queueLogDir + outSam + ".bwa_sam_se" + } + + case class sortSam (@Input inSam: File, @Output outBam: File) extends SortSam with ExternalCommonArgs { + @Output(doc="output bai file") var bai = swapExt(outBam, ".bam", ".bai") + this.input = List(inSam) + this.output = outBam + this.sortOrder = SortOrder.coordinate + this.analysisName = queueLogDir + outBam + ".sortSam" + this.jobName = queueLogDir + outBam + ".sortSam" + } + + case class addQuals(inBam: File, outBam: File, qual: Int) extends PrintReads with CommandLineGATKArgs { + this.input_file :+= inBam + this.out = outBam + this.DBQ = qual + } + + case class addReadGroup (inBam: File, outBam: File, sample: String) extends AddOrReplaceReadGroups with ExternalCommonArgs { + @Output(doc="output bai file") var bai = swapExt(outBam, ".bam", ".bai") + this.input = List(inBam) + this.output = outBam + this.RGID = "1" + this.RGCN = "BI" + this.RGPL = "PacBio_RS" + this.RGSM = sample + this.RGLB = "default_library" + this.RGPU = "default_pu" + this.analysisName = queueLogDir + outBam + ".rg" + this.jobName = queueLogDir + outBam + ".rg" + } + case class cov (inBam: File, outRecalFile: File) extends CountCovariates with CommandLineGATKArgs { this.rodBind :+= RodBind("dbsnp", "VCF", dbSNP) this.covariate ++= List("ReadGroupCovariate", "QualityScoreCovariate", "CycleCovariate", "DinucCovariate") @@ -67,7 +151,7 @@ class RecalibrateBaseQualities extends QScript { this.recal_file = outRecalFile this.analysisName = queueLogDir + outRecalFile + ".covariates" this.jobName = queueLogDir + outRecalFile + ".covariates" - this.scatterCount = nContigs + this.scatterCount = threads } case class recal (inBam: File, inRecalFile: File, outBam: File) extends TableRecalibration with CommandLineGATKArgs { @@ -77,7 +161,7 @@ class RecalibrateBaseQualities extends QScript { this.isIntermediate = false this.analysisName = queueLogDir + outBam + ".recalibration" this.jobName = queueLogDir + outBam + ".recalibration" - this.scatterCount = nContigs + this.scatterCount = threads } case class analyzeCovariates (inRecalFile: File, outPath: String) extends AnalyzeCovariates { diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/QScriptUtils.scala b/public/scala/src/org/broadinstitute/sting/queue/util/QScriptUtils.scala index 99aaa9474..12bd880d8 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/util/QScriptUtils.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/util/QScriptUtils.scala @@ -22,15 +22,15 @@ object QScriptUtils { * to have empty lines and comment lines (lines starting with #). */ def createListFromFile(in: File):List[File] = { - // If the file provided ends with .bam, it is not a bam list, we treat it as a single file. + // If the file provided ends with .bam, .fasta or .fq, it is not a bam list, we treat it as a single file. // and return a list with only this file. - if (in.toString.endsWith(".bam")) + if (in.toString.endsWith(".bam") || in.toString.endsWith(".fasta") || in.toString.endsWith(".fq")) return List(in) var list: List[File] = List() - for (bam <- fromFile(in).getLines) - if (!bam.startsWith("#") && !bam.isEmpty ) - list :+= new File(bam.trim()) + for (file <- fromFile(in).getLines) + if (!file.startsWith("#") && !file.isEmpty ) + list :+= new File(file.trim()) list.sortWith(_.compareTo(_) < 0) }