Merge branch 'master' of ssh://nickel.broadinstitute.org/humgen/gsa-scr1/gsa-engineering/git/unstable

This commit is contained in:
Eric Banks 2011-08-09 23:19:51 -04:00
commit b8f572b571
3 changed files with 122 additions and 40 deletions

View File

@ -276,13 +276,11 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable {
if ( elt.isReducedRead() ) {
// reduced read representation
byte qual = elt.getReducedQual();
for ( int i = 0; i < elt.getReducedCount(); i++ ) {
add(obsBase, qual, (byte)0, (byte)0);
}
return elt.getQual();
add(obsBase, qual, (byte)0, (byte)0, elt.getReducedCount()); // fast calculation of n identical likelihoods
return elt.getReducedCount(); // we added nObs bases here
} else {
byte qual = qualToUse(elt, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual);
return qual > 0 ? add(obsBase, qual, (byte)0, (byte)0) : 0;
return qual > 0 ? add(obsBase, qual, (byte)0, (byte)0, 1) : 0;
}
}
@ -309,9 +307,11 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable {
* @param qual1
* @param obsBase2
* @param qual2 can be 0, indicating no second base was observed for this fragment
* @param nObs The number of times this quad of values was seen. Generally 1, but reduced reads
* can have nObs > 1 for synthetic reads
* @return
*/
private int add(byte obsBase1, byte qual1, byte obsBase2, byte qual2) {
private int add(byte obsBase1, byte qual1, byte obsBase2, byte qual2, int nObs) {
// TODO-- Right now we assume that there are at most 2 reads per fragment. This assumption is fine
// TODO-- given the current state of next-gen sequencing, but may need to be fixed in the future.
// TODO-- However, when that happens, we'll need to be a lot smarter about the caching we do here.
@ -332,19 +332,17 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable {
for ( DiploidGenotype g : DiploidGenotype.values() ) {
double likelihood = likelihoods[g.ordinal()];
//if ( VERBOSE ) {
// System.out.printf(" L(%c | G=%s, Q=%d, S=%s) = %f / %f%n",
// observedBase, g, qualityScore, pow(10,likelihood) * 100, likelihood);
//}
log10Likelihoods[g.ordinal()] += likelihood;
log10Posteriors[g.ordinal()] += likelihood;
log10Likelihoods[g.ordinal()] += likelihood * nObs;
log10Posteriors[g.ordinal()] += likelihood * nObs;
}
return 1;
}
private int add(byte obsBase1, byte qual1, byte obsBase2, byte qual2) {
return add(obsBase1, qual1, obsBase2, qual2, 1);
}
// -------------------------------------------------------------------------------------
//
// Dealing with the cache routines

View File

@ -3,6 +3,10 @@ package org.broadinstitute.sting.queue.qscripts
import org.broadinstitute.sting.queue.QScript
import org.broadinstitute.sting.queue.extensions.gatk._
import org.broadinstitute.sting.queue.util.QScriptUtils
import net.sf.samtools.SAMFileHeader.SortOrder
import org.broadinstitute.sting.queue.extensions.picard.{SortSam, AddOrReplaceReadGroups}
import org.broadinstitute.sting.utils.exceptions.UserException
import org.broadinstitute.sting.commandline.Hidden
/**
* Created by IntelliJ IDEA.
@ -14,52 +18,132 @@ import org.broadinstitute.sting.queue.util.QScriptUtils
class RecalibrateBaseQualities extends QScript {
@Input(doc="path to GenomeAnalysisTK.jar", shortName="gatk", required=true)
var GATKjar: File = _
@Input(doc="input BAM file - or list of BAM files", shortName="i", required=true)
@Input(doc="input FASTA/FASTQ/BAM file - or list of FASTA/FASTQ/BAM files. ", shortName="i", required=true)
var input: File = _
@Input(doc="path to R resources folder inside the Sting repository", fullName="path_to_r", shortName="r", required=true)
var R: String = _
@Input(doc="Reference fasta file", shortName="R", required=true)
var reference: File = _ // new File("/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta")
var reference: File = _
@Input(doc="dbsnp VCF file to use ", shortName="D", required=true)
var dbSNP: File = _
@Input(doc="Number of jobs to scatter/gather. Default: 0." , shortName = "sg", required=false)
var threads: Int = 0
@Input(doc="Sample Name to fill in the Read Group information (only necessary if using fasta/fastq)" , shortName = "sn", required=false)
var sample: String = "NA"
@Input(doc="The path to the binary of bwa to align fasta/fastq files", fullName="path_to_bwa", shortName="bwa", required=false)
var bwaPath: File = _
@Hidden
@Input(doc="The default base qualities to use before recalibration. Default is Q20 (should be good for every dataset)." , shortName = "dbq", required=false)
var dbq: Int = 20
@Input(doc="dbsnp ROD to use (VCF)", shortName="D", required=true)
var dbSNP: File = _ // new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_132_b37.leftAligned.vcf")
val queueLogDir: String = ".qlog/"
var nContigs: Int = 0
def script = {
val bamList = QScriptUtils.createListFromFile(input)
nContigs = QScriptUtils.getNumberOfContigs(bamList(0))
val fileList: List[File] = QScriptUtils.createListFromFile(input)
for (bam <- bamList) {
for (file: File <- fileList) {
val recalFile1: File = swapExt(bam, ".bam", ".recal1.csv")
val recalFile2: File = swapExt(bam, ".bam", ".recal2.csv")
val recalBam: File = swapExt(bam, ".bam", ".recal.bam")
var USE_BWA: Boolean = false
println("DEBUG: processing " + file + "\nDEBUG: name -- " + file.getName)
if (file.endsWith(".fasta") || file.endsWith(".fq")) {
if (bwaPath == null) {
throw new UserException("You provided a fasta/fastq file but didn't provide the path for BWA");
}
USE_BWA = true
}
// FASTA -> BAM steps
val alignedSam: File = file.getName + ".aligned.sam"
val sortedBam: File = swapExt(alignedSam, ".sam", ".bam")
val qualBam: File = swapExt(sortedBam, ".bam", ".q.bam")
val rgBam: File = swapExt(file, ".bam", ".rg.bam")
val bamBase = if (USE_BWA) {rgBam} else {file}
// BAM Steps
val recalFile1: File = swapExt(bamBase, ".bam", ".recal1.csv")
val recalFile2: File = swapExt(bamBase, ".bam", ".recal2.csv")
val recalBam: File = swapExt(bamBase, ".bam", ".recal.bam")
val path1: String = recalBam + ".before"
val path2: String = recalBam + ".after"
add(cov(bam, recalFile1),
recal(bam, recalFile1, recalBam),
if (USE_BWA) {
add(align(file, alignedSam),
sortSam(alignedSam, sortedBam),
addQuals(sortedBam, qualBam, dbq),
addReadGroup(qualBam, rgBam, sample))
}
add(cov(bamBase, recalFile1),
recal(bamBase, recalFile1, recalBam),
cov(recalBam, recalFile2),
analyzeCovariates(recalFile1, path1),
analyzeCovariates(recalFile2, path2))
}
}
trait CommandLineGATKArgs extends CommandLineGATK {
this.jarFile = GATKjar
this.reference_sequence = reference
// General arguments to non-GATK tools
trait ExternalCommonArgs extends CommandLineFunction {
this.memoryLimit = 4
this.isIntermediate = true
}
trait CommandLineGATKArgs extends CommandLineGATK with ExternalCommonArgs {
this.reference_sequence = reference
}
case class align(@Input inFastq: File, @Output outSam: File) extends ExternalCommonArgs {
def commandLine = bwaPath + " bwasw -b5 -q2 -r1 -z10 -t8 " + reference + " " + inFastq + " > " + outSam
this.analysisName = queueLogDir + outSam + ".bwa_sam_se"
this.jobName = queueLogDir + outSam + ".bwa_sam_se"
}
case class sortSam (@Input inSam: File, @Output outBam: File) extends SortSam with ExternalCommonArgs {
@Output(doc="output bai file") var bai = swapExt(outBam, ".bam", ".bai")
this.input = List(inSam)
this.output = outBam
this.sortOrder = SortOrder.coordinate
this.analysisName = queueLogDir + outBam + ".sortSam"
this.jobName = queueLogDir + outBam + ".sortSam"
}
case class addQuals(inBam: File, outBam: File, qual: Int) extends PrintReads with CommandLineGATKArgs {
this.input_file :+= inBam
this.out = outBam
this.DBQ = qual
}
case class addReadGroup (inBam: File, outBam: File, sample: String) extends AddOrReplaceReadGroups with ExternalCommonArgs {
@Output(doc="output bai file") var bai = swapExt(outBam, ".bam", ".bai")
this.input = List(inBam)
this.output = outBam
this.RGID = "1"
this.RGCN = "BI"
this.RGPL = "PacBio_RS"
this.RGSM = sample
this.RGLB = "default_library"
this.RGPU = "default_pu"
this.analysisName = queueLogDir + outBam + ".rg"
this.jobName = queueLogDir + outBam + ".rg"
}
case class cov (inBam: File, outRecalFile: File) extends CountCovariates with CommandLineGATKArgs {
this.rodBind :+= RodBind("dbsnp", "VCF", dbSNP)
this.covariate ++= List("ReadGroupCovariate", "QualityScoreCovariate", "CycleCovariate", "DinucCovariate")
@ -67,7 +151,7 @@ class RecalibrateBaseQualities extends QScript {
this.recal_file = outRecalFile
this.analysisName = queueLogDir + outRecalFile + ".covariates"
this.jobName = queueLogDir + outRecalFile + ".covariates"
this.scatterCount = nContigs
this.scatterCount = threads
}
case class recal (inBam: File, inRecalFile: File, outBam: File) extends TableRecalibration with CommandLineGATKArgs {
@ -77,7 +161,7 @@ class RecalibrateBaseQualities extends QScript {
this.isIntermediate = false
this.analysisName = queueLogDir + outBam + ".recalibration"
this.jobName = queueLogDir + outBam + ".recalibration"
this.scatterCount = nContigs
this.scatterCount = threads
}
case class analyzeCovariates (inRecalFile: File, outPath: String) extends AnalyzeCovariates {

View File

@ -22,15 +22,15 @@ object QScriptUtils {
* to have empty lines and comment lines (lines starting with #).
*/
def createListFromFile(in: File):List[File] = {
// If the file provided ends with .bam, it is not a bam list, we treat it as a single file.
// If the file provided ends with .bam, .fasta or .fq, it is not a bam list, we treat it as a single file.
// and return a list with only this file.
if (in.toString.endsWith(".bam"))
if (in.toString.endsWith(".bam") || in.toString.endsWith(".fasta") || in.toString.endsWith(".fq"))
return List(in)
var list: List[File] = List()
for (bam <- fromFile(in).getLines)
if (!bam.startsWith("#") && !bam.isEmpty )
list :+= new File(bam.trim())
for (file <- fromFile(in).getLines)
if (!file.startsWith("#") && !file.isEmpty )
list :+= new File(file.trim())
list.sortWith(_.compareTo(_) < 0)
}