diff --git a/build.xml b/build.xml index d3e25d424..9a66d4699 100644 --- a/build.xml +++ b/build.xml @@ -955,8 +955,8 @@ - - + + diff --git a/LICENSE b/licensing/GATK1_LICENSE similarity index 96% rename from LICENSE rename to licensing/GATK1_LICENSE index 634096e2b..648ec8fc3 100644 --- a/LICENSE +++ b/licensing/GATK1_LICENSE @@ -1,4 +1,4 @@ -Copyright (c) 2011 The Broad Institute +Copyright (c) 2012 The Broad Institute Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation diff --git a/licensing/GATK2_beta_license.doc b/licensing/GATK2_beta_license.doc new file mode 100644 index 000000000..4fa04a3f6 Binary files /dev/null and b/licensing/GATK2_beta_license.doc differ diff --git a/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.read.gatkreport.R b/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.read.gatkreport.R index 876cf5cbc..4c228ccb4 100644 --- a/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.read.gatkreport.R +++ b/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.read.gatkreport.R @@ -2,19 +2,19 @@ .gsa.assignGATKTableToEnvironment <- function(tableName, tableHeader, tableRows, tableEnv) { d = data.frame(tableRows, row.names=NULL, stringsAsFactors=FALSE); colnames(d) = tableHeader; - + for (i in 1:ncol(d)) { # use the general type.convert infrastructure of read.table to convert column data to R types v = type.convert(d[,i]) d[,i] = v; } - + usedNames = ls(envir=tableEnv, pattern=tableName); - + if (length(usedNames) > 0) { tableName = paste(tableName, ".", length(usedNames), sep=""); } - + assign(tableName, d, envir=tableEnv); } @@ -28,74 +28,163 @@ starts = c(1, columnStarts); stops = c(columnStarts - 1, nchar(line)); - + sapply(line, splitStartStop)[,1]; } +# Old implementaton for v0.* +gsa.read.gatkreportv0 <- function(lines) { + + tableEnv = new.env(); + + tableName = NA; + tableHeader = c(); + tableRows = c(); + version = NA; + + for (line in 
lines) { + if (length(grep("^##:GATKReport.v", line, ignore.case=TRUE)) > 0) { + headerFields = unlist(strsplit(line, "[[:space:]]+")); + + if (!is.na(tableName)) { + .gsa.assignGATKTableToEnvironment(tableName, tableHeader, tableRows, tableEnv); + } + + tableName = headerFields[2]; + tableHeader = c(); + tableRows = c(); + + # For differences in versions see + # $STING_HOME/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportVersion.java + if (length(grep("^##:GATKReport.v0.1[[:space:]]+", line, ignore.case=TRUE)) > 0) { + version = "v0.1"; + + } else if (length(grep("^##:GATKReport.v0.2[[:space:]]+", line, ignore.case=TRUE)) > 0) { + version = "v0.2"; + columnStarts = c(); + + } + + } else if (length(grep("^[[:space:]]*$", line)) > 0 | length(grep("^[[:space:]]*#", line)) > 0) { + # do nothing + } else if (!is.na(tableName)) { + + if (version == "v0.1") { + row = unlist(strsplit(line, "[[:space:]]+")); + + } else if (version == "v0.2") { + if (length(tableHeader) == 0) { + headerChars = unlist(strsplit(line, "")); + # Find the first position of non space characters, excluding the first character + columnStarts = intersect(grep("[[:space:]]", headerChars, invert=TRUE), grep("[[:space:]]", headerChars) + 1); + } + + row = .gsa.splitFixedWidth(line, columnStarts); + } + + if (length(tableHeader) == 0) { + tableHeader = row; + } else { + tableRows = rbind(tableRows, row); + } + } + } + + if (!is.na(tableName)) { + .gsa.assignGATKTableToEnvironment(tableName, tableHeader, tableRows, tableEnv); + } + + gatkreport = as.list(tableEnv, all.names=TRUE); +} + +# Load all GATKReport v1 tables from file +gsa.read.gatkreportv1 <- function(lines) { + #print("loading with optimized v1 reader") + nLines = length(lines) + tableEnv = new.env(); + + tableName = NA; + tableHeader = c(); + tableRows = NULL; + version = ""; + rowCount = 0 + headerRowCount = -1; + + finishTable <- function() { + .gsa.assignGATKTableToEnvironment(tableName, tableHeader, tableRows[1:rowCount,], 
tableEnv); + } + + for (line in lines) { + + if (length(grep("^#:GATKReport.v1", line, ignore.case=TRUE)) > 0) { + version = "v1.0"; + headerRowCount = 0; + } + + if ( (headerRowCount %% 2 == 1) && (version == "v1.0") ) { + #print("Trying to start a table with line:"); + #print(line); + + #Get table header + headerFields = unlist(strsplit(line, ":")); + + if (!is.na(tableName)) { + finishTable() + } + + tableName = headerFields[3]; + tableHeader = c(); + tableRows = NULL + rowCount = 0 + + columnStarts = c(); + } + + if (length(grep("^#:GATKTable", line, ignore.case=TRUE)) > 0) { + headerRowCount = headerRowCount+1; + #print("Header Row count is at:") + #print(headerRowCount); + } else if (!is.na(tableName)) { + if ( version == "v1.0") { + if (length(tableHeader) == 0) { + headerChars = unlist(strsplit(line, "")); + # Find the first position of non space characters, excluding the first character + columnStarts = intersect(grep("[[:space:]]", headerChars, invert=TRUE), grep("[[:space:]]", headerChars) + 1); + tableRows = matrix(nrow=nLines, ncol=length(columnStarts)+1); + } + + row = .gsa.splitFixedWidth(line, columnStarts); + } + + if (length(tableHeader) == 0) { + tableHeader = row; + } else if ( nchar(line) > 0 ) { + rowCount = rowCount + 1 + tableRows[rowCount,] <- row + } + } + } + + if (!is.na(tableName)) { + finishTable() + } + + gatkreport = as.list(tableEnv, all.names=TRUE); +} + # Load all GATKReport tables from a file gsa.read.gatkreport <- function(filename) { con = file(filename, "r", blocking = TRUE); lines = readLines(con); close(con); - - tableEnv = new.env(); - - tableName = NA; - tableHeader = c(); - tableRows = c(); - version = NA; - - for (line in lines) { - if (length(grep("^##:GATKReport.v", line, ignore.case=TRUE)) > 0) { - headerFields = unlist(strsplit(line, "[[:space:]]+")); - - if (!is.na(tableName)) { - .gsa.assignGATKTableToEnvironment(tableName, tableHeader, tableRows, tableEnv); - } - - tableName = headerFields[2]; - tableHeader = c(); 
- tableRows = c(); - - # For differences in versions see - # $STING_HOME/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportVersion.java - if (length(grep("^##:GATKReport.v0.1[[:space:]]+", line, ignore.case=TRUE)) > 0) { - version = "v0.1"; - - } else if (length(grep("^##:GATKReport.v0.2[[:space:]]+", line, ignore.case=TRUE)) > 0) { - version = "v0.2"; - columnStarts = c(); - - } - - } else if (length(grep("^[[:space:]]*$", line)) > 0 | length(grep("^[[:space:]]*#", line)) > 0) { - # do nothing - } else if (!is.na(tableName)) { - - if (version == "v0.1") { - row = unlist(strsplit(line, "[[:space:]]+")); - - } else if (version == "v0.2") { - if (length(tableHeader) == 0) { - headerChars = unlist(strsplit(line, "")); - # Find the first position of non space characters, excluding the first character - columnStarts = intersect(grep("[[:space:]]", headerChars, invert=TRUE), grep("[[:space:]]", headerChars) + 1); - } - - row = .gsa.splitFixedWidth(line, columnStarts); - } - - if (length(tableHeader) == 0) { - tableHeader = row; - } else { - tableRows = rbind(tableRows, row); - } - } + + # get first line + line = lines[1]; + + if (length(grep("^#:GATKReport.v1", line, ignore.case=TRUE)) > 0) { + gsa.read.gatkreportv1(lines) } - - if (!is.na(tableName)) { - .gsa.assignGATKTableToEnvironment(tableName, tableHeader, tableRows, tableEnv); + else if (length(grep("^##:GATKReport.v0", line, ignore.case=TRUE)) > 0) { + gsa.read.gatkreportv0(lines) } - - gatkreport = as.list(tableEnv, all.names=TRUE); } diff --git a/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.variantqc.utils.R b/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.variantqc.utils.R new file mode 100644 index 000000000..19567e7e6 --- /dev/null +++ b/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.variantqc.utils.R @@ -0,0 +1,244 @@ +library(gplots) +library(ggplot2) + +# ------------------------------------------------------- +# Utilities for displaying multiple plots 
per page +# ------------------------------------------------------- + +distributeGraphRows <- function(graphs, heights = c()) { + # Viewport layout 2 graphs top to bottom with given relative heights + # + # + if (length(heights) == 0) { + heights <- rep.int(1, length(graphs)) + } + heights <- heights[!is.na(graphs)] + graphs <- graphs[!is.na(graphs)] + numGraphs <- length(graphs) + Layout <- grid.layout(nrow = numGraphs, ncol = 1, heights=heights) + grid.newpage() + pushViewport(viewport(layout = Layout)) + subplot <- function(x) viewport(layout.pos.row = x, layout.pos.col = 1) + for (i in 1:numGraphs) { + print(graphs[[i]], vp = subplot(i)) + } +} + +distributeLogGraph <- function(graph, xName) { + continuousGraph <- graph + scale_x_continuous(xName) + logGraph <- graph + scale_x_log10(xName) + opts(title="") + distributeGraphRows(list(continuousGraph, logGraph)) +} + +distributePerSampleGraph <- function(perSampleGraph, distGraph, ratio=c(2,1)) { + distributeGraphRows(list(perSampleGraph, distGraph), ratio) +} + +removeExtraStrats <- function(variantEvalDataFrame, moreToRemove=c()) { + # Remove the standard extra stratification columns FunctionalClass, Novelty, and others in moreToRemove from the variantEvalDataFrame + # + # Only keeps the column marked with "all" for each removed column + # + for ( toRemove in c("FunctionalClass", "Novelty", moreToRemove) ) { + if (toRemove %in% colnames(variantEvalDataFrame)) { + variantEvalDataFrame <- variantEvalDataFrame[variantEvalDataFrame[[toRemove]] == "all",] + } + } + variantEvalDataFrame +} + +openPDF <- function(outputPDF) { + # Open the outputPDF file with standard dimensions, if outputPDF is not NA + if ( ! is.na(outputPDF) ) { + pdf(outputPDF, height=8.5, width=11) + } +} + +closePDF <- function(outputPDF) { + # close the outputPDF file if not NA, and try to compact the PDF if possible + if ( ! 
is.na(outputPDF) ) { + dev.off() + if (exists("compactPDF")) { + compactPDF(outputPDF) + } + } +} + +makeRatioDataFrame <- function(ACs, num, denom, widths = NULL) { + if ( is.null(widths) ) widths <- rep(1, length(ACs)) + + value = NULL + titv <- data.frame(AC=ACs, width = widths, num=num, denom = denom, ratio = num / denom) +} + +.reduceACs <- function(binWidthForAC, ACs) { + # computes data structures necessary to reduce the full range of ACs + # + # binWidthForAC returns the number of upcoming bins that should be merged into + # that AC bin. ACs is a vector of all AC values from 0 to 2N that should be + # merged together + # + # Returns a list containing the reduced ACs starts, their corresponding widths, + # and a map from original ACs to their new ones (1 -> 1, 2 -> 2, 3 -> 2, etc) + maxAC <- max(ACs) + newACs <- c() + widths <- c() + newACMap <- c() + ac <- 0 + while ( ac < maxAC ) { + newACs <- c(newACs, ac) + width <- binWidthForAC(ac) + widths <- c(widths, width) + newACMap <- c(newACMap, rep(ac, width)) + ac <- ac + width + } + list(ACs = newACs, widths=widths, newACMap = newACMap) +} + +# geometricACs <- function(k, ACs) { +# nBins <- round(k * log10(max(ACs))) +# +# binWidthForAC <- function(ac) { +# max(ceiling(ac / nBins), 1) +# } +# +# return(reduceACs(binWidthForAC, ACs)) +# } + +reduce.AC.on.LogLinear.intervals <- function(scaleFactor, ACs) { + # map the full range of AC values onto a log linear scale + # + # Reduce the full AC range onto one where the width of each new AC increases at a rate of + # 10^scaleFactor in size with growing AC values. 
This is primarily useful for accurately + # computing ratios or other quantities by AC that aren't well determined when the AC + # values are very large + # + # Returns a list containing the reduced ACs starts, their corresponding widths, + # and a map from original ACs to their new ones (1 -> 1, 2 -> 2, 3 -> 2, etc) + maxAC <- max(ACs) + afs <- ACs / maxAC + breaks <- 10^(seq(-4, -1, scaleFactor)) + widths <- c() + lastBreak <- 1 + for ( i in length(breaks):1 ) { + b <- breaks[i] + width <- sum(afs < lastBreak & afs >= b) + widths <- c(widths, width) + lastBreak <- b + } + widths <- rev(widths) + + binWidthForAC <- function(ac) { + af <- ac / maxAC + value = 1 + for ( i in length(breaks):1 ) + if ( af >= breaks[i] ) { + value = widths[i] + break + } + + return(value) + } + + return(.reduceACs(binWidthForAC, ACs)) +} + +.remapACs <- function(remapper, k, df) { + newACs <- remapper(k, df$AC) + + n = length(newACs$ACs) + num = rep(0, n) + denom = rep(0, n) + for ( i in 1:dim(df)[1] ) { + rowI = df$AC == i + row = df[rowI,] + newAC = newACs$newACMap[row$AC] + newRowI = newACs$ACs == newAC + num[newRowI] = num[newRowI] + df$num[rowI] + denom[newRowI] = denom[newRowI] + df$denom[rowI] + } + + newdf <- makeRatioDataFrame(newACs$ACs, num, denom, newACs$widths ) + newdf +} + +compute.ratio.on.LogLinear.AC.intervals <- function(ACs, num, denom, scaleFactor = 0.1) { + df = makeRatioDataFrame(ACs, num, denom, 1) + return(.remapACs(reduce.AC.on.LogLinear.intervals, scaleFactor, df)) +} + +plotVariantQC <- function(metrics, measures, requestedStrat = "Sample", + fixHistogramX=F, anotherStrat = NULL, nObsField = "n_indels", + onSamePage=F, facetVariableOnXPerSample = F, facetVariableOnXForDist = T, + moreTitle="", note = NULL) { + metrics$strat = metrics[[requestedStrat]] + + otherFacet = "." + id.vars = c("strat", "nobs") + metrics$nobs <- metrics[[nObsField]] + + # keep track of the other strat and it's implied facet value + if (! 
is.null(anotherStrat)) { + id.vars = c(id.vars, anotherStrat) + otherFacet = anotherStrat + } + + molten <- melt(metrics, id.vars=id.vars, measure.vars=c(measures)) + perSampleGraph <- ggplot(data=molten, aes(x=strat, y=value, group=variable, color=variable, fill=variable)) + + # create the title + titleText=paste(paste(paste(measures, collapse=", "), "by", requestedStrat), moreTitle) + if ( !is.null(note) ) { + titleText=paste(titleText, note, sep="\n") + } + paste(titleText) + title <- opts(title=titleText) + + determineFacet <- function(onX) { + if ( onX ) { + paste(otherFacet, "~ variable") + } else { + paste("variable ~", otherFacet) + } + } + + sampleFacet = determineFacet(facetVariableOnXPerSample) + distFacet = determineFacet(facetVariableOnXForDist) + + if ( requestedStrat == "Sample" ) { + perSampleGraph <- perSampleGraph + geom_text(aes(label=strat), size=1.5) + geom_blank() # don't display a scale + perSampleGraph <- perSampleGraph + scale_x_discrete("Sample (ordered by nSNPs)", formatter=function(x) "") + } else { # by AlleleCount + perSampleGraph <- perSampleGraph + geom_point(aes(size=log10(nobs))) #+ geom_smooth(aes(weight=log10(nobs))) + perSampleGraph <- perSampleGraph + scale_x_log10("AlleleCount") + } + perSampleGraph <- perSampleGraph + ylab("Variable value") + title + perSampleGraph <- perSampleGraph + facet_grid(sampleFacet, scales="free") + + nValues = length(unique(molten$value)) + if (nValues > 2) { + if ( requestedStrat == "Sample" ) { + distGraph <- ggplot(data=molten, aes(x=value, group=variable, fill=variable)) + } else { + distGraph <- ggplot(data=molten, aes(x=value, group=variable, fill=variable, weight=nobs)) + } + distGraph <- distGraph + geom_histogram(aes(y=..ndensity..)) + distGraph <- distGraph + geom_density(alpha=0.5, aes(y=..scaled..)) + distGraph <- distGraph + geom_rug(aes(y=NULL, color=variable, position="jitter")) + scale = "free" + if ( fixHistogramX ) scale = "fixed" + distGraph <- distGraph + facet_grid(distFacet, 
scales=scale) + distGraph <- distGraph + ylab("Relative frequency") + distGraph <- distGraph + xlab("Variable value (see facet for variable by color)") + distGraph <- distGraph + opts(axis.text.x=theme_text(angle=-45)) # , legend.position="none") + } else { + distGraph <- NA + } + + if ( onSamePage ) { + suppressMessages(distributePerSampleGraph(perSampleGraph, distGraph)) + } else { + suppressMessages(print(perSampleGraph)) + suppressMessages(print(distGraph + title)) + } +} diff --git a/public/R/titvFPEst.R b/public/R/titvFPEst.R deleted file mode 100755 index 7af5e8bbb..000000000 --- a/public/R/titvFPEst.R +++ /dev/null @@ -1,138 +0,0 @@ -titvFPEst <- function(titvExpected, titvObserved) { max(min(1 - (titvObserved - 0.5) / (titvExpected - 0.5), 1), 0.001) } - -titvFPEstV <- function(titvExpected, titvs) { - sapply(titvs, function(x) titvFPEst(titvExpected, x)) -} - -calcHet <- function(nknown, knownTiTv, nnovel, novelTiTv, callable) { - TP <- nknown + (1-titvFPEst(knownTiTv, novelTiTv)) * nnovel - 2 * TP / 3 / callable -} - -marginalTiTv <- function( nx, titvx, ny, titvy ) { - tvx = nx / (titvx + 1) - tix = nx - tvx - tvy = ny / (titvy + 1) - tiy = ny - tvy - tiz = tix - tiy - tvz = tvx - tvy - return(tiz / tvz) -} -marginaldbSNPRate <- function( nx, dbx, ny, dby ) { - knownx = nx * dbx / 100 - novelx = nx - knownx - knowny = ny * dby / 100 - novely = ny - knowny - knownz = knownx - knowny - novelz = novelx - novely - return(knownz / ( knownz + novelz ) * 100) -} - -numExpectedCalls <- function(L, theta, calledFractionOfRegion, nIndividuals, dbSNPRate) { - nCalls <- L * theta * calledFractionOfRegion * sum(1 / seq(1, 2 * nIndividuals)) - return(list(nCalls = nCalls, nKnown = dbSNPRate * nCalls, nNovel = (1-dbSNPRate) * nCalls)) -} - -normalize <- function(x) { - x / sum(x) -} - -normcumsum <- function(x) { - cumsum(normalize(x)) -} - -cumhist <- function(d, ...) { - plot(d[order(d)], type="b", col="orange", lwd=2, ...) 
-} - -revcumsum <- function(x) { - return(rev(cumsum(rev(x)))) -} - -phred <- function(x) { - log10(max(x,10^(-9.9)))*-10 -} - -pOfB <- function(b, B, Q) { - #print(paste(b, B, Q)) - p = 1 - 10^(-Q/10) - if ( b == B ) - return(p) - else - return(1 - p) -} - -pOfG <- function(bs, qs, G) { - a1 = G[1] - a2 = G[2] - - log10p = 0 - for ( i in 1:length(bs) ) { - b = bs[i] - q = qs[i] - p1 = pOfB(b, a1, q) / 2 + pOfB(b, a2, q) / 2 - log10p = log10p + log10(p1) - } - - return(log10p) -} - -pOfGs <- function(nAs, nBs, Q) { - bs = c(rep("a", nAs), rep("t", nBs)) - qs = rep(Q, nAs + nBs) - G1 = c("a", "a") - G2 = c("a", "t") - G3 = c("t", "t") - - log10p1 = pOfG(bs, qs, G1) - log10p2 = pOfG(bs, qs, G2) - log10p3 = pOfG(bs, qs, G3) - Qsample = phred(1 - 10^log10p2 / sum(10^(c(log10p1, log10p2, log10p3)))) - - return(list(p1=log10p1, p2=log10p2, p3=log10p3, Qsample=Qsample)) -} - -QsampleExpected <- function(depth, Q) { - weightedAvg = 0 - for ( d in 1:(depth*3) ) { - Qsample = 0 - pOfD = dpois(d, depth) - for ( nBs in 0:d ) { - pOfnB = dbinom(nBs, d, 0.5) - nAs = d - nBs - Qsample = pOfGs(nAs, nBs, Q)$Qsample - #Qsample = 1 - weightedAvg = weightedAvg + Qsample * pOfD * pOfnB - print(as.data.frame(list(d=d, nBs = nBs, pOfD=pOfD, pOfnB = pOfnB, Qsample=Qsample, weightedAvg = weightedAvg))) - } - } - - return(weightedAvg) -} - -plotQsamples <- function(depths, Qs, Qmax) { - cols = rainbow(length(Qs)) - plot(depths, rep(Qmax, length(depths)), type="n", ylim=c(0,Qmax), xlab="Average sequencing coverage", ylab="Qsample", main = "Expected Qsample values, including depth and allele sampling") - - for ( i in 1:length(Qs) ) { - Q = Qs[i] - y = as.numeric(lapply(depths, function(x) QsampleExpected(x, Q))) - points(depths, y, col=cols[i], type="b") - } - - legend("topleft", paste("Q", Qs), fill=cols) -} - -pCallHetGivenDepth <- function(depth, nallelesToCall) { - depths = 0:(2*depth) - pNoAllelesToCall = apply(as.matrix(depths),1,function(d) sum(dbinom(0:nallelesToCall,d,0.5))) - 
dpois(depths,depth)*(1-pNoAllelesToCall) -} - -pCallHets <- function(depth, nallelesToCall) { - sum(pCallHetGivenDepth(depth,nallelesToCall)) -} - -pCallHetMultiSample <- function(depth, nallelesToCall, nsamples) { - 1-(1-pCallHets(depth,nallelesToCall))^nsamples -} diff --git a/public/java/src/org/broadinstitute/sting/commandline/IntervalBinding.java b/public/java/src/org/broadinstitute/sting/commandline/IntervalBinding.java index 0c6096e0c..d1d616c97 100644 --- a/public/java/src/org/broadinstitute/sting/commandline/IntervalBinding.java +++ b/public/java/src/org/broadinstitute/sting/commandline/IntervalBinding.java @@ -83,12 +83,12 @@ public final class IntervalBinding { // TODO -- after ROD system cleanup, go through the ROD system so that we can handle things like gzipped files - FeatureCodec codec = new FeatureManager().getByName(featureIntervals.getTribbleType()).getCodec(); + final FeatureCodec codec = new FeatureManager().getByName(featureIntervals.getTribbleType()).getCodec(); if ( codec instanceof ReferenceDependentFeatureCodec ) ((ReferenceDependentFeatureCodec)codec).setGenomeLocParser(toolkit.getGenomeLocParser()); try { - FileInputStream fis = new FileInputStream(new File(featureIntervals.getSource())); - AsciiLineReader lineReader = new AsciiLineReader(fis); + final FileInputStream fis = new FileInputStream(new File(featureIntervals.getSource())); + final AsciiLineReader lineReader = new AsciiLineReader(fis); codec.readHeader(lineReader); String line = lineReader.readLine(); while ( line != null ) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/CommandLineExecutable.java b/public/java/src/org/broadinstitute/sting/gatk/CommandLineExecutable.java index e5aaf2338..c6bb4a27c 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/CommandLineExecutable.java +++ b/public/java/src/org/broadinstitute/sting/gatk/CommandLineExecutable.java @@ -103,21 +103,6 @@ public abstract class CommandLineExecutable extends CommandLineProgram { 
argumentSources.add(walker); Collection rodBindings = ListFileUtils.unpackRODBindings(parser.getRodBindings(), parser); - - // todo: remove me when the old style system is removed - if ( getArgumentCollection().RODBindings.size() > 0 ) { - logger.warn("################################################################################"); - logger.warn("################################################################################"); - logger.warn("Deprecated -B rod binding syntax detected. This syntax has been eliminated in GATK 1.2."); - logger.warn("Please use arguments defined by each specific walker instead."); - for ( String oldStyleRodBinding : getArgumentCollection().RODBindings ) { - logger.warn(" -B rod binding with value " + oldStyleRodBinding + " tags: " + parser.getTags(oldStyleRodBinding).getPositionalTags()); - } - logger.warn("################################################################################"); - logger.warn("################################################################################"); - System.exit(1); - } - engine.setReferenceMetaDataFiles(rodBindings); for (ReadFilter filter: filters) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java b/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java index 9c59ffe9a..70c6bc734 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java +++ b/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java @@ -100,10 +100,11 @@ public class CommandLineGATK extends CommandLineExecutable { } catch(PicardException e) { // TODO: Should Picard exceptions be, in general, UserExceptions or ReviewedStingExceptions? 
exitSystemWithError(e); - } - catch (SAMException e) { + } catch (SAMException e) { checkForTooManyOpenFilesProblem(e.getMessage()); exitSystemWithSamError(e); + } catch (OutOfMemoryError e) { + exitSystemWithUserError(new UserException.NotEnoughMemory()); } catch (Throwable t) { checkForTooManyOpenFilesProblem(t.getMessage()); exitSystemWithError(t); diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index 50ef4653b..039ca565a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -26,7 +26,9 @@ package org.broadinstitute.sting.gatk; import net.sf.picard.reference.IndexedFastaSequenceFile; import net.sf.picard.reference.ReferenceSequenceFile; -import net.sf.samtools.*; +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMRecord; +import net.sf.samtools.SAMSequenceDictionary; import org.apache.log4j.Logger; import org.broad.tribble.Feature; import org.broadinstitute.sting.commandline.*; @@ -35,8 +37,6 @@ import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; import org.broadinstitute.sting.gatk.datasources.reads.*; import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource; import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; -import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; -import org.broadinstitute.sting.gatk.samples.SampleDB; import org.broadinstitute.sting.gatk.executive.MicroScheduler; import org.broadinstitute.sting.gatk.filters.FilterManager; import org.broadinstitute.sting.gatk.filters.ReadFilter; @@ -45,6 +45,8 @@ import org.broadinstitute.sting.gatk.io.OutputTracker; import org.broadinstitute.sting.gatk.io.stubs.Stub; import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackBuilder; import 
org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet; +import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; +import org.broadinstitute.sting.gatk.samples.SampleDB; import org.broadinstitute.sting.gatk.samples.SampleDBBuilder; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.utils.*; @@ -190,7 +192,7 @@ public class GenomeAnalysisEngine { private BaseRecalibration baseRecalibration = null; public BaseRecalibration getBaseRecalibration() { return baseRecalibration; } public boolean hasBaseRecalibration() { return baseRecalibration != null; } - public void setBaseRecalibration(File recalFile) { baseRecalibration = new BaseRecalibration(recalFile); } + public void setBaseRecalibration(File recalFile, int quantizationLevels) { baseRecalibration = new BaseRecalibration(recalFile, quantizationLevels); } /** * Actually run the GATK with the specified walker. @@ -216,7 +218,7 @@ public class GenomeAnalysisEngine { // if the use specified an input BQSR recalibration table then enable on the fly recalibration if (this.getArguments().BQSR_RECAL_FILE != null) - setBaseRecalibration(this.getArguments().BQSR_RECAL_FILE); + setBaseRecalibration(this.getArguments().BQSR_RECAL_FILE, this.getArguments().quantizationLevels); // Determine how the threads should be divided between CPU vs. IO. 
determineThreadAllocation(); @@ -356,10 +358,6 @@ public class GenomeAnalysisEngine { public BAQ.QualityMode getWalkerBAQQualityMode() { return WalkerManager.getBAQQualityMode(walker); } public BAQ.ApplicationTime getWalkerBAQApplicationTime() { return WalkerManager.getBAQApplicationTime(walker); } - protected boolean generateExtendedEvents() { - return walker.generateExtendedEvents(); - } - protected boolean includeReadsWithDeletionAtLoci() { return walker.includeReadsWithDeletionAtLoci(); } @@ -613,7 +611,7 @@ public class GenomeAnalysisEngine { */ protected GenomeLocSortedSet loadIntervals( List> argList, IntervalSetRule rule ) { - List allIntervals = new ArrayList(0); + List allIntervals = new ArrayList(); for ( IntervalBinding intervalBinding : argList ) { List intervals = intervalBinding.getIntervals(this); @@ -766,7 +764,6 @@ public class GenomeAnalysisEngine { new ValidationExclusion(Arrays.asList(argCollection.unsafe)), filters, includeReadsWithDeletionAtLoci(), - generateExtendedEvents(), getWalkerBAQApplicationTime() == BAQ.ApplicationTime.ON_INPUT ? 
argCollection.BAQMode : BAQ.CalculationMode.OFF, getWalkerBAQQualityMode(), refReader, diff --git a/public/java/src/org/broadinstitute/sting/gatk/ReadProperties.java b/public/java/src/org/broadinstitute/sting/gatk/ReadProperties.java index db22886ce..dc77df071 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/ReadProperties.java +++ b/public/java/src/org/broadinstitute/sting/gatk/ReadProperties.java @@ -36,7 +36,6 @@ public class ReadProperties { private final Collection supplementalFilters; private final boolean includeReadsWithDeletionAtLoci; private final boolean useOriginalBaseQualities; - private final boolean generateExtendedEvents; private final BAQ.CalculationMode cmode; private final BAQ.QualityMode qmode; private final IndexedFastaSequenceFile refReader; // read for BAQ, if desired @@ -52,16 +51,9 @@ public class ReadProperties { return includeReadsWithDeletionAtLoci; } - /** - * Return true if the walker wants to see additional piles of "extended" events (indels). An indel is associated, - * by convention, with the reference base immediately preceding the insertion/deletion, and if this flag is set - * to 'true', any locus with an indel associated with it will cause exactly two subsequent calls to walker's map(): first call - * will be made with a "conventional" base pileup, the next call will be made with a pileup of extended (indel/noevent) - * events. - * @return - */ + @Deprecated public boolean generateExtendedEvents() { - return generateExtendedEvents; + return false; } /** @@ -144,9 +136,6 @@ public class ReadProperties { * @param downsamplingMethod Method for downsampling reads at a given locus. * @param exclusionList what safety checks we're willing to let slide * @param supplementalFilters additional filters to dynamically apply. 
- * @param generateExtendedEvents if true, the engine will issue an extra call to walker's map() with - * a pile of indel/noevent extended events at every locus with at least one indel associated with it - * (in addition to a "regular" call to map() at this locus performed with base pileup) * @param includeReadsWithDeletionAtLoci if 'true', the base pileups sent to the walker's map() method * will explicitly list reads with deletion over the current reference base; otherwise, only observed * bases will be seen in the pileups, and the deletions will be skipped silently. @@ -163,7 +152,6 @@ public class ReadProperties { ValidationExclusion exclusionList, Collection supplementalFilters, boolean includeReadsWithDeletionAtLoci, - boolean generateExtendedEvents, BAQ.CalculationMode cmode, BAQ.QualityMode qmode, IndexedFastaSequenceFile refReader, @@ -176,7 +164,6 @@ public class ReadProperties { this.exclusionList = exclusionList == null ? new ValidationExclusion() : exclusionList; this.supplementalFilters = supplementalFilters; this.includeReadsWithDeletionAtLoci = includeReadsWithDeletionAtLoci; - this.generateExtendedEvents = generateExtendedEvents; this.useOriginalBaseQualities = useOriginalBaseQualities; this.cmode = cmode; this.qmode = qmode; diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java index 02d211a0c..3a1408d59 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java @@ -107,11 +107,6 @@ public class GATKArgumentCollection { @Input(fullName = "reference_sequence", shortName = "R", doc = "Reference sequence file", required = false) public File referenceFile = null; - @Deprecated - @Hidden - @Input(fullName = "rodBind", shortName = "B", doc = "Bindings for reference-ordered data, in the form :, ", 
required = false) - public ArrayList RODBindings = new ArrayList(); - @Argument(fullName = "nonDeterministicRandomSeed", shortName = "ndrs", doc = "Makes the GATK behave non deterministically, that is, the random numbers generated will be different in every run", required = false) public boolean nonDeterministicRandomSeed = false; @@ -198,6 +193,16 @@ public class GATKArgumentCollection { @Input(fullName="BQSR", shortName="BQSR", required=false, doc="Filename for the input covariates table recalibration .csv file which enables on the fly base quality score recalibration") public File BQSR_RECAL_FILE = null; // BUGBUG: need a better argument name once we decide how BQSRs v1 and v2 will live in the code base simultaneously + /** + * Turns on the base quantization module. It requires a recalibration report (-BQSR). + * + * A value of 0 here means "do not quantize". + * Any value greater than zero will be used to recalculate the quantization using this many levels. + * Negative values do nothing (i.e. 
quantize using the recalibration report's quantization level -- same as not providing this parameter at all) + */ + @Argument(fullName="quantize_quals", shortName = "qq", doc = "Quantize quality scores to a given number of levels.", required=false) + public int quantizationLevels = -1; + @Argument(fullName="defaultBaseQualities", shortName = "DBQ", doc = "If reads are missing some or all base quality scores, this value will be used for all base quality scores", required=false) public byte defaultBaseQualities = -1; diff --git a/public/java/src/org/broadinstitute/sting/gatk/contexts/AlignmentContext.java b/public/java/src/org/broadinstitute/sting/gatk/contexts/AlignmentContext.java index 57416d111..9a847d38e 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/contexts/AlignmentContext.java +++ b/public/java/src/org/broadinstitute/sting/gatk/contexts/AlignmentContext.java @@ -98,6 +98,7 @@ public class AlignmentContext implements HasGenomeLocation { * only base pileup. * @return */ + @Deprecated public ReadBackedExtendedEventPileup getExtendedEventPileup() { if(!hasExtendedEventPileup()) throw new ReviewedStingException("No extended event pileup is present."); @@ -115,6 +116,7 @@ public class AlignmentContext implements HasGenomeLocation { * * @return */ + @Deprecated public boolean hasExtendedEventPileup() { return basePileup instanceof ReadBackedExtendedEventPileup; } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/contexts/ReferenceContext.java b/public/java/src/org/broadinstitute/sting/gatk/contexts/ReferenceContext.java index 376064cdb..1290319e2 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/contexts/ReferenceContext.java +++ b/public/java/src/org/broadinstitute/sting/gatk/contexts/ReferenceContext.java @@ -191,6 +191,16 @@ public class ReferenceContext { return basesCache; } + /** + * All the bases in the window from the current base forward to the end of the window. 
+ */ + public byte[] getForwardBases() { + final byte[] bases = getBases(); + final int mid = locus.getStart() - window.getStart(); + // todo -- warning of performance problem, especially if this is called over and over + return new String(bases).substring(mid).getBytes(); + } + @Deprecated public char getBaseAsChar() { return (char)getBase(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalOverlapFilteringIterator.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalOverlapFilteringIterator.java index 4005f1c32..87b356fce 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalOverlapFilteringIterator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalOverlapFilteringIterator.java @@ -28,6 +28,7 @@ import net.sf.samtools.SAMRecord; import net.sf.samtools.util.CloseableIterator; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.sam.AlignmentUtils; import java.util.List; import java.util.NoSuchElementException; @@ -154,8 +155,8 @@ class IntervalOverlapFilteringIterator implements CloseableIterator { } } else { - // Found an unmapped read. We're done. - if(candidateRead.getReadUnmappedFlag()) { + // Found a -L UNMAPPED read. NOTE: this is different than just being flagged as unmapped! We're done. 
+ if(AlignmentUtils.isReadGenomeLocUnmapped(candidateRead)) { nextRead = candidateRead; break; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java index bbcbe6dbc..7f8c35c96 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java @@ -167,7 +167,6 @@ public class SAMDataSource { null, new ValidationExclusion(), new ArrayList(), - false, false); } @@ -185,8 +184,7 @@ public class SAMDataSource { DownsamplingMethod downsamplingMethod, ValidationExclusion exclusionList, Collection supplementalFilters, - boolean includeReadsWithDeletionAtLoci, - boolean generateExtendedEvents) { + boolean includeReadsWithDeletionAtLoci) { this( samFiles, threadAllocation, numFileHandles, @@ -198,7 +196,6 @@ public class SAMDataSource { exclusionList, supplementalFilters, includeReadsWithDeletionAtLoci, - generateExtendedEvents, BAQ.CalculationMode.OFF, BAQ.QualityMode.DONT_MODIFY, null, // no BAQ @@ -215,9 +212,6 @@ public class SAMDataSource { * @param downsamplingMethod Method for downsampling reads at a given locus. * @param exclusionList what safety checks we're willing to let slide * @param supplementalFilters additional filters to dynamically apply. - * @param generateExtendedEvents if true, the engine will issue an extra call to walker's map() with - * a pile of indel/noevent extended events at every locus with at least one indel associated with it - * (in addition to a "regular" call to map() at this locus performed with base pileup) * @param includeReadsWithDeletionAtLoci if 'true', the base pileups sent to the walker's map() method * will explicitly list reads with deletion over the current reference base; otherwise, only observed * bases will be seen in the pileups, and the deletions will be skipped silently. 
@@ -235,7 +229,6 @@ public class SAMDataSource { ValidationExclusion exclusionList, Collection supplementalFilters, boolean includeReadsWithDeletionAtLoci, - boolean generateExtendedEvents, BAQ.CalculationMode cmode, BAQ.QualityMode qmode, IndexedFastaSequenceFile refReader, @@ -308,7 +301,6 @@ public class SAMDataSource { exclusionList, supplementalFilters, includeReadsWithDeletionAtLoci, - generateExtendedEvents, cmode, qmode, refReader, diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/BadCigarFilter.java b/public/java/src/org/broadinstitute/sting/gatk/filters/BadCigarFilter.java index 0987c5d74..6a9642d97 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/filters/BadCigarFilter.java +++ b/public/java/src/org/broadinstitute/sting/gatk/filters/BadCigarFilter.java @@ -40,17 +40,26 @@ public class BadCigarFilter extends ReadFilter { public boolean filterOut(final SAMRecord rec) { Cigar c = rec.getCigar(); - boolean lastElementWasIndel = false; - for ( CigarElement ce : c.getCigarElements() ) { - if ( ce.getOperator() == CigarOperator.D || ce.getOperator() == CigarOperator.I ) { - if ( lastElementWasIndel ) - return true; - lastElementWasIndel = true; - } else { - lastElementWasIndel = false; + boolean previousElementWasIndel = false; + CigarOperator lastOp = c.getCigarElement(0).getOperator(); + + if (lastOp == CigarOperator.D) // filter out reads starting with deletion + return true; + + for (CigarElement ce : c.getCigarElements()) { + CigarOperator op = ce.getOperator(); + if (op == CigarOperator.D || op == CigarOperator.I) { + if (previousElementWasIndel) + return true; // filter out reads with adjacent I/D + + previousElementWasIndel = true; } + else // this is a regular base (match/mismatch/hard or soft clip) + previousElementWasIndel = false; // reset the previous element + + lastOp = op; } - return false; + return lastOp == CigarOperator.D; } } \ No newline at end of file diff --git 
a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterStub.java b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterStub.java index 82cb43634..94051cc7f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterStub.java +++ b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterStub.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2009 The Broad Institute + * Copyright (c) 2012, The Broad Institute * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation @@ -12,7 +12,6 @@ * * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. - * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND @@ -99,8 +98,13 @@ public class VCFWriterStub implements Stub, VCFWriter { /** * Create a new stub given the requested file. + * + * @param engine engine. * @param genotypeFile file to (ultimately) create. * @param isCompressed should we compress the output stream? + * @param argumentSources sources. + * @param skipWritingHeader skip writing header. + * @param doNotWriteGenotypes do not write genotypes. */ public VCFWriterStub(GenomeAnalysisEngine engine, File genotypeFile, boolean isCompressed, Collection argumentSources, boolean skipWritingHeader, boolean doNotWriteGenotypes) { this.engine = engine; @@ -114,8 +118,13 @@ public class VCFWriterStub implements Stub, VCFWriter { /** * Create a new stub given the requested file. + * + * @param engine engine. * @param genotypeStream stream to (ultimately) write. * @param isCompressed should we compress the output stream? + * @param argumentSources sources. + * @param skipWritingHeader skip writing header. + * @param doNotWriteGenotypes do not write genotypes. 
*/ public VCFWriterStub(GenomeAnalysisEngine engine, OutputStream genotypeStream, boolean isCompressed, Collection argumentSources, boolean skipWritingHeader, boolean doNotWriteGenotypes) { this.engine = engine; @@ -154,7 +163,7 @@ public class VCFWriterStub implements Stub, VCFWriter { /** * Gets the master sequence dictionary from the engine associated with this stub * @link GenomeAnalysisEngine.getMasterSequenceDictionary - * @return + * @return the master sequence dictionary from the engine associated with this stub */ public SAMSequenceDictionary getMasterSequenceDictionary() { return engine.getMasterSequenceDictionary(); @@ -188,22 +197,25 @@ public class VCFWriterStub implements Stub, VCFWriter { vcfHeader = header; // Check for the command-line argument header line. If not present, add it in. - if ( !skipWritingHeader ) { - VCFHeaderLine commandLineArgHeaderLine = getCommandLineArgumentHeaderLine(); - boolean foundCommandLineHeaderLine = false; - for (VCFHeaderLine line: vcfHeader.getMetaData()) { - if ( line.getKey().equals(commandLineArgHeaderLine.getKey()) ) - foundCommandLineHeaderLine = true; + if (!skipWritingHeader && header.isWriteEngineHeaders()) { + + if (header.isWriteCommandLine()) { + VCFHeaderLine commandLineArgHeaderLine = getCommandLineArgumentHeaderLine(); + boolean foundCommandLineHeaderLine = false; + for (VCFHeaderLine line: vcfHeader.getMetaData()) { + if ( line.getKey().equals(commandLineArgHeaderLine.getKey()) ) + foundCommandLineHeaderLine = true; + } + if ( !foundCommandLineHeaderLine ) + vcfHeader.addMetaDataLine(commandLineArgHeaderLine); } - if ( !foundCommandLineHeaderLine ) - vcfHeader.addMetaDataLine(commandLineArgHeaderLine); // also put in the reference contig header lines String assembly = getReferenceAssembly(engine.getArguments().referenceFile.getName()); for ( SAMSequenceRecord contig : engine.getReferenceDataSource().getReference().getSequenceDictionary().getSequences() ) 
vcfHeader.addMetaDataLine(getContigHeaderLine(contig, assembly)); - vcfHeader.addMetaDataLine(new VCFHeaderLine("reference", "file://" + engine.getArguments().referenceFile.getAbsolutePath())); + vcfHeader.addMetaDataLine(new VCFHeaderLine(VCFHeader.REFERENCE_KEY, "file://" + engine.getArguments().referenceFile.getAbsolutePath())); } outputTracker.getStorage(this).writeHeader(vcfHeader); @@ -225,7 +237,7 @@ public class VCFWriterStub implements Stub, VCFWriter { /** * Gets a string representation of this object. - * @return + * @return a string representation of this object. */ @Override public String toString() { @@ -247,20 +259,20 @@ public class VCFWriterStub implements Stub, VCFWriter { val = String.format("", contig.getSequenceName(), contig.getSequenceLength(), assembly); else val = String.format("", contig.getSequenceName(), contig.getSequenceLength()); - return new VCFHeaderLine("contig", val); + return new VCFHeaderLine(VCFHeader.CONTIG_KEY, val); } private String getReferenceAssembly(String refPath) { // This doesn't need to be perfect as it's not a required VCF header line, but we might as well give it a shot String assembly = null; - if ( refPath.indexOf("b37") != -1 || refPath.indexOf("v37") != -1 ) + if (refPath.contains("b37") || refPath.contains("v37")) assembly = "b37"; - else if ( refPath.indexOf("b36") != -1 ) + else if (refPath.contains("b36")) assembly = "b36"; - else if ( refPath.indexOf("hg18") != -1 ) + else if (refPath.contains("hg18")) assembly = "hg18"; - else if ( refPath.indexOf("hg19") != -1 ) + else if (refPath.contains("hg19")) assembly = "hg19"; return assembly; } -} \ No newline at end of file +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java index a47c61d0b..8b9674353 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java +++ 
b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java @@ -179,6 +179,11 @@ public class LocusIteratorByState extends LocusIterator { return ( cigarElementCounter + 1 > curElement.getLength() && cigarOffset + 1 < nCigarElements ? cigar.getCigarElement(cigarOffset + 1) : curElement ); } + public CigarElement peekBackwardOnGenome() { + return ( cigarElementCounter - 1 == 0 && cigarOffset - 1 > 0 ? cigar.getCigarElement(cigarOffset - 1) : curElement ); + } + + public CigarOperator stepForwardOnGenome() { // we enter this method with readOffset = index of the last processed base on the read // (-1 if we did not process a single base yet); this can be last matching base, or last base of an insertion @@ -194,7 +199,7 @@ public class LocusIteratorByState extends LocusIterator { return stepForwardOnGenome(); } else { if (curElement != null && curElement.getOperator() == CigarOperator.D) - throw new UserException.MalformedBAM(read, "read ends with deletion. Cigar: " + read.getCigarString()); + throw new UserException.MalformedBAM(read, "read ends with deletion. Cigar: " + read.getCigarString() + ". This is an indication of a malformed file, but the SAM spec allows reads ending in deletion. If you are sure you want to use this read, re-run your analysis with the extra option: -rf BadCigar"); // Reads that contain indels model the genomeOffset as the following base in the reference. 
Because // we fall into this else block only when indels end the read, increment genomeOffset such that the @@ -231,7 +236,7 @@ public class LocusIteratorByState extends LocusIterator { // we see insertions only once, when we step right onto them; the position on the read is scrolled // past the insertion right after that if (eventDelayedFlag > 1) - throw new UserException.MalformedBAM(read, String.format("Adjacent I/D events in read %s -- cigar: %s", read.getReadName(), read.getCigarString())); + throw new UserException.MalformedBAM(read, String.format("Adjacent I/D events in read %s -- cigar: %s. This is an indication of a malformed file, but the SAM spec allows reads with adjacent insertion/deletion. If you are sure you want to use this read, re-run your analysis with the extra option: -rf BadCigar", read.getReadName(), read.getCigarString())); insertedBases = Arrays.copyOfRange(read.getReadBases(), readOffset + 1, readOffset + 1 + curElement.getLength()); eventLength = curElement.getLength(); eventStart = readOffset; @@ -244,13 +249,13 @@ public class LocusIteratorByState extends LocusIterator { break; case D: // deletion w.r.t. the reference if (readOffset < 0) // we don't want reads starting with deletion, this is a malformed cigar string - throw new UserException.MalformedBAM(read, "Read starting with deletion. Cigar: " + read.getCigarString()); + throw new UserException.MalformedBAM(read, "Read starting with deletion. Cigar: " + read.getCigarString() + ". This is an indication of a malformed file, but the SAM spec allows reads starting in deletion. If you are sure you want to use this read, re-run your analysis with the extra option: -rf BadCigar"); if (generateExtendedEvents) { if (cigarElementCounter == 1) { // generate an extended event only if we just stepped into the deletion (i.e. don't // generate the event at every deleted position on the ref, that's what cigarElementCounter==1 is for!) 
if (eventDelayedFlag > 1) - throw new UserException.MalformedBAM(read, String.format("Adjacent I/D events in read %s -- cigar: %s", read.getReadName(), read.getCigarString())); + throw new UserException.MalformedBAM(read, String.format("Adjacent I/D events in read %s -- cigar: %s. This is an indication of a malformed file, but the SAM spec allows reads with adjacent insertion/deletion. If you are sure you want to use this read, re-run your analysis with the extra option: -rf BadCigar", read.getReadName(), read.getCigarString())); eventLength = curElement.getLength(); eventDelayedFlag = 2; // deletion on the ref causes an immediate return, so we have to delay by 1 only eventStart = readOffset; @@ -401,24 +406,24 @@ public class LocusIteratorByState extends LocusIterator { while (iterator.hasNext()) { final SAMRecordState state = iterator.next(); - final GATKSAMRecord read = (GATKSAMRecord) state.getRead(); // the actual read - final CigarOperator op = state.getCurrentCigarOperator(); // current cigar operator - final int readOffset = state.getReadOffset(); // the base offset on this read - final int eventStartOffset = state.getReadEventStartOffset(); // this will be -1 if base is not a deletion, or if base is the first deletion in the event. Otherwise, it will give the last base before the deletion began. + final GATKSAMRecord read = (GATKSAMRecord) state.getRead(); // the actual read + final CigarOperator op = state.getCurrentCigarOperator(); // current cigar operator + final int readOffset = state.getReadOffset(); // the base offset on this read + final int eventStartOffset = state.getReadEventStartOffset(); // this will be -1 if base is not a deletion, or if base is the first deletion in the event. Otherwise, it will give the last base before the deletion began. 
final int eventLength = state.getEventLength(); - if (op == CigarOperator.N) // N's are never added to any pileup + if (op == CigarOperator.N) // N's are never added to any pileup continue; - if (state.hadIndel()) { // this read has an indel associated with the previous position on the ref + if (state.hadIndel()) { // this read has an indel associated with the previous position on the ref size++; ExtendedEventPileupElement pileupElement; - if (state.getEventBases() == null) { // Deletion event + if (state.getEventBases() == null) { // Deletion event nDeletions++; maxDeletionLength = Math.max(maxDeletionLength, state.getEventLength()); pileupElement = new ExtendedEventPileupElement(read, eventStartOffset, eventLength); } - else { // Insertion event + else { // Insertion event nInsertions++; pileupElement = new ExtendedEventPileupElement(read, eventStartOffset, eventLength, state.getEventBases()); } @@ -442,10 +447,10 @@ public class LocusIteratorByState extends LocusIterator { if (indelPile.size() != 0) fullExtendedEventPileup.put(sample, new ReadBackedExtendedEventPileupImpl(loc, indelPile, size, maxDeletionLength, nInsertions, nDeletions, nMQ0Reads)); } - hasExtendedEvents = false; // we are done with extended events prior to current ref base + hasExtendedEvents = false; // we are done with extended events prior to current ref base nextAlignmentContext = new AlignmentContext(loc, new ReadBackedExtendedEventPileupImpl(loc, fullExtendedEventPileup), hasBeenSampled); } - else { // this is a regular event pileup (not extended) + else { // this is a regular event pileup (not extended) GenomeLoc location = getLocation(); Map fullPileup = new HashMap(); boolean hasBeenSampled = false; @@ -454,27 +459,34 @@ public class LocusIteratorByState extends LocusIterator { List pile = new ArrayList(readStates.size(sample)); hasBeenSampled |= location.getStart() <= readStates.getDownsamplingExtent(sample); - size = 0; // number of elements in this sample's pileup - nDeletions = 0; 
// number of deletions in this sample's pileup - nMQ0Reads = 0; // number of MQ0 reads in this sample's pileup (warning: current implementation includes N bases that are MQ0) + size = 0; // number of elements in this sample's pileup + nDeletions = 0; // number of deletions in this sample's pileup + nMQ0Reads = 0; // number of MQ0 reads in this sample's pileup (warning: current implementation includes N bases that are MQ0) while (iterator.hasNext()) { - final SAMRecordState state = iterator.next(); // state object with the read/offset information - final GATKSAMRecord read = (GATKSAMRecord) state.getRead(); // the actual read - final CigarOperator op = state.getCurrentCigarOperator(); // current cigar operator - final CigarElement nextElement = state.peekForwardOnGenome(); // next cigar element - final CigarOperator nextOp = nextElement.getOperator(); - final int readOffset = state.getReadOffset(); // the base offset on this read - + final SAMRecordState state = iterator.next(); // state object with the read/offset information + final GATKSAMRecord read = (GATKSAMRecord) state.getRead(); // the actual read + final CigarOperator op = state.getCurrentCigarOperator(); // current cigar operator + final CigarElement nextElement = state.peekForwardOnGenome(); // next cigar element + final CigarElement lastElement = state.peekBackwardOnGenome(); // last cigar element + final CigarOperator nextOp = nextElement.getOperator(); // next cigar operator + final CigarOperator lastOp = lastElement.getOperator(); // last cigar operator + final int readOffset = state.getReadOffset(); // the base offset on this read + + final boolean isBeforeDeletion = nextOp == CigarOperator.DELETION; + final boolean isAfterDeletion = lastOp == CigarOperator.DELETION; + final boolean isBeforeInsertion = nextOp == CigarOperator.INSERTION; + final boolean isAfterInsertion = lastOp == CigarOperator.INSERTION; + final boolean isNextToSoftClip = nextOp == CigarOperator.S || (state.getGenomeOffset() == 0 
&& read.getSoftStart() != read.getAlignmentStart()); + int nextElementLength = nextElement.getLength(); - if (op == CigarOperator.N) // N's are never added to any pileup + if (op == CigarOperator.N) // N's are never added to any pileup continue; if (op == CigarOperator.D) { - if (readInfo.includeReadsWithDeletionAtLoci()) { // only add deletions to the pileup if we are authorized to do so - pile.add(new PileupElement(read, readOffset, true, nextOp == CigarOperator.D, nextOp == CigarOperator.I, nextOp == CigarOperator.S || (state.getGenomeOffset() == 0 && read.getSoftStart() != read.getAlignmentStart()), - null,nextOp == CigarOperator.D? nextElementLength:-1)); + if (readInfo.includeReadsWithDeletionAtLoci()) { // only add deletions to the pileup if we are authorized to do so + pile.add(new PileupElement(read, readOffset, true, isBeforeDeletion, isAfterDeletion, isBeforeInsertion, isAfterInsertion, isNextToSoftClip, null, nextOp == CigarOperator.D ? nextElementLength : -1)); size++; nDeletions++; if (read.getMappingQuality() == 0) @@ -484,11 +496,10 @@ public class LocusIteratorByState extends LocusIterator { else { if (!filterBaseInRead(read, location.getStart())) { String insertedBaseString = null; - if (nextOp == CigarOperator.I) { + if (nextOp == CigarOperator.I) insertedBaseString = new String(Arrays.copyOfRange(read.getReadBases(), readOffset + 1, readOffset + 1 + nextElement.getLength())); - } - pile.add(new PileupElement(read, readOffset, false, nextOp == CigarOperator.D, nextOp == CigarOperator.I, nextOp == CigarOperator.S || (state.getGenomeOffset() == 0 && read.getSoftStart() != read.getAlignmentStart()), - insertedBaseString,nextElementLength)); + + pile.add(new PileupElement(read, readOffset, false, isBeforeDeletion, isAfterDeletion, isBeforeInsertion, isAfterInsertion, isNextToSoftClip, insertedBaseString, nextElementLength)); size++; if (read.getMappingQuality() == 0) nMQ0Reads++; diff --git 
a/public/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java index 286e22369..2c2ee51bb 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java @@ -47,6 +47,14 @@ public class RefMetaDataTracker { // // ------------------------------------------------------------------------------------------ + /** + * Only for testing -- not accessible in any other context + */ + public RefMetaDataTracker() { + ref = null; + map = Collections.emptyMap(); + } + public RefMetaDataTracker(final Collection allBindings, final ReferenceContext ref) { this.ref = ref; @@ -418,7 +426,7 @@ public class RefMetaDataTracker { * with the current site as a RODRecordList List object. If no data track with specified name is available, * returns defaultValue wrapped as RODRecordList object. NOTE: if defaultValue is null, it will be wrapped up * with track name set to 'name' and location set to null; otherwise the wrapper object will have name and - * location set to defaultValue.getName() and defaultValue.getLocation(), respectively (use caution, + * location set to defaultValue.getID() and defaultValue.getLocation(), respectively (use caution, * defaultValue.getLocation() may be not equal to what RODRecordList's location would be expected to be otherwise: * for instance, on locus traversal, location is usually expected to be a single base we are currently looking at, * regardless of the presence of "extended" RODs overlapping with that location). 
diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/FeatureManager.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/FeatureManager.java index fcd85fd1d..55dd50334 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/FeatureManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/FeatureManager.java @@ -132,7 +132,7 @@ public class FeatureManager { } /** - * Return the FeatureDescriptor with getName().equals(name) + * Return the FeatureDescriptor with getID().equals(name) * * @param name * @return A FeatureDescriptor or null if none is found diff --git a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReport.java b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReport.java index 608b5d1d0..f2291e5ec 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReport.java +++ b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReport.java @@ -1,8 +1,32 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + package org.broadinstitute.sting.gatk.report; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.StingException; -import org.broadinstitute.sting.utils.text.TextFormattingUtils; +import org.broadinstitute.sting.utils.exceptions.UserException; import java.io.*; import java.util.Collection; @@ -13,8 +37,12 @@ import java.util.TreeMap; * Container class for GATK report tables */ public class GATKReport { - public static final String GATKREPORT_HEADER_PREFIX = "##:GATKReport.v"; - private TreeMap tables = new TreeMap(); + public static final String GATKREPORT_HEADER_PREFIX = "#:GATKReport."; + public static final GATKReportVersion LATEST_REPORT_VERSION = GATKReportVersion.V1_0; + private static final String SEPARATOR = ":"; + private GATKReportVersion version = LATEST_REPORT_VERSION; + + private final TreeMap tables = new TreeMap(); /** * Create a new, empty GATKReport. @@ -24,7 +52,8 @@ public class GATKReport { /** * Create a new GATKReport with the contents of a GATKReport on disk. - * @param filename the path to the file to load + * + * @param filename the path to the file to load */ public GATKReport(String filename) { this(new File(filename)); @@ -32,114 +61,96 @@ public class GATKReport { /** * Create a new GATKReport with the contents of a GATKReport on disk. 
- * @param file the file to load + * + * @param file the file to load */ public GATKReport(File file) { loadReport(file); } /** - * Load a GATKReport file from disk - * @param file the file to load + * Create a new GATK report from GATK report tables + * @param tables Any number of tables that you want to add to the report */ - private void loadReport(File file) { - try { - BufferedReader reader = new BufferedReader(new FileReader(file)); - - GATKReportTable table = null; - String[] header = null; - int id = 0; - GATKReportVersion version = null; - List columnStarts = null; - - String line; - while ( (line = reader.readLine()) != null ) { - - if (line.startsWith(GATKREPORT_HEADER_PREFIX)) { - - version = GATKReportVersion.fromHeader(line); - - line = line.replaceFirst("##:GATKReport." + version.versionString + " ", ""); - String[] pieces = line.split(" : "); - - String tableName = pieces[0]; - String tableDesc = pieces[1]; - - addTable(tableName, tableDesc); - table = getTable(tableName); - table.setVersion(version); - - header = null; - columnStarts = null; - } else if ( line.trim().isEmpty() ) { - // do nothing - } else { - if (table != null) { - - String[] splitLine; - - switch (version) { - case V0_1: - splitLine = TextFormattingUtils.splitWhiteSpace(line); - break; - - case V0_2: - if (header == null) { - columnStarts = TextFormattingUtils.getWordStarts(line); - } - splitLine = TextFormattingUtils.splitFixedWidth(line, columnStarts); - break; - - default: - throw new ReviewedStingException("GATK report version parsing not implemented for: " + line); - } - - if (header == null) { - header = splitLine; - - table.addPrimaryKey("id", false); - - for ( String columnName : header ) { - table.addColumn(columnName, ""); - } - - id = 0; - } else { - for (int columnIndex = 0; columnIndex < header.length; columnIndex++) { - table.set(id, header[columnIndex], splitLine[columnIndex]); - } - - id++; - } - } - } - } - } catch (FileNotFoundException e) { - throw new 
StingException("Cannot read GATKReport: " + e); - } catch (IOException e) { - throw new StingException("Cannot read GATKReport: " + e); - } + public GATKReport(GATKReportTable... tables) { + for( GATKReportTable table: tables) + addTable(table); } /** - * Add a new table to the collection + * Load a GATKReport file from disk * - * @param tableName the name of the table - * @param tableDescription the description of the table + * @param file the file to load + */ + private void loadReport(File file) { + BufferedReader reader; + String reportHeader; + try { + reader = new BufferedReader(new FileReader(file)); + reportHeader = reader.readLine(); + } catch (FileNotFoundException e) { + throw new ReviewedStingException("Could not open file : " + file); + } catch (IOException e) { + throw new ReviewedStingException("Could not read file : " + file); + } + + + // Read the first line for the version and number of tables. + version = GATKReportVersion.fromHeader(reportHeader); + if (version.equals(GATKReportVersion.V0_1) || + version.equals(GATKReportVersion.V0_2)) + throw new UserException("The GATK no longer supports reading legacy GATK Reports. 
Please use v1.0 or newer."); + + int nTables = Integer.parseInt(reportHeader.split(":")[2]); + + // Read each tables according ot the number of tables + for (int i = 0; i < nTables; i++) { + addTable(new GATKReportTable(reader, version)); + } + + + } + + /** + * Add a new, empty table to the report + * + * @param tableName the name of the table + * @param tableDescription the description of the table */ public void addTable(String tableName, String tableDescription) { addTable(tableName, tableDescription, true); } + /** + * Add a new, empty table to the report + * + * @param tableName the name of the table + * @param tableDescription the description of the table + * @param sortByPrimaryKey whether to sort the rows by the primary key + */ public void addTable(String tableName, String tableDescription, boolean sortByPrimaryKey) { GATKReportTable table = new GATKReportTable(tableName, tableDescription, sortByPrimaryKey); tables.put(tableName, table); } + /** + * Adds a table, empty or populated, to the report + * + * @param table the table to add + */ + public void addTable(GATKReportTable table) { + tables.put(table.getTableName(), table); + } + + public void addTables(List gatkReportTables) { + for (GATKReportTable table : gatkReportTables) + addTable(table); + } + /** * Return true if table with a given name exists * - * @param tableName the name of the table + * @param tableName the name of the table * @return true if the table exists, false otherwise */ public boolean hasTable(String tableName) { @@ -149,8 +160,8 @@ public class GATKReport { /** * Return a table with a given name * - * @param tableName the name of the table - * @return the table object + * @param tableName the name of the table + * @return the table object */ public GATKReportTable getTable(String tableName) { GATKReportTable table = tables.get(tableName); @@ -162,17 +173,164 @@ public class GATKReport { /** * Print all tables contained within this container to a PrintStream * - * @param out the 
PrintStream to which the tables should be written + * @param out the PrintStream to which the tables should be written */ public void print(PrintStream out) { - for (GATKReportTable table : tables.values()) { - if (table.getNumRows() > 0) { - table.write(out); - } - } + out.println(GATKREPORT_HEADER_PREFIX + getVersion().toString() + SEPARATOR + getTables().size()); + for (GATKReportTable table : tables.values()) + table.write(out); } public Collection getTables() { return tables.values(); } + + /** + * This is the main function is charge of gathering the reports. It checks that the reports are compatible and then + * calls the table atheirng functions. + * + * @param input another GATKReport of the same format + */ + public void combineWith(GATKReport input) { + + if (!this.isSameFormat(input)) { + throw new ReviewedStingException("Failed to combine GATKReport, format doesn't match!"); + } + + for (String tableName : input.tables.keySet()) { + tables.get(tableName).combineWith(input.getTable(tableName)); + } + + } + + public GATKReportVersion getVersion() { + return version; + } + + /** + * Returns whether or not the two reports have the same format, from columns, to tables, to reports, and everything + * in between. This does not check if the data inside is the same. This is the check to see if the two reports are + * gatherable or reduceable. + * + * @param report another GATK report + * @return true if the the reports are gatherable + */ + public boolean isSameFormat(GATKReport report) { + if (!version.equals(report.version)) { + return false; + } + if (!tables.keySet().equals(report.tables.keySet())) { + return false; + } + for (String tableName : tables.keySet()) { + if (!getTable(tableName).isSameFormat(report.getTable(tableName))) + return false; + } + return true; + } + + /** + * Checks that the reports are exactly the same. + * + * @param report another GATK report + * @return true if all field in the reports, tables, and columns are equal. 
+ */ + public boolean equals(GATKReport report) { + if (!version.equals(report.version)) { + return false; + } + if (!tables.keySet().equals(report.tables.keySet())) { + return false; + } + for (String tableName : tables.keySet()) { + if (!getTable(tableName).equals(report.getTable(tableName))) + return false; + } + return true; + } + + /** + * The constructor for a simplified GATK Report. Simplified GATK report are designed for reports that do not need + * the advanced functionality of a full GATK Report. + *

+ * A simple GATK Report consists of: + *

+ * - A single table + * - No primary key ( it is hidden ) + *

+ * Optional: + * - Only untyped columns. As long as the data is an Object, it will be accepted. + * - Default column values being empty strings. + *

+ * Limitations: + *

+ * - A simple GATK report cannot contain multiple tables. + * - It cannot contain typed columns, which prevents arithmetic gathering. + * + * @param tableName The name of your simple GATK report table + * @param columns The names of the columns in your table + * @return a simplified GATK report + */ + public static GATKReport newSimpleReport(String tableName, String... columns) { + GATKReportTable table = new GATKReportTable(tableName, "A simplified GATK table report"); + table.addPrimaryKey("id", false); + + for (String column : columns) { + table.addColumn(column, ""); + } + + GATKReport output = new GATKReport(); + output.addTable(table); + + return output; + } + + /** + * This method provides an efficient way to populate a simplified GATK report. This method will only work on reports + * that qualify as simplified GATK reports. See the newSimpleReport() constructor for more information. + * + * @param values the row of data to be added to the table. + * Note: the number of arguments must match the columns in the table. + */ + public void addRow(Object... 
values) { + // Must be a simplified GATK Report + if (isSimpleReport()) { + + GATKReportTable table = tables.firstEntry().getValue(); + if (table.getColumns().size() != values.length) { + throw new StingException("The number of arguments in addRow() must match the number of columns in the table"); + } + + int counter = table.getNumRows() + 1; + int i = 0; + + for (String columnName : table.getColumns().keySet()) { + table.set(counter, columnName, values[i]); + i++; + } + + } else { + throw new StingException("Cannot add a Row to a non-Simplified GATK Report"); + } + + + } + + /** + * Checks if the GATK report qualifies as a "simple" GATK report + * + * @return true is the report is a simplified GATK report + */ + private boolean isSimpleReport() { + if (tables.size() != 1) + return false; + + GATKReportTable table = tables.firstEntry().getValue(); + + if (!table.getPrimaryKeyName().equals("id")) + return false; + + return true; + + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportColumn.java b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportColumn.java index 5a6490afe..1e798143a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportColumn.java +++ b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportColumn.java @@ -1,38 +1,78 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + package org.broadinstitute.sting.gatk.report; import org.apache.commons.lang.math.NumberUtils; -import java.util.*; +import java.util.Arrays; +import java.util.Collection; +import java.util.LinkedHashMap; /** * Holds values for a column in a GATK report table */ -public class GATKReportColumn extends TreeMap { +public class GATKReportColumn extends LinkedHashMap { final private String columnName; final private Object defaultValue; final private String format; final private boolean display; + final private GATKReportDataType dataType; + + private GATKReportColumnFormat columnFormat; + private GATKReportColumnFormat.Alignment alignment = GATKReportColumnFormat.Alignment.RIGHT; // default alignment is to the right unless values added ask for a left alignment + private int maxWidth = 0; /** - * Construct the column object, specifying the column name, default value, and whether or not the column should be displayed + * Construct the column object, specifying the column name, default value, whether or not the column should be + * displayed, and the format string. This cannot be null. 
* - * @param columnName the name of the column - * @param defaultValue the default value of the column - * @param display if true, the column will be displayed in the final output - * @param format format string + * @param columnName the name of the column + * @param defaultValue the default value of the column + * @param display if true, the column will be displayed in the final output + * @param format format string */ public GATKReportColumn(String columnName, Object defaultValue, boolean display, String format) { this.columnName = columnName; - this.defaultValue = defaultValue; + this.maxWidth = columnName.length(); this.display = display; - this.format = format == null ? null : (format.equals("") ? null : format); + if ( format.equals("") ) { + this.format = "%s"; + this.dataType = GATKReportDataType.Unknown; + this.defaultValue = (defaultValue != null) ? defaultValue : ""; + } + else { + this.format = format; + this.dataType = GATKReportDataType.fromFormatString(format); + this.defaultValue = (defaultValue != null) ? defaultValue : dataType.getDefaultValue(); + } } - /** * Initialize an element in the column with a default value * - * @param primaryKey the primary key position in the column that should be set + * @param primaryKey the primary key position in the column that should be set */ public void initialize(Object primaryKey) { this.put(primaryKey, defaultValue); @@ -40,11 +80,12 @@ public class GATKReportColumn extends TreeMap { /** * Return an object from the column, but if it doesn't exist, return the default value. This is useful when writing - * tables, as the table gets written properly without having to waste storage for the unset elements (usually the zero + * tables, as the table gets written properly without having to waste storage for the unset elements (usually the + * zero * values) in the table. 
* - * @param primaryKey the primary key position in the column that should be retrieved - * @return the value at the specified position in the column, or the default value if the element is not set + * @param primaryKey the primary key position in the column that should be retrieved + * @return the value at the specified position in the column, or the default value if the element is not set */ private Object getWithoutSideEffects(Object primaryKey) { if (!this.containsKey(primaryKey)) { @@ -57,8 +98,8 @@ public class GATKReportColumn extends TreeMap { /** * Return an object from the column, but if it doesn't exist, return the default value. * - * @param primaryKey the primary key position in the column that should be retrieved - * @return the string value at the specified position in the column, or the default value if the element is not set + * @param primaryKey the primary key position in the column that should be retrieved + * @return the string value at the specified position in the column, or the default value if the element is not set */ public String getStringValue(Object primaryKey) { return formatValue(getWithoutSideEffects(primaryKey)); @@ -68,38 +109,24 @@ public class GATKReportColumn extends TreeMap { * Return the displayable property of the column. If true, the column will be displayed in the final output. * If not, printing will be suppressed for the contents of the table. * - * @return true if the column will be displayed, false if otherwise + * @return true if the column will be displayed, false if otherwise */ public boolean isDisplayable() { return display; } /** - * Get the display width for this column. This allows the entire column to be displayed with the appropriate, fixed width. + * Get the display width for this column. This allows the entire column to be displayed with the appropriate, fixed + * width. 
+ * * @return the format string for this column */ public GATKReportColumnFormat getColumnFormat() { - int maxWidth = columnName.length(); - GATKReportColumnFormat.Alignment alignment = GATKReportColumnFormat.Alignment.RIGHT; + if (columnFormat != null) + return columnFormat; - for (Object obj : this.values()) { - if (obj != null) { - String formatted = formatValue(obj); - - int width = formatted.length(); - if (width > maxWidth) { - maxWidth = width; - } - - if (alignment == GATKReportColumnFormat.Alignment.RIGHT) { - if (!isRightAlign(formatted)) { - alignment = GATKReportColumnFormat.Alignment.LEFT; - } - } - } - } - - return new GATKReportColumnFormat(maxWidth, alignment); + columnFormat = new GATKReportColumnFormat(maxWidth, alignment); + return columnFormat; } private static final Collection RIGHT_ALIGN_STRINGS = Arrays.asList( @@ -112,15 +139,17 @@ public class GATKReportColumn extends TreeMap { /** * Check if the value can be right aligned. Does not trim the values before checking if numeric since it assumes * the spaces mean that the value is already padded. + * * @param value to check * @return true if the value is a right alignable */ protected static boolean isRightAlign(String value) { - return value == null || RIGHT_ALIGN_STRINGS.contains(value) || NumberUtils.isNumber(value); + return value == null || RIGHT_ALIGN_STRINGS.contains(value) || NumberUtils.isNumber(value.trim()); } /** * Returns a string version of the values. 
+ * * @param obj The object to convert to a string * @return The string representation of the column */ @@ -128,19 +157,76 @@ public class GATKReportColumn extends TreeMap { String value; if (obj == null) { value = "null"; - } else if ( format != null ) { - value = String.format(format, obj); - } else if (obj instanceof Float) { - value = String.format("%.8f", (Float) obj); - } else if (obj instanceof Double) { - value = String.format("%.8f", (Double) obj); - } else { - value = obj.toString(); } + else if ( dataType.equals(GATKReportDataType.Unknown) && (obj instanceof Double || obj instanceof Float) ) { + value = String.format("%.8f", obj); + } + else + value = String.format(format, obj); + return value; } + public GATKReportDataType getDataType() { + return dataType; + } + + public boolean isSameFormat(GATKReportColumn that) { + return (dataType.equals(that.dataType) && + columnName.equals(that.columnName) && + display == that.display && + format.equals(that.format) && + defaultValue.equals(that.defaultValue) ); + } + + boolean equals(GATKReportColumn that) { + if ( !this.keySet().equals(that.keySet()) ) { + return false; + } + + for (Object key : keySet()) { + Object ValueA = this.get(key); + Object ValueB = that.get(key); + + //if the value is not equal, (use data type to get the right comparison) + if (!dataType.isEqual(ValueA, ValueB)) { + return false; + } + } + + return true; + } + public String getColumnName() { return columnName; } + + public String getFormat() { + if ( dataType.equals(GATKReportDataType.Unknown) ) { + return ""; + } + else + return format; + } + + @Override + public Object put(Object key, Object value) { + if (value != null) { + String formatted = formatValue(value); + if (!formatted.equals("")) { + updateMaxWidth(formatted); + updateFormat(formatted); + } + } + return super.put(key, value); + } + + private void updateMaxWidth(String formatted) { + maxWidth = Math.max(formatted.length(), maxWidth); + } + + private void 
updateFormat(String formatted) { + if (alignment == GATKReportColumnFormat.Alignment.RIGHT) + alignment = isRightAlign(formatted) ? GATKReportColumnFormat.Alignment.RIGHT : GATKReportColumnFormat.Alignment.LEFT; + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportColumnFormat.java b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportColumnFormat.java index 6d19a83aa..79ae9b8bd 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportColumnFormat.java +++ b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportColumnFormat.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011, The Broad Institute + * Copyright (c) 2012, The Broad Institute * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation @@ -29,8 +29,8 @@ package org.broadinstitute.sting.gatk.report; */ public class GATKReportColumnFormat { public static enum Alignment { LEFT, RIGHT } - public int width; - public Alignment alignment; + private final int width; + private final Alignment alignment; public GATKReportColumnFormat(int width, Alignment alignment) { this.width = width; diff --git a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportColumns.java b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportColumns.java index a73123b6c..bb6e3a4f1 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportColumns.java +++ b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportColumns.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011, The Broad Institute + * Copyright (c) 2012, The Broad Institute * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation @@ -24,7 +24,7 @@ package org.broadinstitute.sting.gatk.report; -import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import 
java.util.*; @@ -32,10 +32,11 @@ import java.util.*; * Tracks a linked list of GATKReportColumn in order by name. */ public class GATKReportColumns extends LinkedHashMap implements Iterable { - private List columnNames = new ArrayList(); + private final List columnNames = new ArrayList(); /** * Returns the column by index + * * @param i the index * @return The column */ @@ -44,9 +45,12 @@ public class GATKReportColumns extends LinkedHashMap i } @Override - public GATKReportColumn remove(Object key) { - columnNames.remove(key); - return super.remove(key); + public GATKReportColumn remove(Object columnName) { + if ( !(columnName instanceof String) ) { + throw new ReviewedStingException("The column name must be a String!"); + } + columnNames.remove(columnName.toString()); + return super.remove(columnName); } @Override @@ -59,9 +63,44 @@ public class GATKReportColumns extends LinkedHashMap i public Iterator iterator() { return new Iterator() { int offset = 0; - public boolean hasNext() { return offset < columnNames.size() ; } - public GATKReportColumn next() { return getByIndex(offset++); } - public void remove() { throw new UnsupportedOperationException("Cannot remove from a GATKReportColumn iterator"); } + + public boolean hasNext() { + return offset < columnNames.size(); + } + + public GATKReportColumn next() { + return getByIndex(offset++); + } + + public void remove() { + throw new UnsupportedOperationException("Cannot remove from a GATKReportColumn iterator"); + } }; } + + public boolean isSameFormat(GATKReportColumns that) { + if (!columnNames.equals(that.columnNames)) { + return false; + } + for (String columnName : columnNames) { + if (!this.get(columnName).isSameFormat(that.get(columnName))) { + return false; + } + } + return true; + } + + boolean equals(GATKReportColumns that) { + for (Map.Entry pair : entrySet()) { + // Make sure that every column is the same, we know that the # of columns + // is the same from isSameFormat() + String key = pair.getKey(); + 
+ if (!get(key).equals(that.get(key))) { + return false; + } + } + + return true; + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportDataType.java b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportDataType.java new file mode 100644 index 000000000..6451c5836 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportDataType.java @@ -0,0 +1,235 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.report; + +import java.util.EnumSet; +import java.util.HashMap; +import java.util.Map; + +/** + * The gatherable data types acceptable in a GATK report column. + */ +public enum GATKReportDataType { + /** + * The null type should not be used. 
+ */ + Null("Null"), + + /** + * The default value when a format string is not present + */ + Unknown("Unknown"), + + /** + * Used for boolean values. Will display as true or false in the table. + */ + Boolean("%[Bb]"), + + /** + * Used for char values. Will display as a char so use printable values! + */ + Character("%[Cc]"), + + /** + * Used for float and double values. Will output a decimal with format %.8f unless otherwise specified. + */ + Decimal("%.*[EeFf]"), + + /** + * Used for int, byte, short, and long values. Will display the full number by default. + */ + Integer("%[Dd]"), + + /** + * Used for string values. Displays the string itself. + */ + String("%[Ss]"); + + private final String dataTypeString; + + private GATKReportDataType(String dataTypeString) { + this.dataTypeString = dataTypeString; + } + + private static final Map lookup = new HashMap(); + + static { + for (GATKReportDataType s : EnumSet.allOf(GATKReportDataType.class)) + lookup.put(s.dataTypeString, s); + } + + + @Override + public String toString() { + return this.dataTypeString; + } + + /** + * Returns a GATK report data type from the Object specified. It looks through the list of acceptable classes and + * returns the appropriate data type. 
+ * + * @param object the object ot derive the data type from + * @return the appropriate data type + */ + public static GATKReportDataType fromObject(Object object) { + GATKReportDataType value; + if (object instanceof Boolean) { + value = GATKReportDataType.Boolean; + + } else if (object instanceof Character) { + value = GATKReportDataType.Character; + + } else if (object instanceof Float || + object instanceof Double) { + value = GATKReportDataType.Decimal; + + } else if (object instanceof Integer || + object instanceof Long || + object instanceof Short || + object instanceof Byte ) { + value = GATKReportDataType.Integer; + + } else if (object instanceof String) { + value = GATKReportDataType.String; + + } else { + value = GATKReportDataType.Unknown; + //throw new UserException("GATKReport could not convert the data object into a GATKReportDataType. Acceptable data objects are found in the documentation."); + } + return value; + } + + /** + * Returns a GATK report data type from the format string specified. It uses regex matching from the enumerated + * Strings. + * + * @param format the format string to derive the data type from + * @return the appropriate data type + */ + public static GATKReportDataType fromFormatString(String format) { + if (format.equals("")) + return Unknown; + for (GATKReportDataType type : lookup.values()) { + if (format.matches(type.toString()) ) + return type; + } + return Unknown; + } + + /** + * Returns the default value of the data type. It returns an object that matches the class of the data type. + * + * @return an object that matches the data type + */ + public Object getDefaultValue() { + switch (this) { + case Decimal: + return 0.0D; + case Boolean: + return false; + case Character: + return '0'; + case Integer: + return 0L; + case String: + return ""; + default: + return null; + } + } + + /** + * Checks if the two objects are equal using the appropriate test form the data types. 
+ * + * @param a an object + * @param b another object to check if equal + * @return true - the objects are equal, false - the objects are nto equal + */ + public boolean isEqual(Object a, Object b) { + switch (this) { + case Null: + return true; + case Decimal: + case Boolean: + case Integer: + return a.toString().equals(b.toString()); + case Character: + case String: + default: + return a.equals(b); + } + } + + /** + * Converts an input String to the appropriate type using the data type. Used for parsing loading a GATK report from + * file. + * + * @param obj The input string + * @return an object that matches the data type. + */ + Object Parse(Object obj) { + if (obj instanceof String) { + String str = obj.toString(); + switch (this) { + case Decimal: + return Double.parseDouble(str); + case Boolean: + return java.lang.Boolean.parseBoolean(str); + case Integer: + return Long.parseLong(str); + case String: + return str; + case Character: + return str.toCharArray()[0]; + default: + return str; + } + } else + return null; + } + + /** + * Returns a format string version of the value according to the data type. + * + * @return The printf string representation of the object according to data type. 
+ */ + public String getDefaultFormatString() { + switch (this) { + case Decimal: + return "%.8f"; + case Boolean: + return "%b"; + case Integer: + return "%d"; + case String: + return "%s"; + case Character: + return "%c"; + case Null: + default: + return "%s"; + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportGatherer.java b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportGatherer.java new file mode 100644 index 000000000..ff1f9b90c --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportGatherer.java @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.gatk.report; + +import org.broadinstitute.sting.commandline.Gatherer; +import org.broadinstitute.sting.utils.exceptions.UserException; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.PrintStream; +import java.util.List; + +public class GATKReportGatherer extends Gatherer { + @Override + public void gather(List inputs, File output) { + //Combines inputs GATKReport to one output + + PrintStream o; + try { + o = new PrintStream(output); + } catch (FileNotFoundException e) { + throw new UserException("File to be output by CoverageByRG Gather function was not found"); + } + + GATKReport current = new GATKReport(); + boolean isFirst = true; + for (File input : inputs) { + + // If the table is empty + if (isFirst) { + current = new GATKReport(input); + isFirst = false; + } else { + GATKReport toAdd = new GATKReport(input); + current.combineWith(toAdd); + } + } + + current.print(o); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTable.java b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTable.java index b59b550e1..6551bf376 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTable.java +++ b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTable.java @@ -1,103 +1,52 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + package org.broadinstitute.sting.gatk.report; import org.apache.commons.lang.ObjectUtils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.text.TextFormattingUtils; +import java.io.BufferedReader; +import java.io.IOException; import java.io.PrintStream; import java.util.*; import java.util.regex.Matcher; import java.util.regex.Pattern; -/** - * A data structure that allows data to be collected over the course of a walker's computation, then have that data - * written to a PrintStream such that it's human-readable, AWK-able, and R-friendly (given that you load it using the - * GATKReport loader module). - * - * The goal of this object is to use the same data structure for both accumulating data during a walker's computation - * and emitting that data to a file for easy analysis in R (or any other program/language that can take in a table of - * results). 
Thus, all of the infrastructure below is designed simply to make printing the following as easy as - * possible: - * - * ##:GATKReport.v0.1 ErrorRatePerCycle : The error rate per sequenced position in the reads - * cycle errorrate.61PA8.7 qualavg.61PA8.7 - * 0 0.007451835696110506 25.474613284804366 - * 1 0.002362777171937477 29.844949954504095 - * 2 9.087604507451836E-4 32.87590975254731 - * 3 5.452562704471102E-4 34.498999090081895 - * 4 9.087604507451836E-4 35.14831665150137 - * 5 5.452562704471102E-4 36.07223435225619 - * 6 5.452562704471102E-4 36.1217248908297 - * 7 5.452562704471102E-4 36.1910480349345 - * 8 5.452562704471102E-4 36.00345705967977 - * - * Here, we have a GATKReport table - a well-formatted, easy to read representation of some tabular data. Every single - * table has this same GATKReport.v0.1 header, which permits multiple files from different sources to be cat-ed - * together, which makes it very easy to pull tables from different programs into R via a single file. - * - * ------------ - * Definitions: - * - * Table info: - * The first line, structured as - * ##: :
- * - * Table header: - * The second line, specifying a unique name for each column in the table. - * - * The first column mentioned in the table header is the "primary key" column - a column that provides the unique - * identifier for each row in the table. Once this column is created, any element in the table can be referenced by - * the row-column coordinate, i.e. "primary key"-"column name" coordinate. - * - * When a column is added to a table, a default value must be specified (usually 0). This is the initial value for - * an element in a column. This permits operations like increment() and decrement() to work properly on columns that - * are effectively counters for a particular event. - * - * Finally, the display property for each column can be set during column creation. This is useful when a given - * column stores an intermediate result that will be used later on, perhaps to calculate the value of another column. - * In these cases, it's obviously necessary to store the value required for further computation, but it's not - * necessary to actually print the intermediate column. - * - * Table body: - * The values of the table itself. - * - * --------------- - * Implementation: - * - * The implementation of this table has two components: - * 1. A TreeSet that stores all the values ever specified for the primary key. Any get() operation that - * refers to an element where the primary key object does not exist will result in its implicit creation. I - * haven't yet decided if this is a good idea... - * - * 2. A HashMap that stores a mapping from column name to column contents. Each - * GATKReportColumn is effectively a map (in fact, GATKReportColumn extends TreeMap) between - * primary key and the column value. This means that, given N columns, the primary key information is stored - * N+1 times. This is obviously wasteful and can likely be handled much more elegantly in future implementations. 
- * - * ------------------------------ - * Element and column operations: - * - * In addition to simply getting and setting values, this object also permits some simple operations to be applied to - * individual elements or to whole columns. For instance, an element can be easily incremented without the hassle of - * calling get(), incrementing the obtained value by 1, and then calling set() with the new value. Also, some vector - * operations are supported. For instance, two whole columns can be divided and have the result be set to a third - * column. This is especially useful when aggregating counts in two intermediate columns that will eventually need to - * be manipulated row-by-row to compute the final column. - * - * Note: I've made no attempt whatsoever to make these operations efficient. Right now, some of the methods check the - * type of the stored object using an instanceof call and attempt to do the right thing. Others cast the contents of - * the cell to a Number, call the Number.toDouble() method and compute a result. This is clearly not the ideal design, - * but at least the prototype contained herein works. 
- * - * @author Kiran Garimella - * @author Khalid Shakir - */ public class GATKReportTable { - /** REGEX that matches any table with an invalid name */ - public final static String INVALID_TABLE_NAME_REGEX = "[^a-zA-Z0-9_\\-\\.]"; - private static final GATKReportVersion LATEST_REPORT_VERSION = GATKReportVersion.V0_2; + /** + * REGEX that matches any table with an invalid name + */ + public static final String INVALID_TABLE_NAME_REGEX = "[^a-zA-Z0-9_\\-\\.]"; + private static final String GATKTABLE_HEADER_PREFIX = "#:GATKTable"; + private static final String SEPARATOR = ":"; + private static final String ENDLINE = ":;"; + private String tableName; private String tableDescription; - private GATKReportVersion version = LATEST_REPORT_VERSION; + private String primaryKeyName; private Collection primaryKeyColumn; @@ -106,11 +55,118 @@ public class GATKReportTable { private GATKReportColumns columns; + private static final String COULD_NOT_READ_HEADER = "Could not read the header of this file -- "; + private static final String COULD_NOT_READ_COLUMN_NAMES = "Could not read the column names of this file -- "; + private static final String COULD_NOT_READ_DATA_LINE = "Could not read a data line of this table -- "; + private static final String COULD_NOT_READ_EMPTY_LINE = "Could not read the last empty line of this table -- "; + private static final String OLD_GATK_TABLE_VERSION = "We no longer support older versions of the GATK Tables"; + + private static final String NUMBER_CONVERSION_EXCEPTION = "String is a number but is not a long or a double: "; + + public GATKReportTable(BufferedReader reader, GATKReportVersion version) { + int counter = 0; + + switch (version) { + case V1_0: + int nHeaders = 2; + String[] tableHeaders = new String[nHeaders]; + + // Read in the headers + for (int i = 0; i < nHeaders; i++) { + try { + tableHeaders[i] = reader.readLine(); + } catch (IOException e) { + throw new ReviewedStingException(COULD_NOT_READ_HEADER + e.getMessage()); + } + } + 
String[] tableData = tableHeaders[0].split(":"); + String[] userData = tableHeaders[1].split(":"); + + // Fill in the fields + tableName = userData[2]; + tableDescription = (userData.length <= 3) ? "" : userData[3]; // table may have no description! (and that's okay) + primaryKeyDisplay = Boolean.parseBoolean(tableData[2]); + columns = new GATKReportColumns(); + + int nColumns = Integer.parseInt(tableData[3]); + int nRows = Integer.parseInt(tableData[4]); + + + // Read column names + String columnLine; + try { + columnLine = reader.readLine(); + } catch (IOException e) { + throw new ReviewedStingException(COULD_NOT_READ_COLUMN_NAMES); + } + + List columnStarts = TextFormattingUtils.getWordStarts(columnLine); + String[] columnNames = TextFormattingUtils.splitFixedWidth(columnLine, columnStarts); + + if (primaryKeyDisplay) { + addPrimaryKey(columnNames[0]); + + } else { + sortByPrimaryKey = true; + addPrimaryKey("id", false); + counter = 1; + } + // Put in columns using the format string from the header + for (int i = 0; i < nColumns; i++) { + String format = tableData[5 + i]; + if (primaryKeyDisplay) + addColumn(columnNames[i + 1], true, format); + else + addColumn(columnNames[i], true, format); + } + + for (int i = 0; i < nRows; i++) { + // read line + String dataLine; + try { + dataLine = reader.readLine(); + } catch (IOException e) { + throw new ReviewedStingException(COULD_NOT_READ_DATA_LINE + e.getMessage()); + } + List lineSplits = Arrays.asList(TextFormattingUtils.splitFixedWidth(dataLine, columnStarts)); + + for (int columnIndex = 0; columnIndex < nColumns; columnIndex++) { + + //Input all the remaining values + GATKReportDataType type = getColumns().getByIndex(columnIndex).getDataType(); + + if (primaryKeyDisplay) { + String columnName = columnNames[columnIndex + 1]; + String primaryKey = lineSplits.get(0); + set(primaryKey, columnName, type.Parse(lineSplits.get(columnIndex + 1))); + } else { + String columnName = columnNames[columnIndex]; + set(counter, 
columnName, type.Parse(lineSplits.get(columnIndex))); + } + + } + counter++; + } + + + try { + reader.readLine(); + } catch (IOException e) { + throw new ReviewedStingException(COULD_NOT_READ_EMPTY_LINE + e.getMessage()); + } + break; + + default: + throw new ReviewedStingException(OLD_GATK_TABLE_VERSION); + } + } + + /** * Verifies that a table or column name has only alphanumeric characters - no spaces or special characters allowed * - * @param name the name of the table or column - * @return true if the name is valid, false if otherwise + * @param name the name of the table or column + * @return true if the name is valid, false if otherwise */ private boolean isValidName(String name) { Pattern p = Pattern.compile(INVALID_TABLE_NAME_REGEX); @@ -122,8 +178,8 @@ public class GATKReportTable { /** * Verifies that a table or column name has only alphanumeric characters - no spaces or special characters allowed * - * @param description the name of the table or column - * @return true if the name is valid, false if otherwise + * @param description the name of the table or column + * @return true if the name is valid, false if otherwise */ private boolean isValidDescription(String description) { Pattern p = Pattern.compile("\\r|\\n"); @@ -135,15 +191,23 @@ public class GATKReportTable { /** * Construct a new GATK report table with the specified name and description * - * @param tableName the name of the table - * @param tableDescription the description of the table + * @param tableName the name of the table + * @param tableDescription the description of the table */ public GATKReportTable(String tableName, String tableDescription) { this(tableName, tableDescription, true); } + /** + * Construct a new GATK report table with the specified name and description and whether to sort rows by the primary + * key + * + * @param tableName the name of the table + * @param tableDescription the description of the table + * @param sortByPrimaryKey whether to sort rows by the primary 
key (instead of order added) + */ public GATKReportTable(String tableName, String tableDescription, boolean sortByPrimaryKey) { - if (!isValidName(tableName)) { + if (!isValidName(tableName)) { throw new ReviewedStingException("Attempted to set a GATKReportTable name of '" + tableName + "'. GATKReportTable names must be purely alphanumeric - no spaces or special characters are allowed."); } @@ -158,28 +222,21 @@ public class GATKReportTable { columns = new GATKReportColumns(); } - public GATKReportVersion getVersion() { - return version; - } - - protected void setVersion(GATKReportVersion version) { - this.version = version; - } - /** * Add a primary key column. This becomes the unique identifier for every column in the table. * - * @param primaryKeyName the name of the primary key column + * @param primaryKeyName the name of the primary key column */ public void addPrimaryKey(String primaryKeyName) { addPrimaryKey(primaryKeyName, true); } /** - * Add an optionally visible primary key column. This becomes the unique identifier for every column in the table, and will always be printed as the first column. + * Add an optionally visible primary key column. This becomes the unique identifier for every column in the table, + * and will always be printed as the first column. * - * @param primaryKeyName the name of the primary key column - * @param display should this primary key be displayed? + * @param primaryKeyName the name of the primary key column + * @param display should this primary key be displayed? */ public void addPrimaryKey(String primaryKeyName, boolean display) { if (!isValidName(primaryKeyName)) { @@ -193,49 +250,40 @@ public class GATKReportTable { } /** - * Returns the first primary key matching the dotted column values. - * Ex: dbsnp.eval.called.all.novel.all - * @param dottedColumnValues Period concatenated values. + * Returns the first primary key matching the column values. 
+ * Ex: "CountVariants", "dbsnp", "eval", "called", "all", "novel", "all" + * @param columnValues column values. * @return The first primary key matching the column values or throws an exception. */ - public Object getPrimaryKey(String dottedColumnValues) { - Object key = findPrimaryKey(dottedColumnValues); + public Object getPrimaryKeyByData(Object... columnValues) { + Object key = findPrimaryKeyByData(columnValues); if (key == null) - throw new ReviewedStingException("Attempted to get non-existent GATKReportTable key for values: " + dottedColumnValues); + throw new ReviewedStingException("Attempted to get non-existent GATKReportTable key for values: " + Arrays.asList(columnValues)); return key; } - /** - * Returns true if there is at least on row with the dotted column values. - * Ex: dbsnp.eval.called.all.novel.all - * @param dottedColumnValues Period concatenated values. - * @return true if there is at least one row matching the columns. - */ - public boolean containsPrimaryKey(String dottedColumnValues) { - return findPrimaryKey(dottedColumnValues) != null; - } - - /** - * Returns the first primary key matching the dotted column values. - * Ex: dbsnp.eval.called.all.novel.all - * @param dottedColumnValues Period concatenated values. - * @return The first primary key matching the column values or null. - */ - private Object findPrimaryKey(String dottedColumnValues) { - return findPrimaryKey(dottedColumnValues.split("\\.")); - } - /** * Returns the first primary key matching the column values. - * Ex: new String[] { "dbsnp", "eval", "called", "all", "novel", "all" } + * Ex: "CountVariants", "dbsnp", "eval", "called", "all", "novel", "all" + * * @param columnValues column values. - * @return The first primary key matching the column values. + * @return The first primary key matching the column values or null if the key does not exist. */ - private Object findPrimaryKey(Object[] columnValues) { + public Object findPrimaryKeyByData(Object... 
columnValues) { + if (columnValues == null) + throw new NullPointerException("Column values is null"); + if (columnValues.length == 0) + throw new IllegalArgumentException("Column values is empty"); + int columnCount = columns.size(); for (Object primaryKey : primaryKeyColumn) { boolean matching = true; - for (int i = 0; matching && i < columnValues.length; i++) { - matching = ObjectUtils.equals(columnValues[i], get(primaryKey, i+1)); + // i --> index into columnValues parameter + // j --> index into columns collection + for (int i = 0, j = 0; matching && i < columnValues.length && j < columnCount; j++) { + if (!columns.getByIndex(j).isDisplayable()) + continue; + matching = ObjectUtils.equals(columnValues[i], get(primaryKey, i)); + i++; } if (matching) return primaryKey; @@ -244,29 +292,65 @@ public class GATKReportTable { } /** - * Add a column to the report and specify the default value that should be supplied if a given position in the table is never explicitly set. + * Add a column to the report and specify the default value that should be supplied if a given position in the table + * is never explicitly set. * - * @param columnName the name of the column - * @param defaultValue the default value for the column + * @param columnName the name of the column + * @param defaultValue the default value for the column */ public void addColumn(String columnName, Object defaultValue) { - addColumn(columnName, defaultValue, null); + addColumn(columnName, defaultValue, true); } + /** + * Add a column to the report, specify the default column value, and specify whether the column should be displayed + * in the final output (useful when intermediate columns are necessary for later calculations, but are not required + * to be in the output file. 
+ * + * @param columnName the name of the column + * @param defaultValue the default value of the column + * @param display if true - the column will be displayed; if false - the column will be hidden + */ + public void addColumn(String columnName, Object defaultValue, boolean display) { + addColumn(columnName, defaultValue, display, ""); + } + + /** + * Add a column to the report, specify the default column value, and specify whether the column should be displayed + * in the final output (useful when intermediate columns are necessary for later calculations, but are not required + * to be in the output file. + * + * @param columnName the name of the column + * @param defaultValue the default value of the column + * @param format the format string used to display data + */ public void addColumn(String columnName, Object defaultValue, String format) { addColumn(columnName, defaultValue, true, format); } + /** - * Add a column to the report, specify the default column value, and specify whether the column should be displayed in the final output (useful when intermediate columns are necessary for later calculations, but are not required to be in the output file. + * Add a column to the report, specify whether the column should be displayed in the final output (useful when + * intermediate columns are necessary for later calculations, but are not required to be in the output file), and the + * format string used to display the data. 
* - * @param columnName the name of the column - * @param defaultValue the default value of the column - * @param display if true - the column will be displayed; if false - the column will be hidden + * @param columnName the name of the column + * @param display if true - the column will be displayed; if false - the column will be hidden + * @param format the format string used to display data */ - public void addColumn(String columnName, Object defaultValue, boolean display) { - addColumn(columnName, defaultValue, display, null); + public void addColumn(String columnName, boolean display, String format) { + addColumn(columnName, null, display, format); } + /** + * Add a column to the report, specify the default column value, whether the column should be displayed in the final + * output (useful when intermediate columns are necessary for later calculations, but are not required to be in the + * output file), and the format string used to display the data. + * + * @param columnName the name of the column + * @param defaultValue if true - the column will be displayed; if false - the column will be hidden + * @param display display the column + * @param format the format string used to display data + */ public void addColumn(String columnName, Object defaultValue, boolean display, String format) { if (!isValidName(columnName)) { throw new ReviewedStingException("Attempted to set a GATKReportTable column name of '" + columnName + "'. GATKReportTable column names must be purely alphanumeric - no spaces or special characters are allowed."); @@ -277,8 +361,8 @@ public class GATKReportTable { /** * Check if the requested element exists, and if not, create it. 
* - * @param primaryKey the primary key value - * @param columnName the name of the column + * @param primaryKey the primary key value + * @param columnName the name of the column */ private void verifyEntry(Object primaryKey, String columnName) { if (!columns.containsKey(columnName)) { @@ -303,26 +387,67 @@ public class GATKReportTable { /** * Set the value for a given position in the table * - * @param primaryKey the primary key value - * @param columnName the name of the column - * @param value the value to set + * @param primaryKey the primary key value + * @param columnName the name of the column + * @param value the value to set */ public void set(Object primaryKey, String columnName, Object value) { verifyEntry(primaryKey, columnName); + GATKReportColumn column = columns.get(columnName); + //todo -- Check if value is of same type as column - columns.get(columnName).put(primaryKey, value); + // We do not accept internal null values + if (value == null) + value = "null"; + + // This code below is bs. Why am do I have to conform to bad code + // Below is some code to convert a string into its appropriate type. + + // I second Roger's rant! + + // If we got a string but the column is not a String type + Object newValue = null; + if (value instanceof String && !column.getDataType().equals(GATKReportDataType.String)) { + // Integer case + if (column.getDataType().equals(GATKReportDataType.Integer)) { + try { + newValue = Long.parseLong((String) value); + } catch (Exception e) { + /** do nothing */ + } + } + if (column.getDataType().equals(GATKReportDataType.Decimal)) { + try { + newValue = Double.parseDouble((String) value); + } catch (Exception e) { + /** do nothing */ + } + } + if (column.getDataType().equals(GATKReportDataType.Character) && ((String) value).length() == 1) { + newValue = ((String) value).charAt(0); + } + } + + if (newValue != null) + value = newValue; + + // todo -- Types have to be more flexible. 
For example, %d should accept Integers, Shorts and Bytes. + if (column.getDataType().equals(GATKReportDataType.fromObject(value)) || column.getDataType().equals(GATKReportDataType.Unknown) ) + columns.get(columnName).put(primaryKey, value); + else + throw new ReviewedStingException(String.format("Tried to add an object of type: %s to a column of type: %s", GATKReportDataType.fromObject(value).name(), column.getDataType().name())); } /** * Get a value from the given position in the table * - * @param primaryKey the primary key value - * @param columnName the name of the column - * @return the value stored at the specified position in the table + * @param primaryKey the primary key value + * @param columnName the name of the column + * @return the value stored at the specified position in the table */ public Object get(Object primaryKey, String columnName) { verifyEntry(primaryKey, columnName); - + return columns.get(columnName).get(primaryKey); } @@ -331,7 +456,7 @@ public class GATKReportTable { * * @param primaryKey the primary key value * @param columnIndex the index of the column - * @return the value stored at the specified position in the table + * @return the value stored at the specified position in the table */ private Object get(Object primaryKey, int columnIndex) { return columns.getByIndex(columnIndex).get(primaryKey); @@ -340,8 +465,8 @@ public class GATKReportTable { /** * Increment an element in the table. This implementation is awful - a functor would probably be better. * - * @param primaryKey the primary key value - * @param columnName the name of the column + * @param primaryKey the primary key value + * @param columnName the name of the column */ public void increment(Object primaryKey, String columnName) { Object oldValue = get(primaryKey, columnName); @@ -369,8 +494,8 @@ public class GATKReportTable { /** * Decrement an element in the table. This implementation is awful - a functor would probably be better. 
* - * @param primaryKey the primary key value - * @param columnName the name of the column + * @param primaryKey the primary key value + * @param columnName the name of the column */ public void decrement(Object primaryKey, String columnName) { Object oldValue = get(primaryKey, columnName); @@ -398,9 +523,9 @@ public class GATKReportTable { /** * Add the specified value to an element in the table * - * @param primaryKey the primary key value - * @param columnName the name of the column - * @param valueToAdd the value to add + * @param primaryKey the primary key value + * @param columnName the name of the column + * @param valueToAdd the value to add */ public void add(Object primaryKey, String columnName, Object valueToAdd) { Object oldValue = get(primaryKey, columnName); @@ -428,8 +553,8 @@ public class GATKReportTable { /** * Subtract the specified value from an element in the table * - * @param primaryKey the primary key value - * @param columnName the name of the column + * @param primaryKey the primary key value + * @param columnName the name of the column * @param valueToSubtract the value to subtract */ public void subtract(Object primaryKey, String columnName, Object valueToSubtract) { @@ -458,9 +583,9 @@ public class GATKReportTable { /** * Multiply the specified value to an element in the table * - * @param primaryKey the primary key value - * @param columnName the name of the column - * @param valueToMultiply the value to multiply by + * @param primaryKey the primary key value + * @param columnName the name of the column + * @param valueToMultiply the value to multiply by */ public void multiply(Object primaryKey, String columnName, Object valueToMultiply) { Object oldValue = get(primaryKey, columnName); @@ -488,9 +613,9 @@ public class GATKReportTable { /** * Divide the specified value from an element in the table * - * @param primaryKey the primary key value - * @param columnName the name of the column - * @param valueToDivide the value to divide by + 
* @param primaryKey the primary key value + * @param columnName the name of the column + * @param valueToDivide the value to divide by */ public void divide(Object primaryKey, String columnName, Object valueToDivide) { Object oldValue = get(primaryKey, columnName); @@ -518,9 +643,9 @@ public class GATKReportTable { /** * Add two columns to each other and set the results to a third column * - * @param columnToSet the column that should hold the results - * @param augend the column that shall be the augend - * @param addend the column that shall be the addend + * @param columnToSet the column that should hold the results + * @param augend the column that shall be the augend + * @param addend the column that shall be the addend */ public void addColumns(String columnToSet, String augend, String addend) { for (Object primaryKey : primaryKeyColumn) { @@ -536,8 +661,8 @@ public class GATKReportTable { /** * Subtract one column from another and set the results to a third column * - * @param columnToSet the column that should hold the results - * @param minuend the column that shall be the minuend (the a in a - b) + * @param columnToSet the column that should hold the results + * @param minuend the column that shall be the minuend (the a in a - b) * @param subtrahend the column that shall be the subtrahend (the b in a - b) */ public void subtractColumns(String columnToSet, String minuend, String subtrahend) { @@ -555,8 +680,8 @@ public class GATKReportTable { * Multiply two columns by each other and set the results to a third column * * @param columnToSet the column that should hold the results - * @param multiplier the column that shall be the multiplier - * @param multiplicand the column that shall be the multiplicand + * @param multiplier the column that shall be the multiplier + * @param multiplicand the column that shall be the multiplicand */ public void multiplyColumns(String columnToSet, String multiplier, String multiplicand) { for (Object primaryKey : 
primaryKeyColumn) { @@ -572,9 +697,9 @@ public class GATKReportTable { /** * Divide two columns by each other and set the results to a third column * - * @param columnToSet the column that should hold the results - * @param numeratorColumn the column that shall be the numerator - * @param denominatorColumn the column that shall be the denominator + * @param columnToSet the column that should hold the results + * @param numeratorColumn the column that shall be the numerator + * @param denominatorColumn the column that shall be the denominator */ public void divideColumns(String columnToSet, String numeratorColumn, String denominatorColumn) { for (Object primaryKey : primaryKeyColumn) { @@ -589,10 +714,11 @@ public class GATKReportTable { /** * Return the print width of the primary key column - * @return the width of the primary key column + * + * @return the width of the primary key column */ - public int getPrimaryKeyColumnWidth() { - int maxWidth = primaryKeyName.length(); + int getPrimaryKeyColumnWidth() { + int maxWidth = getPrimaryKeyName().length(); for (Object primaryKey : primaryKeyColumn) { int width = primaryKey.toString().length(); @@ -608,30 +734,47 @@ public class GATKReportTable { /** * Write the table to the PrintStream, formatted nicely to be human-readable, AWK-able, and R-friendly. 
* - * @param out the PrintStream to which the table should be written + * @param out the PrintStream to which the table should be written */ - public void write(PrintStream out) { + void write(PrintStream out) { + + /* + * Table header: + * #:GATKTable:nColumns:nRows:(DataType for each column):; + * #:GATKTable:TableName:Description :; + * key colA colB + * row1 xxxx xxxxx + */ + // Get the column widths for everything - HashMap columnFormats = new HashMap(); - for (String columnName : columns.keySet()) { - columnFormats.put(columnName, columns.get(columnName).getColumnFormat()); - } String primaryKeyFormat = "%-" + getPrimaryKeyColumnWidth() + "s"; // Emit the table definition - out.printf("##:GATKReport.%s %s : %s%n", LATEST_REPORT_VERSION.versionString, tableName, tableDescription); + String formatHeader = String.format(GATKTABLE_HEADER_PREFIX + ":%b:%d:%d", primaryKeyDisplay, getColumns().size(), getNumRows()); + // Add all the formats for all the columns + for (GATKReportColumn column : getColumns()) { + if (column.isDisplayable()) + formatHeader += (SEPARATOR + column.getFormat()); + } + out.println(formatHeader + ENDLINE); + out.printf(GATKTABLE_HEADER_PREFIX + ":%s:%s\n", tableName, tableDescription); + + //out.printf("#:GATKTable:%s:%s", Algorithm); + // Emit the table header, taking into account the padding requirement if the primary key is a hidden column boolean needsPadding = false; if (primaryKeyDisplay) { - out.printf(primaryKeyFormat, primaryKeyName); + out.printf(primaryKeyFormat, getPrimaryKeyName()); needsPadding = true; } for (String columnName : columns.keySet()) { if (columns.get(columnName).isDisplayable()) { - if (needsPadding) { out.printf(" "); } - out.printf(columnFormats.get(columnName).getNameFormat(), columnName); + if (needsPadding) { + out.printf(" "); + } + out.printf(columns.get(columnName).getColumnFormat().getNameFormat(), columnName); needsPadding = true; } @@ -640,28 +783,31 @@ public class GATKReportTable { out.printf("%n"); 
// Emit the table body - for (Object primaryKey : primaryKeyColumn) { + for (final Object primaryKey : primaryKeyColumn) { needsPadding = false; if (primaryKeyDisplay) { out.printf(primaryKeyFormat, primaryKey); needsPadding = true; } - for (String columnName : columns.keySet()) { - if (columns.get(columnName).isDisplayable()) { - if (needsPadding) { out.printf(" "); } - String value = columns.get(columnName).getStringValue(primaryKey); - out.printf(columnFormats.get(columnName).getValueFormat(), value); + for (final Map.Entry entry : columns.entrySet()) { + final GATKReportColumn column = entry.getValue(); + if (column.isDisplayable()) { + if (needsPadding) { + out.print(" "); + } + + final String value = column.getStringValue(primaryKey); + out.printf(column.getColumnFormat().getValueFormat(), value); needsPadding = true; } } - out.printf("%n"); + out.println(); } - // Close the table - out.printf("%n"); + out.println(); } public int getNumRows() { @@ -679,4 +825,90 @@ public class GATKReportTable { public GATKReportColumns getColumns() { return columns; } + + /** + * Combines two compatible GATK report tables. This is the general function which will call the different algorithms + * necessary to gather the tables. Every column's combine algorithm is read and treated accordingly. + * + * @param input Another GATK table + */ + void combineWith(GATKReportTable input) { + /* + * This function is different from addRowsFrom because we will add the ability to sum,average, etc rows + * TODO: Add other combining algorithms + */ + + // Make sure the columns match AND the Primary Key + if (input.getColumns().keySet().equals(this.getColumns().keySet()) && + input.getPrimaryKeyName().equals(this.getPrimaryKeyName())) { + this.addRowsFrom(input); + } else + throw new ReviewedStingException("Failed to combine GATKReportTable, columns don't match!"); + } + + /** + * A gather algorithm that simply takes the rows from the argument, and adds them to the current table. 
This is the + * default gather algorithm. + * + * @param input Another GATK table to add rows from. + */ + private void addRowsFrom(GATKReportTable input) { + // add column by column + + // For every column + for (String columnKey : input.getColumns().keySet()) { + GATKReportColumn current = this.getColumns().get(columnKey); + GATKReportColumn toAdd = input.getColumns().get(columnKey); + // We want to take the current column and add all the values from input + + // The column is a map of values + for (Object rowKey : toAdd.keySet()) { + // We add every value from toAdd to the current + if (!current.containsKey(rowKey)) { + this.set(rowKey, columnKey, toAdd.get(rowKey)); + //System.out.printf("Putting row with PK: %s \n", rowKey); + } else { + this.set(rowKey, columnKey, toAdd.get(rowKey)); + + System.out.printf("OVERWRITING Row with PK: %s \n", rowKey); + } + } + } + + } + + public String getPrimaryKeyName() { + return primaryKeyName; + } + + /** + * Returns whether or not the two tables have the same format including columns and everything in between. This does + * not check if the data inside is the same. This is the check to see if the two tables are gatherable or + * reducible + * + * @param table another GATK table + * @return true if the tables are gatherable + */ + public boolean isSameFormat(GATKReportTable table) { + //Should we add the sortByPrimaryKey as a check? + + return columns.isSameFormat(table.columns) && + (primaryKeyDisplay == table.primaryKeyDisplay && primaryKeyName.equals(table.primaryKeyName) && + tableName.equals(table.tableName) && + tableDescription.equals(table.tableDescription)); + } + + /** + * Checks that the tables are exactly the same. + * + * @param table another GATK report + * @return true if all fields in the reports, tables, and columns are equal. 
+ */ + public boolean equals(GATKReportTable table) { + return isSameFormat(table) && + (columns.equals(table.columns) && + primaryKeyColumn.equals(table.primaryKeyColumn) && + sortByPrimaryKey == table.sortByPrimaryKey); + + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportVersion.java b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportVersion.java index 5f1159a43..99381cc21 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportVersion.java +++ b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportVersion.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011, The Broad Institute + * Copyright (c) 2012, The Broad Institute * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation @@ -31,7 +31,7 @@ public enum GATKReportVersion { * Differences between other versions: * - Does not allow spaces in cells. * - Mostly fixed width but has a bug where the string width of floating point - * values was not measured correctly leading to columns that aren't aligned + * values was not measured correctly leading to columns that aren't aligned */ V0_1("v0.1"), @@ -40,9 +40,17 @@ public enum GATKReportVersion { * - Spaces allowed in cells, for example in sample names with spaces in them ex: "C507/FG-CR 6". 
* - Fixed width fixed for floating point values */ - V0_2("v0.2"); + V0_2("v0.2"), - public final String versionString; + /* + * Differences between v0.x + * - Added table and report headers + * - Headers changed format, include the number of tables, rows, and metadata for gathering + * - IS GATHERABLE + */ + V1_0("v1.0"); + + private final String versionString; private GATKReportVersion(String versionString) { this.versionString = versionString; @@ -53,8 +61,13 @@ public enum GATKReportVersion { return versionString; } + public boolean equals(GATKReportVersion that) { + return (versionString.equals(that.versionString)); + } + /** * Returns the GATK Report Version from the file header. + * * @param header Header from the file starting with ##:GATKReport.v[version] * @return The version as an enum. */ @@ -65,6 +78,9 @@ public enum GATKReportVersion { if (header.startsWith("##:GATKReport.v0.2 ")) return GATKReportVersion.V0_2; + if (header.startsWith("#:GATKReport.v1.0")) + return GATKReportVersion.V1_0; + throw new ReviewedStingException("Unknown GATK report version in header: " + header); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDB.java b/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDB.java index a6f6b3481..31149cd8a 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDB.java +++ b/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDB.java @@ -235,4 +235,14 @@ public class SampleDB { } return children; } + + public Set getFounderIds(){ + Set founders = new HashSet(); + for(Sample sample : getSamples()){ + if(sample.getParents().size()<1) + founders.add(sample.getID()); + + } + return founders; + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java index 3f24e6585..76c1ce8c5 100644 --- 
a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java @@ -10,6 +10,7 @@ import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocSortedSet; import org.broadinstitute.sting.utils.activeregion.ActiveRegion; +import org.broadinstitute.sting.utils.activeregion.ActivityProfile; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -42,38 +43,32 @@ public class TraverseActiveRegions extends TraversalEngine isActiveList = new ArrayList(); - GenomeLoc firstIsActiveStart = null; + ActivityProfile profile = new ActivityProfile(engine.getGenomeLocParser(), walker.hasPresetActiveRegions() ); - //ReferenceOrderedView referenceOrderedDataView = new ReferenceOrderedView( dataProvider ); - ReferenceOrderedView referenceOrderedDataView = null; - if ( WalkerManager.getWalkerDataSource(walker) != DataSource.REFERENCE_ORDERED_DATA ) - referenceOrderedDataView = new ManagingReferenceOrderedView( dataProvider ); - else - referenceOrderedDataView = (RodLocusView)locusView; + ReferenceOrderedView referenceOrderedDataView = getReferenceOrderedView(walker, dataProvider, locusView); // We keep processing while the next reference location is within the interval GenomeLoc prevLoc = null; while( locusView.hasNext() ) { final AlignmentContext locus = locusView.next(); GenomeLoc location = locus.getLocation(); + if(prevLoc != null) { - for(int iii = prevLoc.getStart() + 1; iii < location.getStart(); iii++ ) { + // fill in the active / inactive labels from the stop of the previous location to the start of this location + // TODO refactor to separate function + for(int iii = prevLoc.getStop() + 1; iii < location.getStart(); iii++ ) { final GenomeLoc fakeLoc = engine.getGenomeLocParser().createGenomeLoc(prevLoc.getContig(), iii, 
iii); if( initialIntervals == null || initialIntervals.overlaps( fakeLoc ) ) { - final double isActiveProb = ( walker.presetActiveRegions == null ? 0.0 : ( walker.presetActiveRegions.overlaps(fakeLoc) ? 1.0 : 0.0 ) ); - isActiveList.add( isActiveProb ); - if( firstIsActiveStart == null ) { - firstIsActiveStart = fakeLoc; - } + final double isActiveProb = ( walker.hasPresetActiveRegions() && walker.presetActiveRegions.overlaps(fakeLoc) ? 1.0 : 0.0 ); + profile.add(fakeLoc, isActiveProb); } } } @@ -89,12 +84,8 @@ public class TraverseActiveRegions extends TraversalEngine extends TraversalEngine activeRegions = integrateActiveList( isActiveList, firstIsActiveStart, activeRegionExtension, walker.presetActiveRegions != null ); - logger.debug("Integrated " + isActiveList.size() + " isActive calls into " + activeRegions.size() + " regions." ); - if( walker.activeRegionOutStream == null ) { - workQueue.addAll( activeRegions ); - } else { // Just want to output the active regions to a file, not actually process them - for( final ActiveRegion activeRegion : activeRegions ) { - if( activeRegion.isActive ) { - walker.activeRegionOutStream.println( activeRegion.getLocation() ); - } - } - } + // band-pass filter the list of isActive probabilities and turn into active regions + final ActivityProfile bandPassFiltered = profile.bandPassFilter(); + final List activeRegions = bandPassFiltered.createActiveRegions( activeRegionExtension, maxRegionSize ); - // Since we've traversed sufficiently past this point (or this contig!) 
in the workQueue we can unload those regions and process them - while( workQueue.peek() != null && (workQueue.peek().getExtendedLoc().getStop() < minStart || !workQueue.peek().getExtendedLoc().getContig().equals(dataProvider.getLocus().getContig())) ) { - final ActiveRegion activeRegion = workQueue.remove(); - sum = processActiveRegion( activeRegion, myReads, workQueue, sum, walker ); - } + // add active regions to queue of regions to process + workQueue.addAll( activeRegions ); + logger.debug("Integrated " + profile.size() + " isActive calls into " + activeRegions.size() + " regions." ); + + // now go and process all of the active regions + sum = processActiveRegions(walker, sum, minStart, dataProvider.getLocus().getContig()); } return sum; } - // Special function called in LinearMicroScheduler to empty out the work queue. Ugly for now but will be cleaned up when we push this functionality more into the engine - public T endTraversal( final Walker walker, T sum) { + + // -------------------------------------------------------------------------------- + // + // simple utility functions + // + // -------------------------------------------------------------------------------- + + private final double walkerActiveProb(final ActiveRegionWalker walker, + final RefMetaDataTracker tracker, final ReferenceContext refContext, + final AlignmentContext locus, final GenomeLoc location) { + if ( walker.hasPresetActiveRegions() ) { + return walker.presetActiveRegions.overlaps(location) ? 
1.0 : 0.0; + } else { + return walker.isActive( tracker, refContext, locus ); + } + } + + private ReferenceOrderedView getReferenceOrderedView( final ActiveRegionWalker walker, + final LocusShardDataProvider dataProvider, + final LocusView locusView) { + if ( WalkerManager.getWalkerDataSource(walker) != DataSource.REFERENCE_ORDERED_DATA ) + return new ManagingReferenceOrderedView( dataProvider ); + else + return (RodLocusView)locusView; + } + + // -------------------------------------------------------------------------------- + // + // code to handle processing active regions + // + // -------------------------------------------------------------------------------- + + private T processActiveRegions( final ActiveRegionWalker walker, T sum, final int minStart, final String currentContig ) { + if( walker.activeRegionOutStream != null ) { + writeActiveRegionsToStream(walker); + return sum; + } else { + return callWalkerMapOnActiveRegions(walker, sum, minStart, currentContig); + } + } + + /** + * Write out each active region to the walker activeRegionOutStream + * + * @param walker + */ + private void writeActiveRegionsToStream( final ActiveRegionWalker walker ) { + // Just want to output the active regions to a file, not actually process them + for( final ActiveRegion activeRegion : workQueue ) { + if( activeRegion.isActive ) { + walker.activeRegionOutStream.println( activeRegion.getLocation() ); + } + } + } + + private T callWalkerMapOnActiveRegions( final ActiveRegionWalker walker, T sum, final int minStart, final String currentContig ) { + // Since we've traversed sufficiently past this point (or this contig!) 
in the workQueue we can unload those regions and process them + // TODO can implement parallel traversal here while( workQueue.peek() != null ) { - final ActiveRegion activeRegion = workQueue.remove(); - sum = processActiveRegion( activeRegion, myReads, workQueue, sum, (ActiveRegionWalker) walker ); + final GenomeLoc extendedLoc = workQueue.peek().getExtendedLoc(); + if ( extendedLoc.getStop() < minStart || (currentContig != null && !workQueue.peek().getExtendedLoc().getContig().equals(currentContig))) { + final ActiveRegion activeRegion = workQueue.remove(); + sum = processActiveRegion( activeRegion, myReads, workQueue, sum, walker ); + } else { + break; + } } return sum; @@ -193,6 +232,12 @@ public class TraverseActiveRegions extends TraversalEngine extends TraversalEngine integrateActiveList( final ArrayList activeList, final GenomeLoc firstIsActiveStart, final int activeRegionExtension, final boolean presetRegions ) { - - final double ACTIVE_PROB_THRESHOLD = 0.2; // BUGBUG: needs to be set-able by the walker author - final ArrayList returnList = new ArrayList(); - if( activeList.size() == 0 ) { - return returnList; - } else if( activeList.size() == 1 ) { - returnList.add( new ActiveRegion( engine.getGenomeLocParser().createGenomeLoc(firstIsActiveStart.getContig(), firstIsActiveStart.getStart(), firstIsActiveStart.getStart()), - activeList.get(0) > ACTIVE_PROB_THRESHOLD, engine.getGenomeLocParser(), activeRegionExtension ) ); - return returnList; - } else { - final Double[] activeProbArray = activeList.toArray(new Double[activeList.size()]); - final double[] filteredProbArray = new double[activeProbArray.length]; - final int FILTER_SIZE = ( presetRegions ? 0 : 50 ); // BUGBUG: needs to be set-able by the walker author - final int MAX_ACTIVE_REGION = ( presetRegions ? 
16001 : 425 ); // BUGBUG: needs to be set-able by the walker author - for( int iii = 0; iii < activeProbArray.length; iii++ ) { - double maxVal = 0; - for( int jjj = Math.max(0, iii-FILTER_SIZE); jjj < Math.min(activeList.size(), iii+FILTER_SIZE+1); jjj++ ) { - if( activeProbArray[jjj] > maxVal ) { maxVal = activeProbArray[jjj]; } - } - filteredProbArray[iii] = maxVal; - } - - boolean curStatus = filteredProbArray[0] > ACTIVE_PROB_THRESHOLD; - int curStart = 0; - for(int iii = 1; iii < filteredProbArray.length; iii++ ) { - final boolean thisStatus = filteredProbArray[iii] > ACTIVE_PROB_THRESHOLD; - if( curStatus != thisStatus || (iii-curStart) > MAX_ACTIVE_REGION ) { - returnList.add( new ActiveRegion( - engine.getGenomeLocParser().createGenomeLoc(firstIsActiveStart.getContig(), firstIsActiveStart.getStart() + curStart, firstIsActiveStart.getStart() + (iii-1)), - curStatus, engine.getGenomeLocParser(), activeRegionExtension ) ); - curStatus = thisStatus; - curStart = iii; - } - } - if( curStart != filteredProbArray.length-1 ) { - returnList.add( new ActiveRegion( - engine.getGenomeLocParser().createGenomeLoc(firstIsActiveStart.getContig(), firstIsActiveStart.getStart() + curStart, firstIsActiveStart.getStart() + (filteredProbArray.length-1)), - curStatus, engine.getGenomeLocParser(), activeRegionExtension ) ); - } - return returnList; - } + /** + * Special function called in LinearMicroScheduler to empty out the work queue. 
+ * Ugly for now but will be cleaned up when we push this functionality more into the engine + */ + public T endTraversal( final Walker walker, T sum) { + return processActiveRegions((ActiveRegionWalker)walker, sum, Integer.MAX_VALUE, null); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionExtension.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionExtension.java index bb007893c..d27148884 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionExtension.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionExtension.java @@ -16,4 +16,5 @@ import java.lang.annotation.RetentionPolicy; public @interface ActiveRegionExtension { public int extension() default 0; + public int maxRegion() default 1500; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java index 6403f15a2..f217268d2 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java @@ -7,10 +7,7 @@ import org.broadinstitute.sting.commandline.IntervalBinding; import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.filters.DuplicateReadFilter; -import org.broadinstitute.sting.gatk.filters.FailsVendorQualityCheckFilter; -import org.broadinstitute.sting.gatk.filters.NotPrimaryAlignmentFilter; -import org.broadinstitute.sting.gatk.filters.UnmappedReadFilter; +import org.broadinstitute.sting.gatk.filters.*; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; @@ -33,8 +30,8 @@ import java.util.List; 
@By(DataSource.READS) @Requires({DataSource.READS, DataSource.REFERENCE_BASES}) @PartitionBy(PartitionType.READ) -@ActiveRegionExtension(extension=50) -@ReadFilters({UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class}) +@ActiveRegionExtension(extension=50,maxRegion=1500) +@ReadFilters({UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class, MappingQualityUnavailableFilter.class}) public abstract class ActiveRegionWalker extends Walker { @Output(fullName="activeRegionOut", shortName="ARO", doc="Output the active region to this interval list file", required = false) @@ -45,6 +42,10 @@ public abstract class ActiveRegionWalker extends Walker implements TreeR @Output PrintStream out; - @Argument(fullName="showIndelPileups",shortName="show_indels",doc="In addition to base pileups, generate pileups of extended indel events") - public boolean SHOW_INDEL_PILEUPS = false; + @Argument(fullName="showVerbose",shortName="verbose",doc="Add an extra verbose section to the pileup output") + public boolean SHOW_VERBOSE = false; @Input(fullName="metadata",shortName="metadata",doc="Add these ROD bindings to the output Pileup", required=false) public List> rods = Collections.emptyList(); @@ -74,28 +75,18 @@ public class PileupWalker extends LocusWalker implements TreeR public void initialize() { } - public boolean generateExtendedEvents() { return SHOW_INDEL_PILEUPS; } - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { String rods = getReferenceOrderedData( tracker ); if ( context.hasBasePileup() ) { ReadBackedPileup basePileup = context.getBasePileup(); - out.printf("%s %s%n", basePileup.getPileupString(ref.getBaseAsChar()), rods); + out.printf("%s %s", basePileup.getPileupString((char)ref.getBase()), rods); + if ( SHOW_VERBOSE ) + out.printf(" %s", createVerboseOutput(basePileup)); + out.println(); } 
- if ( context.hasExtendedEventPileup() ) { - ReadBackedExtendedEventPileup indelPileup = context.getExtendedEventPileup(); - List> eventCounts = indelPileup.getEventStringsWithCounts(ref.getBases()); - - out.printf("%s %s ", indelPileup.getShortPileupString(), rods); - int i = 0; - for ( ; i < eventCounts.size() - 1 ; i++ ) { - out.printf("%s:%d,",eventCounts.get(i).first,eventCounts.get(i).second); - } - out.printf("%s:%d%n",eventCounts.get(i).first,eventCounts.get(i).second); - } return 1; } @@ -126,6 +117,31 @@ public class PileupWalker extends LocusWalker implements TreeR return rodString; } + private static final String verboseDelimiter = "@"; // it's ugly to use "@" but it's literally the only usable character not allowed in read names + + private static String createVerboseOutput(final ReadBackedPileup pileup) { + final StringBuilder sb = new StringBuilder(); + boolean isFirst = true; + + sb.append(pileup.getNumberOfDeletions()); + sb.append(" "); + + for ( PileupElement p : pileup ) { + if ( isFirst ) + isFirst = false; + else + sb.append(","); + sb.append(p.getRead().getReadName()); + sb.append(verboseDelimiter); + sb.append(p.getOffset()); + sb.append(verboseDelimiter); + sb.append(p.getRead().getReadLength()); + sb.append(verboseDelimiter); + sb.append(p.getRead().getMappingQuality()); + } + return sb.toString(); + } + @Override public void onTraversalDone(Integer result) { out.println("[REDUCE RESULT] Traversal result is: " + result); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReadsWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReadsWalker.java index 0702b08c1..cb2944d31 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReadsWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReadsWalker.java @@ -27,7 +27,6 @@ package org.broadinstitute.sting.gatk.walkers; import net.sf.samtools.SAMFileWriter; import net.sf.samtools.SAMReadGroupRecord; -import 
net.sf.samtools.SAMRecord; import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; @@ -91,7 +90,7 @@ import java.util.TreeSet; */ @BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = BAQ.ApplicationTime.ON_OUTPUT) @Requires({DataSource.READS, DataSource.REFERENCE}) -public class PrintReadsWalker extends ReadWalker { +public class PrintReadsWalker extends ReadWalker { @Output(doc="Write output to this BAM filename instead of STDOUT") SAMFileWriter out; @@ -129,6 +128,13 @@ public class PrintReadsWalker extends ReadWalker { @Argument(fullName="sample_name", shortName="sn", doc="Sample name to be included in the analysis. Can be specified multiple times.", required=false) public Set sampleNames = new TreeSet(); + /** + * Erase all extra attributes in the read but keep the read group information + */ + @Argument(fullName="simplify", shortName="s", doc="Simplify all reads.", required=false) + public boolean simplifyReads = false; + + private TreeSet samplesToChoose = new TreeSet(); private boolean SAMPLES_SPECIFIED = false; @@ -162,7 +168,7 @@ public class PrintReadsWalker extends ReadWalker { * The reads filter function. * * @param ref the reference bases that correspond to our read, if a reference was provided - * @param read the read itself, as a SAMRecord + * @param read the read itself, as a GATKSAMRecord * @return true if the read passes the filter, false if it doesn't */ public boolean filter(ReferenceContext ref, GATKSAMRecord read) { @@ -208,11 +214,11 @@ public class PrintReadsWalker extends ReadWalker { * The reads map function. 
* * @param ref the reference bases that correspond to our read, if a reference was provided - * @param read the read itself, as a SAMRecord + * @param read the read itself, as a GATKSAMRecord * @return the read itself */ - public SAMRecord map( ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker ) { - return read; + public GATKSAMRecord map( ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker ) { + return simplifyReads ? read.simplify() : read; } /** @@ -232,7 +238,7 @@ public class PrintReadsWalker extends ReadWalker { * @param output the output source * @return the SAMFileWriter, so that the next reduce can emit to the same source */ - public SAMFileWriter reduce( SAMRecord read, SAMFileWriter output ) { + public SAMFileWriter reduce( GATKSAMRecord read, SAMFileWriter output ) { output.addAlignment(read); return output; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/Walker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/Walker.java index 6264808f4..18c383ed9 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/Walker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/Walker.java @@ -114,35 +114,6 @@ public abstract class Walker { return false; } - /** - * This method states whether you want to see pileups of "extended events" (currently, indels only) - * at every locus that has at least one indel associated with it. Consider the following situation: - * - * ref: AT--CTGA (note that we expanded the ref here with -- to accomodate insertion in read3) - * read1: AT--CTGA (perfectly matches the ref) - * read2: AT----GA (deletion -CT w.r.t. the ref) - * read3: ATGGCTGA (insertion +GG w.r.t the ref) - * - * Normally, the locus iterator only returns read base pileups over reference bases, optionally with deleted bases - * included (see #includeReadsWithDeletionAtLoci()). 
In other words, the pileup over the second reference base (T) - * will be [T,T,T] (all reads count), for the next reference base (C) the pileup will be [C,C] (or [C,-,C] if - * #includeReadsWithDeletionAtLoci() is true), next pileup generated over the next reference - * base (T) will be either [T,T], or [T,'-',T], etc. In this default mode, a) insertions are not seen by a walker at all, and - * b) deletions are (optionally) seen only on a base-by-base basis (as the step-by-step traversal over the reference - * bases is performed). In the extended event mode, however, if there is at least one indel associated with a reference - * locus, the engine will generate an additional call to the walker's map() method, with a pileup of - * full-length extended indel/noevent calls. This call will be made after the conventional base pileup call - * at that locus. Thus, in the example above, a conventional call will be first made at the second reference base (T), - * with the [T,T,T] pileup of read bases, then an extended event call will be made at the same locus with - * pileup [no_event, -CT, +GG] (i.e. extended events associated with that reference base). After that, the traversal - * engine will move to the next reference base. - * - * @return false if you do not want to receive extra pileups with extended events, or true if you do. 
- */ - public boolean generateExtendedEvents() { - return false; - } - public void initialize() { } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalance.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalance.java index 833107bd3..04c7ab756 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalance.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalance.java @@ -30,10 +30,12 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; +import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.GenotypesContext; import org.broadinstitute.sting.utils.variantcontext.VariantContext; @@ -49,72 +51,101 @@ import java.util.Map; */ public class AlleleBalance extends InfoFieldAnnotation { + + char[] BASES = {'A','C','G','T'}; public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { if ( stratifiedContexts.size() == 0 ) return null; - + if ( !vc.isBiallelic() ) return null; final GenotypesContext genotypes = vc.getGenotypes(); if ( !vc.hasGenotypes() ) return null; - double ratio = 0.0; - double totalWeights = 0.0; + double ratioHom = 0.0; + double ratioHet = 0.0; + double weightHom 
= 0.0; + double weightHet = 0.0; + double overallNonDiploid = 0.0; for ( Genotype genotype : genotypes ) { // we care only about het calls - if ( !genotype.isHet() ) - continue; AlignmentContext context = stratifiedContexts.get(genotype.getSampleName()); - if ( context == null ) + if ( context == null || !context.hasBasePileup() ) continue; - if ( vc.isSNP() && context.hasBasePileup() ) { - final String bases = new String(context.getBasePileup().getBases()); + final ReadBackedPileup pileup = context.getBasePileup(); + if ( vc.isSNP() ) { + final String bases = new String(pileup.getBases()); if ( bases.length() == 0 ) return null; - char refChr = vc.getReference().toString().charAt(0); - char altChr = vc.getAlternateAllele(0).toString().charAt(0); - int refCount = MathUtils.countOccurrences(refChr, bases); - int altCount = MathUtils.countOccurrences(altChr, bases); + double pTrue = 1.0 - Math.pow(10.0,genotype.getLog10PError()); + if ( genotype.isHet() ) { + final char refChr = vc.getReference().toString().charAt(0); + final char altChr = vc.getAlternateAllele(0).toString().charAt(0); - // sanity check - if ( refCount + altCount == 0 ) - continue; + final int refCount = MathUtils.countOccurrences(refChr, bases); + final int altCount = MathUtils.countOccurrences(altChr, bases); + final int otherCount = bases.length()-refCount-altCount; - // weight the allele balance by genotype quality so that e.g. mis-called homs don't affect the ratio too much - ratio += genotype.getLog10PError() * ((double)refCount / (double)(refCount + altCount)); - totalWeights += genotype.getLog10PError(); - } else if ( vc.isIndel() && context.hasExtendedEventPileup() ) { - final ReadBackedExtendedEventPileup indelPileup = context.getExtendedEventPileup(); - if ( indelPileup == null ) { - continue; + // sanity check + if ( refCount + altCount == 0 ) + continue; + + // weight the allele balance by genotype quality so that e.g. 
mis-called homs don't affect the ratio too much + ratioHet += pTrue * ((double)refCount / (double)(refCount + altCount)); + weightHet += pTrue; + overallNonDiploid += ( (double) otherCount )/(bases.length()*genotypes.size()); + } else if ( genotype.isHom() ) { + char alleleChr; + if ( genotype.isHomRef() ) { + alleleChr = vc.getReference().toString().charAt(0); + } else { + alleleChr = vc.getAlternateAllele(0).toString().charAt(0); + } + final int alleleCount = MathUtils.countOccurrences(alleleChr,bases); + int bestOtherCount = 0; + for ( char b : BASES ) { + if ( b == alleleChr ) + continue; + int count = MathUtils.countOccurrences(b,bases); + if ( count > bestOtherCount ) + bestOtherCount = count; + } + final int otherCount = bases.length() - alleleCount; + ratioHom += pTrue*( (double) alleleCount)/(alleleCount+bestOtherCount); + weightHom += pTrue; + overallNonDiploid += ((double ) otherCount)/(bases.length()*genotypes.size()); } - // todo -- actually care about indel length from the pileup (agnostic at the moment) - int refCount = indelPileup.getNumberOfElements(); - int altCount = vc.isSimpleInsertion() ? indelPileup.getNumberOfInsertions() : indelPileup.getNumberOfDeletions(); - - if ( refCount + altCount == 0 ) { - continue; - } - - ratio += /* todo -- make not uniform */ 1 * ((double) refCount) / (double) (refCount + altCount); - totalWeights += 1; + // Allele Balance for indels was not being computed correctly (since there was no allele matching). Instead of + // prolonging the life of imperfect code, I've decided to delete it. If someone else wants to try again from + // scratch, be my guest - but make sure it's done correctly! 
[EB] } } // make sure we had a het genotype - if ( MathUtils.compareDoubles(totalWeights, 0.0) == 0 ) - return null; Map map = new HashMap(); - map.put(getKeyNames().get(0), String.format("%.3f", (ratio / totalWeights))); + if ( weightHet > 0.0 ) { + map.put("ABHet",ratioHet/weightHet); + } + + if ( weightHom > 0.0 ) { + map.put("ABHom",ratioHom/weightHom); + } + + if ( overallNonDiploid > 0.0 ) { + map.put("OND",overallNonDiploid); + } return map; } - public List getKeyNames() { return Arrays.asList("AB"); } - public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("AB", 1, VCFHeaderLineType.Float, "Allele Balance for hets (ref/(ref+alt))")); } + public List getKeyNames() { return Arrays.asList("ABHet","ABHom","OND"); } + + public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("ABHet", 1, VCFHeaderLineType.Float, "Allele Balance for hets (ref/(ref+alt))"), + new VCFInfoHeaderLine("ABHom", 1, VCFHeaderLineType.Float, "Allele Balance for homs (A/(A+O))"), + /* OND is accumulated in annotate() as otherCount/(bases.length()*genotypes.size()), i.e. the fraction of bases matching none of the genotype's alleles */ new VCFInfoHeaderLine("OND", 1, VCFHeaderLineType.Float, "Overall non-diploid ratio (non-alleles/(alleles+non-alleles))")); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java index 97a4ac468..8bc5f06f4 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java @@ -5,16 +5,14 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.variantcontext.Allele; -import java.util.Arrays; -import
java.util.HashMap; -import java.util.LinkedHashMap; -import java.util.List; +import java.util.*; /** - * The phred-scaled p-value (u-based z-approximation) from the Mann-Whitney Rank Sum Test for base qualities (ref bases vs. bases of the alternate allele). + * The u-based z-approximation from the Mann-Whitney Rank Sum Test for base qualities (ref bases vs. bases of the alternate allele). * Note that the base quality rank sum test can not be calculated for homozygous sites. */ public class BaseQualityRankSumTest extends RankSumTest { @@ -31,8 +29,31 @@ public class BaseQualityRankSumTest extends RankSumTest { altQuals.add((double)p.getQual()); } } - } + protected void fillQualsFromPileup(final Allele ref, final List alts, final int refLoc, final Map> stratifiedContext, final List refQuals, final List altQuals) { + // TODO -- implement me; how do we pull out the correct offset from the read? + return; + +/* + for ( final Map.Entry> alleleBin : stratifiedContext.entrySet() ) { + final boolean matchesRef = ref.equals(alleleBin.getKey()); + final boolean matchesAlt = alts.contains(alleleBin.getKey()); + if ( !matchesRef && !matchesAlt ) + continue; + + for ( final GATKSAMRecord read : alleleBin.getValue() ) { + + if ( isUsableBase(p) ) { + if ( matchesRef ) + refQuals.add((double)p.getQual()); + else + altQuals.add((double)p.getQual()); + } + } + } +*/ + } + protected void fillIndelQualsFromPileup(ReadBackedPileup pileup, List refQuals, List altQuals) { // equivalent is whether indel likelihoods for reads corresponding to ref allele are more likely than reads corresponding to alt allele ? 
HashMap> indelLikelihoodMap = IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ChromosomeCounts.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ChromosomeCounts.java index 0acd3e841..057dba1f7 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ChromosomeCounts.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ChromosomeCounts.java @@ -25,9 +25,11 @@ package org.broadinstitute.sting.gatk.walkers.annotator; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBasedAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; @@ -35,13 +37,14 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineCount; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.variantcontext.Allele; +import org.broadinstitute.sting.gatk.walkers.Walker; +import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; -import java.util.Arrays; -import java.util.HashMap; -import java.util.List; -import java.util.Map; +import java.util.*; /** @@ -49,17 +52,31 @@ 
import java.util.Map; * allele Frequency, for each ALT allele, in the same order as listed; total number * of alleles in called genotypes. */ -public class ChromosomeCounts extends InfoFieldAnnotation implements StandardAnnotation { +public class ChromosomeCounts extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation { private String[] keyNames = { VCFConstants.ALLELE_NUMBER_KEY, VCFConstants.ALLELE_COUNT_KEY, VCFConstants.ALLELE_FREQUENCY_KEY }; private VCFInfoHeaderLine[] descriptions = { new VCFInfoHeaderLine(VCFConstants.ALLELE_FREQUENCY_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Float, "Allele Frequency, for each ALT allele, in the same order as listed"), new VCFInfoHeaderLine(VCFConstants.ALLELE_COUNT_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Integer, "Allele count in genotypes, for each ALT allele, in the same order as listed"), new VCFInfoHeaderLine(VCFConstants.ALLELE_NUMBER_KEY, 1, VCFHeaderLineType.Integer, "Total number of alleles in called genotypes") }; + private Set founderIds = new HashSet(); + public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { if ( ! vc.hasGenotypes() ) return null; + return VariantContextUtils.calculateChromosomeCounts(vc, new HashMap(), true,founderIds); + } + + public void initialize ( AnnotatorCompatibleWalker walker, GenomeAnalysisEngine toolkit, Set headerLines ){ + //If families were given, get the founders ids + founderIds = ((Walker)walker).getSampleDB().getFounderIds(); + } + + public Map annotate(Map>> stratifiedContexts, VariantContext vc) { + if ( ! 
vc.hasGenotypes() ) + return null; + return VariantContextUtils.calculateChromosomeCounts(vc, new HashMap(), true); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java index ab38b69cd..f94d48893 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java @@ -3,12 +3,15 @@ package org.broadinstitute.sting.gatk.walkers.annotator; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBasedAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.Arrays; @@ -33,7 +36,7 @@ import java.util.Map; * Note that the DP is affected by downsampling (-dcov) though, so the max value one can obtain for N samples with * -dcov D is N * D */ -public class DepthOfCoverage extends InfoFieldAnnotation implements StandardAnnotation { +public class DepthOfCoverage extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation { public Map annotate(RefMetaDataTracker tracker, 
AnnotatorCompatibleWalker walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { if ( stratifiedContexts.size() == 0 ) @@ -41,7 +44,23 @@ public class DepthOfCoverage extends InfoFieldAnnotation implements StandardAnno int depth = 0; for ( Map.Entry sample : stratifiedContexts.entrySet() ) - depth += sample.getValue().hasBasePileup() ? sample.getValue().getBasePileup().depthOfCoverage() : sample.getValue().getExtendedEventPileup().depthOfCoverage(); + depth += sample.getValue().hasBasePileup() ? sample.getValue().getBasePileup().depthOfCoverage() : 0; + Map map = new HashMap(); + map.put(getKeyNames().get(0), String.format("%d", depth)); + return map; + } + + public Map annotate(Map>> stratifiedContexts, VariantContext vc) { + if ( stratifiedContexts.size() == 0 ) + return null; + + int depth = 0; + for ( final Map> alleleBins : stratifiedContexts.values() ) { + for ( final List alleleBin : alleleBins.values() ) { + depth += alleleBin.size(); + } + } + Map map = new HashMap(); map.put(getKeyNames().get(0), String.format("%d", depth)); return map; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java index 5d706d9c5..acb1e378a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java @@ -9,9 +9,7 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnota import org.broadinstitute.sting.utils.codecs.vcf.VCFFormatHeaderLine; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineCount; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; -import org.broadinstitute.sting.utils.pileup.ExtendedEventPileupElement; import org.broadinstitute.sting.utils.pileup.PileupElement; -import 
org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.Genotype; @@ -44,9 +42,9 @@ import java.util.Map; */ public class DepthPerAlleleBySample extends GenotypeAnnotation implements StandardAnnotation { - private static String REF_ALLELE = "REF"; + private static final String REF_ALLELE = "REF"; - private static String DEL = "DEL"; // constant, for speed: no need to create a key string for deletion allele every time + private static final String DEL = "DEL"; // constant, for speed: no need to create a key string for deletion allele every time public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, AlignmentContext stratifiedContext, VariantContext vc, Genotype g) { if ( g == null || !g.isCalled() ) @@ -62,7 +60,8 @@ public class DepthPerAlleleBySample extends GenotypeAnnotation implements Standa private Map annotateSNP(AlignmentContext stratifiedContext, VariantContext vc) { - if ( ! stratifiedContext.hasBasePileup() ) return null; + if ( ! stratifiedContext.hasBasePileup() ) + return null; HashMap alleleCounts = new HashMap(); for ( Allele allele : vc.getAlleles() ) @@ -87,17 +86,16 @@ public class DepthPerAlleleBySample extends GenotypeAnnotation implements Standa private Map annotateIndel(AlignmentContext stratifiedContext, VariantContext vc) { - if ( ! stratifiedContext.hasExtendedEventPileup() ) { + if ( ! 
stratifiedContext.hasBasePileup() ) return null; - } - ReadBackedExtendedEventPileup pileup = stratifiedContext.getExtendedEventPileup(); + ReadBackedPileup pileup = stratifiedContext.getBasePileup(); if ( pileup == null ) return null; - HashMap alleleCounts = new HashMap(); - alleleCounts.put(REF_ALLELE,0); - Allele refAllele = vc.getReference(); + final HashMap alleleCounts = new HashMap(); + alleleCounts.put(REF_ALLELE, 0); + final Allele refAllele = vc.getReference(); for ( Allele allele : vc.getAlternateAlleles() ) { @@ -108,33 +106,24 @@ public class DepthPerAlleleBySample extends GenotypeAnnotation implements Standa alleleCounts.put(getAlleleRepresentation(allele), 0); } - for ( ExtendedEventPileupElement e : pileup.toExtendedIterable() ) { - if ( e.isInsertion() ) { + for ( PileupElement p : pileup ) { + if ( p.isBeforeInsertion() ) { - final String b = e.getEventBases(); + final String b = p.getEventBases(); if ( alleleCounts.containsKey(b) ) { alleleCounts.put(b, alleleCounts.get(b)+1); } - } else { - if ( e.isDeletion() ) { - if ( e.getEventLength() == refAllele.length() ) { + } else if ( p.isBeforeDeletionStart() ) { + if ( p.getEventLength() == refAllele.length() ) { // this is indeed the deletion allele recorded in VC final String b = DEL; if ( alleleCounts.containsKey(b) ) { alleleCounts.put(b, alleleCounts.get(b)+1); } } -// else { -// System.out.print(" deletion of WRONG length found"); -// } - } - else { - if ( e.getRead().getAlignmentEnd() <= vc.getStart() ) { - continue; - } - alleleCounts.put(REF_ALLELE,alleleCounts.get(REF_ALLELE)+1); - } + } else if ( p.getRead().getAlignmentEnd() > vc.getStart() ) { + alleleCounts.put(REF_ALLELE, alleleCounts.get(REF_ALLELE)+1); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java index 6a825cba7..8af69d862 100755 --- 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java @@ -28,6 +28,7 @@ import cern.jet.math.Arithmetic; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBasedAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; @@ -37,6 +38,7 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.VariantContext; @@ -49,7 +51,7 @@ import java.util.*; * indicative of false positive calls. Note that the fisher strand test may not be * calculated for certain complex indel cases or for multi-allelic sites. 
*/ -public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotation { +public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation { private static final String FS = "FS"; private static final double MIN_PVALUE = 1E-320; @@ -78,6 +80,22 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat return map; } + public Map annotate(Map>> stratifiedContexts, VariantContext vc) { + if ( !vc.isVariant() ) + return null; + + final int[][] table = getContingencyTable(stratifiedContexts, vc.getReference(), vc.getAltAlleleWithHighestAlleleCount()); + + final Double pvalue = Math.max(pValueForContingencyTable(table), MIN_PVALUE); + if ( pvalue == null ) + return null; + + final Map map = new HashMap(); + map.put(FS, String.format("%.3f", QualityUtils.phredScaleErrorRate(pvalue))); + return map; + + } + public List getKeyNames() { return Arrays.asList(FS); } @@ -193,6 +211,38 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat return sum; } + /** + Allocate and fill a 2x2 strand contingency table. In the end, it'll look something like this: + * fw rc + * allele1 # # + * allele2 # # + * @return a 2x2 contingency table + */ + private static int[][] getContingencyTable(Map>> stratifiedContexts, Allele ref, Allele alt) { + int[][] table = new int[2][2]; + + for ( final Map> alleleBins : stratifiedContexts.values() ) { + for ( final Map.Entry> alleleBin : alleleBins.entrySet() ) { + + final boolean matchesRef = ref.equals(alleleBin.getKey()); + final boolean matchesAlt = alt.equals(alleleBin.getKey()); + if ( !matchesRef && !matchesAlt ) + continue; + + for ( final GATKSAMRecord read : alleleBin.getValue() ) { + boolean isFW = !read.getReadNegativeStrandFlag(); /* flag set means reverse strand; negate so column 0 is fw, matching the table layout documented above and the pileup-based overload */ + + int row = matchesRef ? 0 : 1; + int column = isFW ? 0 : 1; + + table[row][column]++; + } + } + } + + return table; + } + /** Allocate and fill a 2x2 strand contingency table.
In the end, it'll look something like this: * fw rc @@ -214,8 +264,8 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat Allele base = Allele.create(p.getBase(), false); boolean isFW = !p.getRead().getReadNegativeStrandFlag(); - boolean matchesRef = ref.equals(base, true); - boolean matchesAlt = alt.equals(base, true); + final boolean matchesRef = ref.equals(base, true); + final boolean matchesAlt = alt.equals(base, true); if ( matchesRef || matchesAlt ) { int row = matchesRef ? 0 : 1; int column = isFW ? 0 : 1; @@ -227,6 +277,7 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat return table; } + /** Allocate and fill a 2x2 strand contingency table. In the end, it'll look something like this: * fw rc @@ -245,24 +296,16 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat for ( String sample : stratifiedContexts.keySet() ) { final AlignmentContext context = stratifiedContexts.get(sample); - if ( context == null ) + if ( context == null || !context.hasBasePileup() ) continue; - ReadBackedPileup pileup = null; - if (context.hasExtendedEventPileup()) - pileup = context.getExtendedEventPileup(); - else if (context.hasBasePileup()) - pileup = context.getBasePileup(); - - if (pileup == null) - continue; - - for (final PileupElement p: pileup) { + final ReadBackedPileup pileup = context.getBasePileup(); + for ( final PileupElement p : pileup ) { if ( p.getRead().isReducedRead() ) // ignore reduced reads continue; - if ( p.getRead().getMappingQuality() < 20) + if ( p.getRead().getMappingQuality() < 20 ) continue; - if (indelLikelihoodMap.containsKey(p)) { + if ( indelLikelihoodMap.containsKey(p) ) { // to classify a pileup element as ref or alt, we look at the likelihood associated with the allele associated to this element. // A pileup element then has a list of pairs of form (Allele, likelihood of this allele). 
// To classify a pileup element as Ref or Alt, we look at the likelihood of corresponding alleles. diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java index f323a7be2..6abfdc7d2 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java @@ -64,6 +64,9 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot if (stratifiedContexts.size() == 0) // size 0 means that call was made by someone else and we have no data here return null; + if (!vc.isSNP() && !vc.isIndel() && !vc.isMixed()) + return null; + final AlignmentContext context = AlignmentContextUtils.joinContexts(stratifiedContexts.values()); final int contextWingSize = Math.min((ref.getWindow().size() - 1) / 2, MIN_CONTEXT_WING_SIZE); @@ -71,41 +74,27 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot final int locus = ref.getLocus().getStart() + (ref.getLocus().getStop() - ref.getLocus().getStart()) / 2; - // Compute all haplotypes consistent with the current read pileup - ReadBackedPileup pileup = null; - if (context.hasExtendedEventPileup()) - pileup = context.getExtendedEventPileup(); - else if (context.hasBasePileup()) - pileup = context.getBasePileup(); - - if (pileup == null) + if ( !context.hasBasePileup() ) return null; + final ReadBackedPileup pileup = context.getBasePileup(); + + // Compute all haplotypes consistent with the current read pileup final List haplotypes = computeHaplotypes(pileup, contextSize, locus, vc); final MathUtils.RunningAverage scoreRA = new MathUtils.RunningAverage(); if (haplotypes != null) { for (final Genotype genotype : vc.getGenotypes()) { final AlignmentContext thisContext = stratifiedContexts.get(genotype.getSampleName()); - if (thisContext != null) { - final 
ReadBackedPileup thisPileup; - if (thisContext.hasExtendedEventPileup()) - thisPileup = thisContext.getExtendedEventPileup(); - else if (thisContext.hasBasePileup()) - thisPileup = thisContext.getBasePileup(); - else - thisPileup = null; - - if (thisPileup != null) { - if (vc.isSNP()) - scoreRA.add(scoreReadsAgainstHaplotypes(haplotypes, thisPileup, contextSize, locus)); // Taking the simple average of all sample's score since the score can be negative and the RMS doesn't make sense - else if (vc.isIndel() || vc.isMixed()) { - Double d = scoreIndelsAgainstHaplotypes(thisPileup); - if (d == null) - return null; - scoreRA.add(d); // Taking the simple average of all sample's score since the score can be negative and the RMS doesn't make sense - } else + if (thisContext != null && thisContext.hasBasePileup()) { + final ReadBackedPileup thisPileup = thisContext.getBasePileup(); + if (vc.isSNP()) + scoreRA.add(scoreReadsAgainstHaplotypes(haplotypes, thisPileup, contextSize, locus)); // Taking the simple average of all sample's score since the score can be negative and the RMS doesn't make sense + else if (vc.isIndel() || vc.isMixed()) { + Double d = scoreIndelsAgainstHaplotypes(thisPileup); + if (d == null) return null; + scoreRA.add(d); // Taking the simple average of all sample's score since the score can be negative and the RMS doesn't make sense } } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/InbreedingCoeff.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/InbreedingCoeff.java index 6366890d5..57561a277 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/InbreedingCoeff.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/InbreedingCoeff.java @@ -3,12 +3,15 @@ package org.broadinstitute.sting.gatk.walkers.annotator; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import 
org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBasedAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.GenotypesContext; import org.broadinstitute.sting.utils.variantcontext.VariantContext; @@ -27,12 +30,19 @@ import java.util.Map; * more information. Note that the Inbreeding Coefficient will not be calculated for files * with fewer than a minimum (generally 10) number of samples. 
*/ -public class InbreedingCoeff extends InfoFieldAnnotation implements StandardAnnotation { +public class InbreedingCoeff extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation { private static final int MIN_SAMPLES = 10; public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + return calculateIC(vc); + } + public Map annotate(Map>> stratifiedContexts, VariantContext vc) { + return calculateIC(vc); + } + + private Map calculateIC(final VariantContext vc) { final GenotypesContext genotypes = vc.getGenotypes(); if ( genotypes == null || genotypes.size() < MIN_SAMPLES ) return null; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java index aa4f26ef3..4ce19e824 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java @@ -6,16 +6,14 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.variantcontext.Allele; -import java.util.Arrays; -import java.util.HashMap; -import java.util.LinkedHashMap; -import java.util.List; +import java.util.*; /** - * The phred-scaled p-value (u-based z-approximation) from the Mann-Whitney Rank Sum Test for mapping qualities (reads with ref bases vs. those with the alternate allele) + * The u-based z-approximation from the Mann-Whitney Rank Sum Test for mapping qualities (reads with ref bases vs. 
those with the alternate allele) * Note that the mapping quality rank sum test can not be calculated for homozygous sites. */ public class MappingQualityRankSumTest extends RankSumTest { @@ -35,6 +33,23 @@ public class MappingQualityRankSumTest extends RankSumTest { } } } + + protected void fillQualsFromPileup(final Allele ref, final List alts, final int refLoc, final Map> stratifiedContext, final List refQuals, final List altQuals) { + for ( final Map.Entry> alleleBin : stratifiedContext.entrySet() ) { + final boolean matchesRef = ref.equals(alleleBin.getKey()); + final boolean matchesAlt = alts.contains(alleleBin.getKey()); + if ( !matchesRef && !matchesAlt ) + continue; + + for ( final GATKSAMRecord read : alleleBin.getValue() ) { + if ( matchesRef ) + refQuals.add((double)read.getMappingQuality()); + else + altQuals.add((double)read.getMappingQuality()); + } + } + } + protected void fillIndelQualsFromPileup(ReadBackedPileup pileup, List refQuals, List altQuals) { // equivalent is whether indel likelihoods for reads corresponding to ref allele are more likely than reads corresponding to alt allele ? 
HashMap> indelLikelihoodMap = IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZero.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZero.java index 3a3efc4e8..191c00a32 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZero.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZero.java @@ -30,14 +30,9 @@ public class MappingQualityZero extends InfoFieldAnnotation implements StandardA int mq0 = 0; for ( Map.Entry sample : stratifiedContexts.entrySet() ) { - AlignmentContext context = sample.getValue(); - ReadBackedPileup pileup = null; - if (context.hasExtendedEventPileup()) - pileup = context.getExtendedEventPileup(); - else if (context.hasBasePileup()) - pileup = context.getBasePileup(); - - if (pileup != null) { + final AlignmentContext context = sample.getValue(); + if ( context.hasBasePileup() ) { + final ReadBackedPileup pileup = context.getBasePileup(); for (PileupElement p : pileup ) { if ( p.getMappingQual() == 0 ) mq0++; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZeroBySample.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZeroBySample.java index f14d7a8a5..b1c037ba3 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZeroBySample.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZeroBySample.java @@ -53,14 +53,8 @@ public class MappingQualityZeroBySample extends GenotypeAnnotation { return null; int mq0 = 0; - ReadBackedPileup pileup = null; - if (vc.isIndel() && context.hasExtendedEventPileup()) - pileup = context.getExtendedEventPileup(); - else if (context.hasBasePileup()) - pileup = context.getBasePileup(); - else return null; - - if (pileup != null) { + if ( 
context.hasBasePileup() ) { + final ReadBackedPileup pileup = context.getBasePileup(); for (PileupElement p : pileup ) { if ( p.getMappingQual() == 0 ) mq0++; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZeroFraction.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZeroFraction.java index 2164537b8..1315a6c52 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZeroFraction.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZeroFraction.java @@ -31,13 +31,8 @@ public class MappingQualityZeroFraction extends InfoFieldAnnotation implements E for ( Map.Entry sample : stratifiedContexts.entrySet() ) { AlignmentContext context = sample.getValue(); depth += context.size(); - ReadBackedPileup pileup = null; - if (context.hasExtendedEventPileup()) - pileup = context.getExtendedEventPileup(); - else if (context.hasBasePileup()) - pileup = context.getBasePileup(); - - if (pileup != null) { + if ( context.hasBasePileup() ) { + final ReadBackedPileup pileup = context.getBasePileup(); for (PileupElement p : pileup ) { if ( p.getMappingQual() == 0 ) mq0++; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java index 6638fc7a8..24a107235 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java @@ -3,11 +3,14 @@ package org.broadinstitute.sting.gatk.walkers.annotator; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBasedAnnotation; import 
org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.GenotypesContext; import org.broadinstitute.sting.utils.variantcontext.VariantContext; @@ -23,7 +26,7 @@ import java.util.Map; * Low scores are indicative of false positive calls and artifacts. Note that QualByDepth requires sequencing * reads associated with the samples with polymorphic genotypes. */ -public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotation { +public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation { public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { if ( stratifiedContexts.size() == 0 ) @@ -45,7 +48,7 @@ public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotati if ( context == null ) continue; - depth += context.hasBasePileup() ? context.getBasePileup().depthOfCoverage() : context.getExtendedEventPileup().depthOfCoverage(); + depth += context.hasBasePileup() ? 
context.getBasePileup().depthOfCoverage() : 0; } if ( depth == 0 ) @@ -62,4 +65,40 @@ public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotati public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine(getKeyNames().get(0), 1, VCFHeaderLineType.Float, "Variant Confidence/Quality by Depth")); } + public Map annotate(Map>> stratifiedContexts, VariantContext vc) { + if ( stratifiedContexts.size() == 0 ) + return null; + + final GenotypesContext genotypes = vc.getGenotypes(); + if ( genotypes == null || genotypes.size() == 0 ) + return null; + + int depth = 0; + + for ( final Genotype genotype : genotypes ) { + + // we care only about variant calls with likelihoods + if ( !genotype.isHet() && !genotype.isHomVar() ) + continue; + + final Map> alleleBins = stratifiedContexts.get(genotype.getSampleName()); + if ( alleleBins == null ) + continue; + + for ( final Map.Entry> alleleBin : alleleBins.entrySet() ) { + if ( !alleleBin.getKey().equals(Allele.NO_CALL) ) + depth += alleleBin.getValue().size(); + } + } + + if ( depth == 0 ) + return null; + + double QD = -10.0 * vc.getLog10PError() / (double)depth; + + Map map = new HashMap(); + map.put(getKeyNames().get(0), String.format("%.2f", QD)); + return map; + } + } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RMSMappingQuality.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RMSMappingQuality.java index 40f6d20d3..ea7d6ae33 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RMSMappingQuality.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RMSMappingQuality.java @@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.walkers.annotator; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import 
org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBasedAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; @@ -13,6 +14,8 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.Arrays; @@ -24,7 +27,7 @@ import java.util.Map; /** * Root Mean Square of the mapping quality of the reads across all samples. */ -public class RMSMappingQuality extends InfoFieldAnnotation implements StandardAnnotation { +public class RMSMappingQuality extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation { public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { if ( stratifiedContexts.size() == 0 ) @@ -34,18 +37,13 @@ public class RMSMappingQuality extends InfoFieldAnnotation implements StandardAn for ( AlignmentContext context : stratifiedContexts.values() ) totalSize += context.size(); - int[] qualities = new int[totalSize]; + final int[] qualities = new int[totalSize]; int index = 0; for ( Map.Entry sample : stratifiedContexts.entrySet() ) { AlignmentContext context = sample.getValue(); - ReadBackedPileup pileup = null; - if (context.hasExtendedEventPileup()) - pileup = context.getExtendedEventPileup(); - else if (context.hasBasePileup()) - pileup = context.getBasePileup(); - - if (pileup != null) { + if ( 
context.hasBasePileup() ) { + final ReadBackedPileup pileup = context.getBasePileup(); for (PileupElement p : pileup ) { if ( p.getMappingQual() != QualityUtils.MAPPING_QUALITY_UNAVAILABLE ) qualities[index++] = p.getMappingQual(); @@ -59,6 +57,34 @@ public class RMSMappingQuality extends InfoFieldAnnotation implements StandardAn return map; } + public Map annotate(Map>> stratifiedContexts, VariantContext vc) { + if ( stratifiedContexts.size() == 0 ) + return null; + + int depth = 0; + for ( final Map> alleleBins : stratifiedContexts.values() ) { + for ( final Map.Entry> alleleBin : alleleBins.entrySet() ) { + depth += alleleBin.getValue().size(); + } + } + + final int[] qualities = new int[depth]; + int index = 0; + + for ( final Map> alleleBins : stratifiedContexts.values() ) { + for ( final List reads : alleleBins.values() ) { + for ( final GATKSAMRecord read : reads ) { + if ( read.getMappingQuality() != QualityUtils.MAPPING_QUALITY_UNAVAILABLE ) + qualities[index++] = read.getMappingQuality(); + } + } + } + + final Map map = new HashMap(); + map.put(getKeyNames().get(0), String.format("%.2f", MathUtils.rms(qualities))); + return map; + } + public List getKeyNames() { return Arrays.asList(VCFConstants.RMS_MAPPING_QUALITY_KEY); } public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine(getKeyNames().get(0), 1, VCFHeaderLineType.Float, "RMS Mapping Quality")); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java index 00968943d..ad9600edf 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java @@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.walkers.annotator; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import 
org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBasedAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; @@ -12,6 +13,7 @@ import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.GenotypesContext; @@ -26,7 +28,7 @@ import java.util.Map; /** * Abstract root for all RankSum based annotations */ -public abstract class RankSumTest extends InfoFieldAnnotation implements StandardAnnotation { +public abstract class RankSumTest extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation { static final double INDEL_LIKELIHOOD_THRESH = 0.1; static final boolean DEBUG = false; @@ -38,7 +40,6 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements Standar if (genotypes == null || genotypes.size() == 0) return null; - final ArrayList refQuals = new ArrayList(); final ArrayList altQuals = new ArrayList(); @@ -62,12 +63,10 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements Standar continue; } - ReadBackedPileup pileup = null; - if (context.hasExtendedEventPileup()) - pileup = context.getExtendedEventPileup(); - else if (context.hasBasePileup()) - pileup = context.getBasePileup(); + if (!context.hasBasePileup()) + continue; + final ReadBackedPileup pileup = context.getBasePileup(); if (pileup == 
null) continue; @@ -106,12 +105,52 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements Standar if (!Double.isNaN(testResults.first)) map.put(getKeyNames().get(0), String.format("%.3f", testResults.first)); return map; - } - protected abstract void fillQualsFromPileup(byte ref, List alts, ReadBackedPileup pileup, List refQuals, List altQuals); + public Map annotate(Map>> stratifiedContexts, VariantContext vc) { + if (stratifiedContexts.size() == 0) + return null; - protected abstract void fillIndelQualsFromPileup(ReadBackedPileup pileup, List refQuals, List altQuals); + final GenotypesContext genotypes = vc.getGenotypes(); + if (genotypes == null || genotypes.size() == 0) + return null; + + final ArrayList refQuals = new ArrayList(); + final ArrayList altQuals = new ArrayList(); + + for ( final Genotype genotype : genotypes.iterateInSampleNameOrder() ) { + final Map> context = stratifiedContexts.get(genotype.getSampleName()); + if ( context == null ) + continue; + + fillQualsFromPileup(vc.getReference(), vc.getAlternateAlleles(), vc.getStart(), context, refQuals, altQuals); + } + + if ( refQuals.size() == 0 || altQuals.size() == 0 ) + return null; + + final MannWhitneyU mannWhitneyU = new MannWhitneyU(); + for (final Double qual : altQuals) { + mannWhitneyU.add(qual, MannWhitneyU.USet.SET1); + } + for (final Double qual : refQuals) { + mannWhitneyU.add(qual, MannWhitneyU.USet.SET2); + } + + // we are testing that set1 (the alt bases) have lower quality scores than set2 (the ref bases) + final Pair testResults = mannWhitneyU.runOneSidedTest(MannWhitneyU.USet.SET1); + + final Map map = new HashMap(); + if (!Double.isNaN(testResults.first)) + map.put(getKeyNames().get(0), String.format("%.3f", testResults.first)); + return map; + } + + protected abstract void fillQualsFromPileup(final Allele ref, final List alts, final int refLoc, final Map> stratifiedContext, final List refQuals, List altQuals); + + protected abstract void 
fillQualsFromPileup(final byte ref, final List alts, final ReadBackedPileup pileup, final List refQuals, final List altQuals); + + protected abstract void fillIndelQualsFromPileup(final ReadBackedPileup pileup, final List refQuals, final List altQuals); protected static boolean isUsableBase(final PileupElement p) { return !(p.isInsertionAtBeginningOfRead() || diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadDepthAndAllelicFractionBySample.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadDepthAndAllelicFractionBySample.java deleted file mode 100644 index 168fbdc49..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadDepthAndAllelicFractionBySample.java +++ /dev/null @@ -1,207 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.gatk.walkers.annotator; - -import org.broadinstitute.sting.commandline.Hidden; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation; -import org.broadinstitute.sting.utils.codecs.vcf.VCFFormatHeaderLine; -import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineCount; -import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; -import org.broadinstitute.sting.utils.pileup.ExtendedEventPileupElement; -import org.broadinstitute.sting.utils.pileup.PileupElement; -import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; - -import java.util.Arrays; -import java.util.HashMap; -import java.util.List; -import java.util.Map; - -/** - * Unsupported - */ -@Hidden -public class ReadDepthAndAllelicFractionBySample extends GenotypeAnnotation { - - private static String REF_ALLELE = "REF"; - - private static String DEL = "DEL"; // constant, for speed: no need to create a key string for deletion allele every time - - public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, - AlignmentContext stratifiedContext, VariantContext vc, Genotype g) { - if ( g == null || !g.isCalled() ) - return null; - - if ( vc.isSNP() ) - return annotateSNP(stratifiedContext, vc); - if ( vc.isIndel() ) - return annotateIndel(stratifiedContext, vc); - - return null; - } - - private Map annotateSNP(AlignmentContext 
stratifiedContext, VariantContext vc) { - - if ( ! stratifiedContext.hasBasePileup() ) return null; - - HashMap alleleCounts = new HashMap(); - for ( Allele allele : vc.getAlternateAlleles() ) - alleleCounts.put(allele.getBases()[0], 0); - - ReadBackedPileup pileup = stratifiedContext.getBasePileup(); - int totalDepth = pileup.getNumberOfElements(); - - Map map = new HashMap(); - map.put(getKeyNames().get(0), totalDepth); // put total depth in right away - - if ( totalDepth == 0 ) return map; // done, can not compute FA at 0 coverage!! - - int mq0 = 0; // number of "ref" reads that are acually mq0 - for ( PileupElement p : pileup ) { - if ( p.getMappingQual() == 0 ) { - mq0++; - continue; - } - if ( alleleCounts.containsKey(p.getBase()) ) // non-mq0 read and it's an alt - alleleCounts.put(p.getBase(), alleleCounts.get(p.getBase())+1); - } - - if ( mq0 == totalDepth ) return map; // if all reads are mq0, there is nothing left to do - - // we need to add counts in the correct order - String[] fracs = new String[alleleCounts.size()]; - for (int i = 0; i < vc.getAlternateAlleles().size(); i++) { - fracs[i] = String.format("%.3f", ((float)alleleCounts.get(vc.getAlternateAllele(i).getBases()[0]))/(totalDepth-mq0)); - } - - map.put(getKeyNames().get(1), fracs); - return map; - } - - private Map annotateIndel(AlignmentContext - stratifiedContext, VariantContext - vc) { - - if ( ! 
stratifiedContext.hasExtendedEventPileup() ) { - return null; - } - - ReadBackedExtendedEventPileup pileup = stratifiedContext.getExtendedEventPileup(); - if ( pileup == null ) - return null; - int totalDepth = pileup.getNumberOfElements(); - - Map map = new HashMap(); - map.put(getKeyNames().get(0), totalDepth); // put total depth in right away - - if ( totalDepth == 0 ) return map; - int mq0 = 0; // number of "ref" reads that are acually mq0 - - HashMap alleleCounts = new HashMap(); - Allele refAllele = vc.getReference(); - - for ( Allele allele : vc.getAlternateAlleles() ) { - - if ( allele.isNoCall() ) { - continue; // this does not look so good, should we die??? - } - - alleleCounts.put(getAlleleRepresentation(allele), 0); - } - - for ( ExtendedEventPileupElement e : pileup.toExtendedIterable() ) { - - if ( e.getMappingQual() == 0 ) { - mq0++; - continue; - } - - if ( e.isInsertion() ) { - - final String b = e.getEventBases(); - if ( alleleCounts.containsKey(b) ) { - alleleCounts.put(b, alleleCounts.get(b)+1); - } - - } else { - if ( e.isDeletion() ) { - if ( e.getEventLength() == refAllele.length() ) { - // this is indeed the deletion allele recorded in VC - final String b = DEL; - if ( alleleCounts.containsKey(b) ) { - alleleCounts.put(b, alleleCounts.get(b)+1); - } - } -// else { -// System.out.print(" deletion of WRONG length found"); -// } - } - } - } - - if ( mq0 == totalDepth ) return map; - - String[] fracs = new String[alleleCounts.size()]; - for (int i = 0; i < vc.getAlternateAlleles().size(); i++) - fracs[i] = String.format("%.3f", - ((float)alleleCounts.get(getAlleleRepresentation(vc.getAlternateAllele(i))))/(totalDepth-mq0)); - - map.put(getKeyNames().get(1), fracs); - - //map.put(getKeyNames().get(0), counts); - return map; - } - - private String getAlleleRepresentation(Allele allele) { - if ( allele.isNull() ) { // deletion wrt the ref - return DEL; - } else { // insertion, pass actual bases - return allele.getBaseString(); - } - - } - - // 
public String getIndelBases() - public List getKeyNames() { return Arrays.asList("DP","FA"); } - - public List getDescriptions() { - return Arrays.asList(new VCFFormatHeaderLine(getKeyNames().get(0), - 1, - VCFHeaderLineType.Integer, - "Total read depth per sample, including MQ0"), - new VCFFormatHeaderLine(getKeyNames().get(1), - VCFHeaderLineCount.UNBOUNDED, - VCFHeaderLineType.Float, - "Fractions of reads (excluding MQ0 from both ref and alt) supporting each reported alternative allele, per sample")); - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java index a998cd08b..92e6f8536 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java @@ -11,15 +11,14 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.sam.AlignmentUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; import org.broadinstitute.sting.utils.variantcontext.Allele; -import java.util.Arrays; -import java.util.HashMap; -import java.util.LinkedHashMap; -import java.util.List; +import java.util.*; /** - * The phred-scaled p-value (u-based z-approximation) from the Mann-Whitney Rank Sum Test for the distance from the end of the read for reads with the alternate allele; if the alternate allele is only seen near the ends of reads this is indicative of error). 
+ * The u-based z-approximation from the Mann-Whitney Rank Sum Test for the distance from the end of the read for reads with the alternate allele; if the alternate allele is only seen near the ends of reads this is indicative of error). * Note that the read position rank sum test can not be calculated for homozygous sites. */ public class ReadPosRankSumTest extends RankSumTest { @@ -49,6 +48,31 @@ public class ReadPosRankSumTest extends RankSumTest { } } + protected void fillQualsFromPileup(final Allele ref, final List alts, final int refLoc, final Map> stratifiedContext, final List refQuals, final List altQuals) { + for ( final Map.Entry> alleleBin : stratifiedContext.entrySet() ) { + final boolean matchesRef = ref.equals(alleleBin.getKey()); + final boolean matchesAlt = alts.contains(alleleBin.getKey()); + if ( !matchesRef && !matchesAlt ) + continue; + + for ( final GATKSAMRecord read : alleleBin.getValue() ) { + final int offset = ReadUtils.getReadCoordinateForReferenceCoordinate( read.getUnclippedStart(), read.getCigar(), refLoc, ReadUtils.ClippingTail.RIGHT_TAIL, true ); + if ( offset == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) + continue; + int readPos = AlignmentUtils.calcAlignmentByteArrayOffset( read.getCigar(), offset, false, false, 0, 0 ); + + final int numAlignedBases = AlignmentUtils.getNumAlignedBases( read ); + if (readPos > numAlignedBases / 2) + readPos = numAlignedBases - (readPos + 1); + + if ( matchesRef ) + refQuals.add((double) readPos); + else + altQuals.add((double) readPos); + } + } + } + protected void fillIndelQualsFromPileup(ReadBackedPileup pileup, List refQuals, List altQuals) { // equivalent is whether indel likelihoods for reads corresponding to ref allele are more likely than reads corresponding to alt allele // to classify a pileup element as ref or alt, we look at the likelihood associated with the allele associated to this element. 
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SpanningDeletions.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SpanningDeletions.java index 66d2ad318..2d97f5d54 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SpanningDeletions.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SpanningDeletions.java @@ -35,13 +35,8 @@ public class SpanningDeletions extends InfoFieldAnnotation implements StandardAn int depth = 0; for ( Map.Entry sample : stratifiedContexts.entrySet() ) { AlignmentContext context = sample.getValue(); - ReadBackedPileup pileup = null; - if (context.hasExtendedEventPileup()) - pileup = context.getExtendedEventPileup(); - else if (context.hasBasePileup()) - pileup = context.getBasePileup(); - - if (pileup != null) { + if ( context.hasBasePileup() ) { + final ReadBackedPileup pileup = context.getBasePileup(); deletions += pileup.getNumberOfDeletions(); depth += pileup.getNumberOfElements(); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TechnologyComposition.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TechnologyComposition.java index 1f5508f4c..e7c3bbaad 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TechnologyComposition.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TechnologyComposition.java @@ -39,15 +39,9 @@ public class TechnologyComposition extends InfoFieldAnnotation implements Experi for ( Map.Entry sample : stratifiedContexts.entrySet() ) { AlignmentContext context = sample.getValue(); - - ReadBackedPileup pileup = null; - if (context.hasExtendedEventPileup()) - pileup = context.getExtendedEventPileup(); - else if (context.hasBasePileup()) - pileup = context.getBasePileup(); - - if (pileup != null) { - for (PileupElement p : pileup ) { + if ( context.hasBasePileup() ) { + final ReadBackedPileup pileup = 
context.getBasePileup(); + for ( PileupElement p : pileup ) { if(ReadUtils.is454Read(p.getRead())) reads454++; else if (ReadUtils.isSOLiDRead(p.getRead())) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java index 5312c4136..976f601ab 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java @@ -39,7 +39,6 @@ import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.classloader.PluginManager; import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; import java.util.*; @@ -168,20 +167,14 @@ public class VariantAnnotator extends RodWalker implements Ann protected Boolean ALWAYS_APPEND_DBSNP_ID = false; public boolean alwaysAppendDbsnpId() { return ALWAYS_APPEND_DBSNP_ID; } - @Hidden - @Argument(fullName="vcfContainsOnlyIndels", shortName="dels",doc="Use if you are annotating an indel vcf, currently VERY experimental", required = false) - protected boolean indelsOnly = false; - @Argument(fullName="MendelViolationGenotypeQualityThreshold",shortName="mvq",required=false,doc="The genotype quality treshold in order to annotate mendelian violation ratio") public double minGenotypeQualityP = 0.0; @Argument(fullName="requireStrictAlleleMatch", shortName="strict", doc="If provided only comp tracks that exactly match both reference and alternate alleles will be counted as concordant", required=false) - private boolean requireStrictAlleleMatch = false; + protected boolean requireStrictAlleleMatch = false; private VariantAnnotatorEngine engine; - private Collection indelBufferContext; - private void listAnnotationsAndExit() { 
System.out.println("\nStandard annotations in the list below are marked with a '*'."); @@ -240,7 +233,7 @@ public class VariantAnnotator extends RodWalker implements Ann for ( VCFHeaderLine line : VCFUtils.getHeaderFields(getToolkit(), Arrays.asList(expression.binding.getName())) ) { if ( line instanceof VCFInfoHeaderLine ) { VCFInfoHeaderLine infoline = (VCFInfoHeaderLine)line; - if ( infoline.getName().equals(expression.fieldName) ) { + if ( infoline.getID().equals(expression.fieldName) ) { targetHeaderLine = infoline; break; } @@ -261,10 +254,6 @@ public class VariantAnnotator extends RodWalker implements Ann VCFHeader vcfHeader = new VCFHeader(hInfo, samples); vcfWriter.writeHeader(vcfHeader); - - if ( indelsOnly ) { - indelBufferContext = null; - } } public static boolean isUniqueHeaderLine(VCFHeaderLine line, Set currentSet) { @@ -294,13 +283,6 @@ public class VariantAnnotator extends RodWalker implements Ann */ public boolean includeReadsWithDeletionAtLoci() { return true; } - /** - * We want to see extended events if annotating indels - * - * @return true - */ - public boolean generateExtendedEvents() { return indelsOnly; } - /** * For each site of interest, annotate based on the requested annotation types * @@ -322,31 +304,16 @@ public class VariantAnnotator extends RodWalker implements Ann // if the reference base is not ambiguous, we can annotate Map stratifiedContexts; if ( BaseUtils.simpleBaseToBaseIndex(ref.getBase()) != -1 ) { - if ( ! context.hasExtendedEventPileup() ) { + if ( context.hasBasePileup() ) { stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(context.getBasePileup()); - } else { - stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(context.getExtendedEventPileup()); - } - if ( stratifiedContexts != null ) { annotatedVCs = new ArrayList(VCs.size()); for ( VariantContext vc : VCs ) annotatedVCs.add(engine.annotateContext(tracker, ref, stratifiedContexts, vc)); } } - if ( ! 
indelsOnly ) { - for ( VariantContext annotatedVC : annotatedVCs ) - vcfWriter.add(annotatedVC); - } else { - // check to see if the buffered context is different (in location) this context - if ( indelBufferContext != null && ! VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(),indelBufferContext.iterator().next()).equals(VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(),annotatedVCs.iterator().next())) ) { - for ( VariantContext annotatedVC : indelBufferContext ) - vcfWriter.add(annotatedVC); - indelBufferContext = annotatedVCs; - } else { - indelBufferContext = annotatedVCs; - } - } + for ( VariantContext annotatedVC : annotatedVCs ) + vcfWriter.add(annotatedVC); return 1; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java index 90d0ad740..413c32a24 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java @@ -33,10 +33,8 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.*; import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.GenotypesContext; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.variantcontext.*; import java.util.*; @@ -94,6 +92,13 @@ public class VariantAnnotatorEngine { initializeDBs(); } + // experimental constructor for active region traversal + public 
VariantAnnotatorEngine(GenomeAnalysisEngine toolkit) { + this.walker = null; + this.toolkit = toolkit; + requestedInfoAnnotations = AnnotationInterfaceManager.createInfoFieldAnnotations(Arrays.asList("ActiveRegionBasedAnnotation"), Collections.emptyList()); + } + // select specific expressions to use public void initializeExpressions(List expressionsToUse) { // set up the expressions @@ -169,7 +174,7 @@ public class VariantAnnotatorEngine { this.requireStrictAlleleMatch = requireStrictAlleleMatch; } - public VariantContext annotateContext(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + public VariantContext annotateContext(final RefMetaDataTracker tracker, final ReferenceContext ref, final Map stratifiedContexts, VariantContext vc) { Map infoAnnotations = new LinkedHashMap(vc.getAttributes()); // annotate db occurrences @@ -192,6 +197,20 @@ public class VariantAnnotatorEngine { return builder.genotypes(annotateGenotypes(tracker, ref, stratifiedContexts, vc)).make(); } + public VariantContext annotateContext(final Map>> stratifiedContexts, VariantContext vc) { + Map infoAnnotations = new LinkedHashMap(vc.getAttributes()); + + // go through all the requested info annotationTypes + for ( InfoFieldAnnotation annotationType : requestedInfoAnnotations ) { + Map annotationsFromCurrentType = ((ActiveRegionBasedAnnotation)annotationType).annotate(stratifiedContexts, vc); + if ( annotationsFromCurrentType != null ) + infoAnnotations.putAll(annotationsFromCurrentType); + } + + // generate a new annotated VC + return new VariantContextBuilder(vc).attributes(infoAnnotations).make(); + } + private VariantContext annotateDBs(RefMetaDataTracker tracker, ReferenceContext ref, VariantContext vc, Map infoAnnotations) { for ( Map.Entry, String> dbSet : dbAnnotations.entrySet() ) { if ( dbSet.getValue().equals(VCFConstants.DBSNP_KEY) ) { diff --git 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/ActiveRegionBasedAnnotation.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/ActiveRegionBasedAnnotation.java new file mode 100755 index 000000000..de61c7741 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/ActiveRegionBasedAnnotation.java @@ -0,0 +1,18 @@ +package org.broadinstitute.sting.gatk.walkers.annotator.interfaces; + +import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.variantcontext.Allele; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; + +import java.util.List; +import java.util.Map; + +// TODO -- make this an abstract class when we move away from InfoFieldAnnotation +public interface ActiveRegionBasedAnnotation extends AnnotationType { + // return annotations for the given contexts split by sample and then allele + public abstract Map annotate(final Map>> stratifiedContexts, final VariantContext vc); + + // return the descriptions used for the VCF INFO meta field + public abstract List getDescriptions(); +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGatherer.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGatherer.java new file mode 100755 index 000000000..d91ddd221 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGatherer.java @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2011 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to 
permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.bqsr; + +import org.broadinstitute.sting.commandline.Gatherer; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.PrintStream; +import java.util.List; + +/** + * User: carneiro + * Date: 3/29/11 + */ + + +public class BQSRGatherer extends Gatherer { + + private static final String EMPTY_INPUT_LIST = "list of inputs files is empty"; + private static final String MISSING_OUTPUT_FILE = "missing output file name"; + + @Override + public void gather(List inputs, File output) { + RecalibrationReport generalReport = null; + PrintStream outputFile; + try { + outputFile = new PrintStream(output); + } catch(FileNotFoundException e) { + throw new UserException.MissingArgument("output", MISSING_OUTPUT_FILE); + } + + for (File input : inputs) { + RecalibrationReport inputReport = new RecalibrationReport(input); + if (generalReport == null) + generalReport = inputReport; + else + generalReport.combine(inputReport); + } + if (generalReport == null) + throw new ReviewedStingException(EMPTY_INPUT_LIST); + + 
generalReport.calculateEmpiricalAndQuantizedQualities(); + + RecalibrationArgumentCollection RAC = generalReport.getRAC(); + if (RAC.recalibrationReport != null && !RAC.NO_PLOTS) { + File recal_out = new File(output.getName() + ".original"); + RecalibrationReport originalReport = new RecalibrationReport(RAC.recalibrationReport); + RecalDataManager.generateRecalibrationPlot(recal_out, originalReport.getKeysAndTablesMap(), generalReport.getKeysAndTablesMap(), RAC.KEEP_INTERMEDIATE_FILES); + } + else if (!RAC.NO_PLOTS) { + File recal_out = new File(output.getName() + ".recal"); + RecalDataManager.generateRecalibrationPlot(recal_out, generalReport.getKeysAndTablesMap(), RAC.KEEP_INTERMEDIATE_FILES); + } + + generalReport.output(outputFile); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRKeyManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRKeyManager.java new file mode 100644 index 000000000..1cb02f1c1 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRKeyManager.java @@ -0,0 +1,341 @@ +package org.broadinstitute.sting.gatk.walkers.bqsr; + +import org.broadinstitute.sting.utils.BitSetUtils; + +import java.util.*; + +/** + * This class provides all the functionality for the BitSet representation of the keys to the hash table of BQSR + * + * It also handles the event type "covariate" which is not exactly a covariate, but is added as a key to the hashmap. The Key Manager will + * add the event type as a bitset to the end of the covariate bitset key. This way, it won't get int the way of masking the information + * out of the key for the actual covariates, and having the covariates handle it. The key manager handles the event type. 
+ * + * The keys represented by this key manager will always have the same order: + * + * RequiredCovariate1, RequiredCovariate2, ..., RequiredCovariateN, OptionalCovariate1, OptionalCovariateID, EventType + * RequiredCovariate1, RequiredCovariate2, ..., RequiredCovariateN, OptionalCovariate2, OptionalCovariateID, EventType + * ... + * RequiredCovariate1, RequiredCovariate2, ..., RequiredCovariateN, OptionalCovariateN, OptionalCovariateID, EventType + * + * + * Note that Optional Covariates are optional, and the Key Manager should operate without them if necessary. + * + * @author Mauricio Carneiro + * @since 3/6/12 + */ +public class BQSRKeyManager { + private final List requiredCovariates; + private final List optionalCovariates; + private final Map covariateNameToIDMap; + + private int nRequiredBits; // Number of bits used to represent the required covariates + private int nOptionalBits; // Number of bits used to represent the standard covaraites + private final int nOptionalIDBits; // Number of bits used to represent the optional covariates IDs + private final int totalNumberOfBits; // Sum of all of the above plus the event bits + + private final BitSet optionalCovariateMask; // Standard mask for optional covariates bitset + private final BitSet optionalCovariateIDMask; // Standard mask for optional covariates order bitset + + /** + * Initializes the KeyManager with the total number of covariates to use + * + * @param requiredCovariates the ordered list of required covariates + * @param optionalCovariates the ordered list of optional covariates + */ + public BQSRKeyManager(List requiredCovariates, List optionalCovariates) { + this.requiredCovariates = new ArrayList(requiredCovariates.size()); // initialize the required covariates list + this.optionalCovariates = new ArrayList(optionalCovariates.size()); // initialize the optional covariates list (size may be 0, it's okay) + this.covariateNameToIDMap = new HashMap(optionalCovariates.size()*2); // the map from 
covariate name to covariate id (when reading GATK Reports, we get the IDs as names of covariates) + + nRequiredBits = 0; + for (Covariate required : requiredCovariates) { // create a list of required covariates with the extra information for key management + int nBits = required.numberOfBits(); // number of bits used by this covariate + BitSet mask = genericMask(nRequiredBits, nBits); // create a mask for this covariate + this.requiredCovariates.add(new RequiredCovariateInfo(nRequiredBits, mask, required)); // Create an object for this required covariate + nRequiredBits += nBits; + } + + short id = 0; + nOptionalBits = 0; + for (Covariate optional : optionalCovariates) { + int nBits = optional.numberOfBits(); // number of bits used by this covariate + nOptionalBits = Math.max(nOptionalBits, nBits); // optional covariates are represented by the number of bits needed by biggest covariate + BitSet optionalID = BitSetUtils.bitSetFrom(id); // calculate the optional covariate ID for this covariate + this.optionalCovariates.add(new OptionalCovariateInfo(optionalID, optional)); // optional covariates have standardized mask and number of bits, so no need to store in the RequiredCovariateInfo object + String covariateName = optional.getClass().getSimpleName().split("Covariate")[0]; // get the name of the covariate (without the "covariate" part of it) so we can match with the GATKReport + this.covariateNameToIDMap.put(covariateName, id); + id++; + } + + nOptionalIDBits = BitSetUtils.numberOfBitsToRepresent(optionalCovariates.size()); // number of bits used to represent the covariate ID + optionalCovariateMask = genericMask(nRequiredBits, nOptionalBits); // the generic mask to extract optional covariate bits from the combined bitset + optionalCovariateIDMask = genericMask(nRequiredBits + nOptionalBits, nOptionalIDBits); // the generic mask to extract optional covariate ID bits from the combined bitset + totalNumberOfBits = nRequiredBits + nOptionalBits + nOptionalIDBits + 
bitsInEventType(); // total number of bits used in the final key + } + + /** + * Generates one key per optional covariate. + * + * Keys include all required covariates, the standard covariate and the event type. + * + * Example allKeys: + * RG, QUAL, CYCLE, CONTEXT + * + * List of BitSets returned by this example (given eventType): + * RG, QUAL, CYCLE, EVENT + * RG, QUAL, CONTEXT, EVENT + * + * Note: If there are no optional covariates, only one bitset key will be returned with all the required covariates and the event type + * + * @param allKeys The keys in bitset representation for each covariate + * @param eventType The type of event described by this keyset (e.g. mismatches, insertions, deletions) + * @return one key in bitset representation per covariate + */ + public List bitSetsFromAllKeys(BitSet[] allKeys, EventType eventType) { + List allBitSets = new LinkedList(); // Generate one key per optional covariate + + BitSet eventBitSet = BitSetUtils.bitSetFrom(eventType.index); // create a bitset with the event type + int eventTypeBitIndex = nRequiredBits + nOptionalBits + nOptionalIDBits; // Location in the bit set to add the event type bits + + int covariateIndex = 0; + BitSet requiredKey = new BitSet(nRequiredBits); // This will be a bitset holding all the required keys, to replicate later on + for (RequiredCovariateInfo infoRequired : requiredCovariates) + addBitSetToKeyAtLocation(requiredKey, allKeys[covariateIndex++], infoRequired.bitsBefore); // Add all the required covariates to the key set + + for (OptionalCovariateInfo infoOptional : optionalCovariates) { + BitSet covariateKey = allKeys[covariateIndex++]; // get the bitset from all keys + if (covariateKey == null) + continue; // do not add nulls to the final set of keys. 
+ + BitSet optionalKey = new BitSet(totalNumberOfBits); // create a new key for this optional covariate + optionalKey.or(requiredKey); // import all the required covariates + addBitSetToKeyAtLocation(optionalKey, covariateKey, nRequiredBits); // add the optional covariate right after the required covariates + addBitSetToKeyAtLocation(optionalKey, infoOptional.covariateID, nRequiredBits + nOptionalBits); // add the optional covariate ID right after the optional covarite + addBitSetToKeyAtLocation(optionalKey, eventBitSet, eventTypeBitIndex); // Add the event type + allBitSets.add(optionalKey); // add this key to the list of keys + } + + if (optionalCovariates.size() == 0) { // special case when we have no optional covariates, add the event type to the required key (our only key) + addBitSetToKeyAtLocation(requiredKey, eventBitSet, eventTypeBitIndex); // Add the event type + allBitSets.add(requiredKey); // add this key to the list of keys + } + + return allBitSets; + } + + /** + * Generates one bitset key for the covariates represented in Object[] key + * + * The covariates will have the actual objects produced by the covariates (probably read from the recalibration data file) + * and will contain all required covariates and one (or none) optional covariates. Therefore, the product is one bitset key, not many. + * + * Example key: + * RG, QUAL, CYCLE, CYCLE_ID, EventType + * + * @param key list of objects produced by the required covariates followed by one or zero optional covariates. + * @return a bitset key representing these objects. Bitset encryption is done using the covariate's interface. 
+ */ + public BitSet bitSetFromKey(Object[] key) { + BitSet bitSetKey = new BitSet(totalNumberOfBits); + + int requiredCovariate = 0; + for (RequiredCovariateInfo infoRequired : requiredCovariates) { + BitSet covariateBitSet = infoRequired.covariate.bitSetFromKey(key[requiredCovariate++]); // create a bitset from the object key provided using the required covariate's interface + addBitSetToKeyAtLocation(bitSetKey, covariateBitSet, infoRequired.bitsBefore); // add it to the bitset key + } + + if (optionalCovariates.size() > 0) { + int optionalCovariate = requiredCovariates.size(); // the optional covariate index in the key array + int covariateIDIndex = optionalCovariate + 1; // the optional covariate ID index is right after the optional covariate's + int covariateID = parseCovariateID(key[covariateIDIndex]); // when reading the GATK Report the ID may come in a String instead of an index + OptionalCovariateInfo infoOptional = optionalCovariates.get(covariateID); // so we can get the optional covariate information + + BitSet covariateBitSet = infoOptional.covariate.bitSetFromKey(key[optionalCovariate]); // convert the optional covariate key into a bitset using the covariate's interface + addBitSetToKeyAtLocation(bitSetKey, covariateBitSet, nRequiredBits); // add the optional covariate right after the required covariates + addBitSetToKeyAtLocation(bitSetKey, infoOptional.covariateID, nRequiredBits + nOptionalBits); // add the optional covariate ID right after the optional covarite + } + + int eventIndex = key.length - 1; // the event type is always the last key + int eventTypeBitIndex = nRequiredBits + nOptionalBits + nOptionalIDBits; // location in the bit set to add the event type bits + BitSet eventBitSet = bitSetFromEvent((EventType) key[eventIndex]); // get the bit set representation of the event type + addBitSetToKeyAtLocation(bitSetKey, eventBitSet, eventTypeBitIndex); // add the event type + + return bitSetKey; + } + + /** + * Covariate id can be either the 
covariate name (String) or the actual id (short). This method + * finds it's type and converts accordingly to the short notation. + * + * @param id the string or short representation of the optional covariate id + * @return the short representation of the optional covariate id. + */ + private short parseCovariateID(Object id) { + return (id instanceof String) ? covariateNameToIDMap.get(id.toString()) : (Short) id; + } + + /** + * Generates a key set of objects from a combined bitset key. + * + * Masks out each covariate independently and decodes their values (Object) into a keyset + * + * @param key the bitset representation of the keys + * @return an object array with the values for each key + */ + public List keySetFrom(BitSet key) { + List objectKeys = new ArrayList(); + for (RequiredCovariateInfo info : requiredCovariates) { + BitSet covariateBitSet = extractBitSetFromKey(key, info.mask, info.bitsBefore); // get the covariate's bitset + objectKeys.add(info.covariate.keyFromBitSet(covariateBitSet)); // convert the bitset to object using covariate's interface + } + + if (optionalCovariates.size() > 0) { + BitSet covBitSet = extractBitSetFromKey(key, optionalCovariateMask, nRequiredBits); // mask out the covariate bit set + BitSet idbs = extractBitSetFromKey(key, optionalCovariateIDMask, nRequiredBits + nOptionalBits); // mask out the covariate order (to identify which covariate this is) + short id = BitSetUtils.shortFrom(idbs); // covert the id bitset into a short + Covariate covariate = optionalCovariates.get(id).covariate; // get the corresponding optional covariate object + objectKeys.add(covariate.keyFromBitSet(covBitSet)); // add the optional covariate to the key set + objectKeys.add(covariate.getClass().getSimpleName().split("Covariate")[0]); // add the covariate name using the id + } + objectKeys.add(eventFromBitSet(key)); // add the event type object to the key set + + return objectKeys; + } + + public List getRequiredCovariates() { + ArrayList list = new 
ArrayList(requiredCovariates.size()); + for (RequiredCovariateInfo info : requiredCovariates) + list.add(info.covariate); + return list; + } + + public List getOptionalCovariates() { + ArrayList list = new ArrayList(optionalCovariates.size()); + for (OptionalCovariateInfo info : optionalCovariates) + list.add(info.covariate); + return list; + } + + /** + * Translates a masked bitset into a bitset starting at 0 + * + * @param key the masked out bitset + * @param n the number of bits to chop + * @return a translated bitset starting at 0 for the covariate machinery to decode + */ + private BitSet chopNBitsFrom(BitSet key, int n) { + BitSet choppedKey = new BitSet(); + for (int i = key.nextSetBit(0); i >= 0; i = key.nextSetBit(i + 1)) + choppedKey.set(i - n); // Set every bit translocated to the beginning of the BitSet + return choppedKey; + } + + /** + * Creates a mask for the requested covariate to extract the relevant bitset from a combined bitset key + * + * @param leadingBits the index of the covariate in the ordered covariate list + * @param nBits the number of bits needed by the Covariate to represent its values in BitSet form + * @return the bitset relevant to the covariate + */ + + private BitSet genericMask(int leadingBits, int nBits) { + BitSet mask = new BitSet(leadingBits + nBits); + mask.set(leadingBits, leadingBits + nBits); + return mask; + } + + /** + * Decodes the event type (enum) from the full bitset key + * + * @param fullKey the full key of all covariates + event type + * @return the decoded event type. 
+ */ + private EventType eventFromBitSet(BitSet fullKey) { + BitSet eventKey = new BitSet(); + int firstBitIndex = nRequiredBits + nOptionalBits + nOptionalIDBits; + for (int i = fullKey.nextSetBit(firstBitIndex); i >= 0; i = fullKey.nextSetBit(i + 1)) + eventKey.set(i - firstBitIndex); + return EventType.eventFrom(BitSetUtils.shortFrom(eventKey)); + } + + private BitSet bitSetFromEvent(EventType eventType) { + return BitSetUtils.bitSetFrom(eventType.index); + } + + private int bitsInEventType() { + return BitSetUtils.numberOfBitsToRepresent(EventType.values().length); + } + + private void addBitSetToKeyAtLocation(BitSet key, BitSet bitSet, int location) { + for (int j = bitSet.nextSetBit(0); j >= 0; j = bitSet.nextSetBit(j + 1)) + key.set(j + location); // translate the bits set in the key to their corresponding position in the full key + } + + private BitSet extractBitSetFromKey (BitSet key, BitSet mask, int leadingBits) { + BitSet bitSet = (BitSet) key.clone(); + bitSet.and(mask); + return chopNBitsFrom(bitSet, leadingBits); + } + + @Override + public boolean equals(Object o) { + if (!(o instanceof BQSRKeyManager)) + return false; + + BQSRKeyManager other = (BQSRKeyManager) o; + if (this == other) + return true; + + if (requiredCovariates.size() != other.requiredCovariates.size() || optionalCovariates.size() != other.optionalCovariates.size()) + return false; + + Iterator otherRequiredIterator = other.requiredCovariates.iterator(); + for (RequiredCovariateInfo thisInfo: requiredCovariates) { + RequiredCovariateInfo otherInfo = otherRequiredIterator.next(); + + String thisName = thisInfo.covariate.getClass().getSimpleName(); + String otherName = otherInfo.covariate.getClass().getSimpleName(); + if (!thisName.equals(otherName)) + return false; + } + + Iterator otherOptionalIterator = other.optionalCovariates.iterator(); + for (OptionalCovariateInfo thisInfo : optionalCovariates) { + OptionalCovariateInfo otherInfo = otherOptionalIterator.next(); + String thisName 
= thisInfo.covariate.getClass().getSimpleName(); + String otherName = otherInfo.covariate.getClass().getSimpleName(); + if (!thisName.equals(otherName)) + return false; + } + + return true; + } + + + /** + * Aggregate information for each Covariate + */ + class RequiredCovariateInfo { + public final int bitsBefore; // number of bits before this covariate in the combined bitset key + public final BitSet mask; // the mask to pull out this covariate from the combined bitset key ( a mask made from bitsBefore and nBits ) + public final Covariate covariate; // this allows reverse lookup of the Covariates in order + + RequiredCovariateInfo(int bitsBefore, BitSet mask, Covariate covariate) { + this.bitsBefore = bitsBefore; + this.mask = mask; + this.covariate = covariate; + } + } + + class OptionalCovariateInfo { + public final BitSet covariateID; // cache the covariate ID + public final Covariate covariate; + + OptionalCovariateInfo(BitSet covariateID, Covariate covariate) { + this.covariateID = covariateID; + this.covariate = covariate; + } + } + +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariate.java index a1ab73341..c5aabc64d 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariate.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariate.java @@ -26,7 +26,9 @@ package org.broadinstitute.sting.gatk.walkers.bqsr; import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.BitSetUtils; +import org.broadinstitute.sting.utils.clipping.ClippingRepresentation; +import org.broadinstitute.sting.utils.clipping.ReadClipper; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -43,7 +45,9 @@ public class ContextCovariate implements StandardCovariate { 
private int mismatchesContextSize; private int insertionsContextSize; - private int deletionsContextSize; + private int deletionsContextSize; + + private byte LOW_QUAL_TAIL; // Initialize any member variables using the command-line arguments passed to the walkers @Override @@ -52,18 +56,22 @@ public class ContextCovariate implements StandardCovariate { insertionsContextSize = RAC.INSERTIONS_CONTEXT_SIZE; deletionsContextSize = RAC.DELETIONS_CONTEXT_SIZE; + LOW_QUAL_TAIL = RAC.LOW_QUAL_TAIL; + if (mismatchesContextSize <= 0 || insertionsContextSize <= 0 || deletionsContextSize <= 0) throw new UserException(String.format("Context Size must be positive, if you don't want to use the context covariate, just turn it off instead. Mismatches: %d Insertions: %d Deletions:%d", mismatchesContextSize, insertionsContextSize, deletionsContextSize)); } @Override - public CovariateValues getValues(final GATKSAMRecord read) { + public CovariateValues getValues(GATKSAMRecord read) { int l = read.getReadLength(); BitSet[] mismatches = new BitSet[l]; BitSet[] insertions = new BitSet[l]; - BitSet[] deletions = new BitSet[l]; + BitSet[] deletions = new BitSet[l]; + read = ReadClipper.clipLowQualEnds(read, LOW_QUAL_TAIL, ClippingRepresentation.WRITE_NS); // Write N's over the low quality tail of the reads to avoid adding them into the context + final boolean negativeStrand = read.getReadNegativeStrandFlag(); byte[] bases = read.getReadBases(); if (negativeStrand) @@ -72,7 +80,7 @@ public class ContextCovariate implements StandardCovariate { for (int i = 0; i < read.getReadLength(); i++) { mismatches[i] = contextWith(bases, i, mismatchesContextSize); insertions[i] = contextWith(bases, i, insertionsContextSize); - deletions[i] = contextWith(bases, i, deletionsContextSize); + deletions[i] = contextWith(bases, i, deletionsContextSize); } if (negativeStrand) { @@ -89,24 +97,41 @@ public class ContextCovariate implements StandardCovariate { return str; } + @Override + public String 
keyFromBitSet(BitSet key) { + if (key == null) // this can only happen in test routines because we do not propagate null keys to the csv file + return null; + + return BitSetUtils.dnaFrom(key); + } + + @Override + public BitSet bitSetFromKey(Object key) { + return BitSetUtils.bitSetFrom((String) key); + } + + @Override + public int numberOfBits() { + return Long.bitCount(-1L); + } + /** - * calculates the context of a base independent of the covariate mode + * calculates the context of a base independent of the covariate mode (mismatch, insertion or deletion) * - * @param bases the bases in the read to build the context from - * @param offset the position in the read to calculate the context for - * @param contextSize context size to use building the context - * @return + * @param bases the bases in the read to build the context from + * @param offset the position in the read to calculate the context for + * @param contextSize context size to use building the context + * @return the bitSet representing the Context */ - private BitSet contextWith(byte [] bases, int offset, int contextSize) { - if (offset < contextSize) - return null; - - String context = new String(Arrays.copyOfRange(bases, offset - contextSize, offset)); - if (context.contains("N")) - return null; - - return MathUtils.bitSetFrom(context); - } + private BitSet contextWith(byte[] bases, int offset, int contextSize) { + BitSet result = null; + if (offset - contextSize + 1 >= 0) { + String context = new String(Arrays.copyOfRange(bases, offset - contextSize + 1, offset + 1)); + if (!context.contains("N")) + result = BitSetUtils.bitSetFrom(context); + } + return result; + } /** * Reverses the given array in place. 
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/Covariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/Covariate.java index 80d8cff5d..6b872a50c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/Covariate.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/Covariate.java @@ -2,6 +2,8 @@ package org.broadinstitute.sting.gatk.walkers.bqsr; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import java.util.BitSet; + /* * Copyright (c) 2009 The Broad Institute * @@ -53,7 +55,40 @@ public interface Covariate { */ public CovariateValues getValues(GATKSAMRecord read); - public Object getValue(String str); // Used to get the covariate's value from input csv file during on-the-fly recalibration + /** + * Used to get the covariate's value from input csv file during on-the-fly recalibration + * + * @param str the key in string type (read from the csv) + * @return the key in it's correct type. + */ + public Object getValue(String str); + + /** + * Converts the bitset representation of the key (used internally for table indexing) to String format for file output. + * + * @param key the bitset representation of the key + * @return a string representation of the key + */ + public String keyFromBitSet(BitSet key); + + /** + * Converts a key into a bitset + * + * Only necessary for on-the-fly recalibration when you have the object, but need to store it in memory in bitset format. For counting covariates + * the getValues method already returns all values in BitSet format. + * + * @param key the object corresponding to the covariate + * @return a bitset representation of the object + */ + public BitSet bitSetFromKey(Object key); + + /** + * Each covariate should determine how many bits are necessary to encode it's data + * + * @return The number of bits used to represent the values of this covariate. 
+ */ + public int numberOfBits(); + } interface RequiredCovariate extends Covariate {} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/CovariateKeySet.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/CovariateKeySet.java deleted file mode 100644 index 1b62160a3..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/CovariateKeySet.java +++ /dev/null @@ -1,88 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.bqsr; - -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; - -/** - * The object temporarily held by a read that describes all of it's covariates. - * - * In essence, this is an array of CovariateValues, but it also has some functionality to deal with the optimizations of the NestedHashMap - * - * @author Mauricio Carneiro - * @since 2/8/12 - */ -public class CovariateKeySet { - private Object[][] mismatchesKeySet; - private Object[][] insertionsKeySet; - private Object[][] deletionsKeySet; - - private int nextCovariateIndex; - - private static String mismatchesCovariateName = "M"; - private static String insertionsCovariateName = "I"; - private static String deletionsCovariateName = "D"; - - public CovariateKeySet(int readLength, int numberOfCovariates) { - numberOfCovariates++; // +1 because we are adding the mismatch covariate (to comply with the molten table format) - this.mismatchesKeySet = new Object[readLength][numberOfCovariates]; - this.insertionsKeySet = new Object[readLength][numberOfCovariates]; - this.deletionsKeySet = new Object[readLength][numberOfCovariates]; - initializeCovariateKeySet(this.mismatchesKeySet, mismatchesCovariateName); - initializeCovariateKeySet(this.insertionsKeySet, insertionsCovariateName); - initializeCovariateKeySet(this.deletionsKeySet, deletionsCovariateName); - this.nextCovariateIndex = 0; - } - - public void addCovariate(CovariateValues covariate) { - transposeCovariateValues(mismatchesKeySet, covariate.getMismatches()); - 
transposeCovariateValues(insertionsKeySet, covariate.getInsertions()); - transposeCovariateValues(deletionsKeySet, covariate.getDeletions()); - nextCovariateIndex++; - } - - public static RecalDataManager.BaseRecalibrationType getErrorModelFromString(final String modelString) { - if (modelString.equals(mismatchesCovariateName)) - return RecalDataManager.BaseRecalibrationType.BASE_SUBSTITUTION; - else if (modelString.equals(insertionsCovariateName)) - return RecalDataManager.BaseRecalibrationType.BASE_INSERTION; - else if (modelString.equals(deletionsCovariateName)) - return RecalDataManager.BaseRecalibrationType.BASE_DELETION; - throw new ReviewedStingException("Unrecognized Base Recalibration model string: " + modelString); - } - - public Object[] getKeySet(final int readPosition, final RecalDataManager.BaseRecalibrationType errorModel) { - switch (errorModel) { - case BASE_SUBSTITUTION: - return getMismatchesKeySet(readPosition); - case BASE_INSERTION: - return getInsertionsKeySet(readPosition); - case BASE_DELETION: - return getDeletionsKeySet(readPosition); - default: - throw new ReviewedStingException("Unrecognized Base Recalibration type: " + errorModel ); - } - } - - public Object[] getMismatchesKeySet(int readPosition) { - return mismatchesKeySet[readPosition]; - } - - public Object[] getInsertionsKeySet(int readPosition) { - return insertionsKeySet[readPosition]; - } - - public Object[] getDeletionsKeySet(int readPosition) { - return deletionsKeySet[readPosition]; - } - - private void transposeCovariateValues (Object [][] keySet, Object [] covariateValues) { - for (int i=0; iMAX_CYCLE) ? null : BitSetUtils.bitSetFrom(cycle); cycle += increment; } } @@ -119,7 +100,7 @@ public class CycleCovariate implements StandardCovariate { // the current sequential model would consider the effects independently instead of jointly. final boolean multiplyByNegative1 = read.getReadPairedFlag() && read.getSecondOfPairFlag(); - int cycle = multiplyByNegative1 ? 
-1 : 1; + short cycle = multiplyByNegative1 ? (short) -1 : 1; // todo -- check if this is the right behavior for mate paired reads in flow cycle platforms. // BUGBUG: Consider looking at degradation of base quality scores in homopolymer runs to detect when the cycle incremented even though the nucleotide didn't change // For example, AAAAAAA was probably read in two flow cycles but here we count it as one @@ -127,19 +108,19 @@ public class CycleCovariate implements StandardCovariate { int iii = 0; while (iii < readLength) { while (iii < readLength && bases[iii] == (byte) 'T') { - cycles[iii] = cycle; + cycles[iii] = BitSetUtils.bitSetFrom(cycle); iii++; } while (iii < readLength && bases[iii] == (byte) 'A') { - cycles[iii] = cycle; + cycles[iii] = BitSetUtils.bitSetFrom(cycle); iii++; } while (iii < readLength && bases[iii] == (byte) 'C') { - cycles[iii] = cycle; + cycles[iii] = BitSetUtils.bitSetFrom(cycle); iii++; } while (iii < readLength && bases[iii] == (byte) 'G') { - cycles[iii] = cycle; + cycles[iii] = BitSetUtils.bitSetFrom(cycle); iii++; } if (iii < readLength) { @@ -149,7 +130,7 @@ public class CycleCovariate implements StandardCovariate { cycle++; } if (iii < readLength && !BaseUtils.isRegularBase(bases[iii])) { - cycles[iii] = cycle; + cycles[iii] = BitSetUtils.bitSetFrom(cycle); iii++; } @@ -159,19 +140,19 @@ public class CycleCovariate implements StandardCovariate { int iii = readLength - 1; while (iii >= 0) { while (iii >= 0 && bases[iii] == (byte) 'T') { - cycles[iii] = cycle; + cycles[iii] = BitSetUtils.bitSetFrom(cycle); iii--; } while (iii >= 0 && bases[iii] == (byte) 'A') { - cycles[iii] = cycle; + cycles[iii] = BitSetUtils.bitSetFrom(cycle); iii--; } while (iii >= 0 && bases[iii] == (byte) 'C') { - cycles[iii] = cycle; + cycles[iii] = BitSetUtils.bitSetFrom(cycle); iii--; } while (iii >= 0 && bases[iii] == (byte) 'G') { - cycles[iii] = cycle; + cycles[iii] = BitSetUtils.bitSetFrom(cycle); iii--; } if (iii >= 0) { @@ -181,7 +162,7 @@ public 
class CycleCovariate implements StandardCovariate { cycle++; } if (iii >= 0 && !BaseUtils.isRegularBase(bases[iii])) { - cycles[iii] = cycle; + cycles[iii] = BitSetUtils.bitSetFrom(cycle); iii--; } } @@ -192,13 +173,28 @@ public class CycleCovariate implements StandardCovariate { else { throw new UserException("The platform (" + read.getReadGroup().getPlatform() + ") associated with read group " + read.getReadGroup() + " is not a recognized platform. Implemented options are e.g. illumina, 454, and solid"); } - + return new CovariateValues(cycles, cycles, cycles); } // Used to get the covariate's value from input csv file during on-the-fly recalibration @Override public final Object getValue(final String str) { - return Integer.parseInt(str); + return Short.parseShort(str); + } + + @Override + public String keyFromBitSet(BitSet key) { + return String.format("%d", BitSetUtils.shortFrom(key)); + } + + @Override + public BitSet bitSetFromKey(Object key) { + return (key instanceof String) ? BitSetUtils.bitSetFrom(Short.parseShort((String) key)) : BitSetUtils.bitSetFrom((Short) key); + } + + @Override + public int numberOfBits() { + return BitSetUtils.numberOfBitsToRepresent(2 * Short.MAX_VALUE); // positive and negative } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDatumOptimized.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/Datum.java old mode 100755 new mode 100644 similarity index 65% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDatumOptimized.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/Datum.java index 233380820..779500512 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDatumOptimized.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/Datum.java @@ -2,8 +2,6 @@ package org.broadinstitute.sting.gatk.walkers.bqsr; import org.broadinstitute.sting.utils.QualityUtils; -import 
java.util.List; - /* * Copyright (c) 2010 The Broad Institute * @@ -38,10 +36,13 @@ import java.util.List; * Each bin counts up the number of observations and the number of reference mismatches seen for that combination of covariates. */ -public class RecalDatumOptimized { +public class Datum { + + long numObservations; // number of bases seen in total + long numMismatches; // number of bases seen that didn't match the reference + + private static final int SMOOTHING_CONSTANT = 1; // used when calculating empirical qualities to avoid division by zero - protected long numObservations; // number of bases seen in total - protected long numMismatches; // number of bases seen that didn't match the reference //--------------------------------------------------------------------------------------------------------------- // @@ -49,19 +50,14 @@ public class RecalDatumOptimized { // //--------------------------------------------------------------------------------------------------------------- - public RecalDatumOptimized() { + public Datum() { numObservations = 0L; numMismatches = 0L; } - public RecalDatumOptimized(final long _numObservations, final long _numMismatches) { - numObservations = _numObservations; - numMismatches = _numMismatches; - } - - public RecalDatumOptimized(final RecalDatumOptimized copy) { - this.numObservations = copy.numObservations; - this.numMismatches = copy.numMismatches; + public Datum(long numObservations, long numMismatches) { + this.numObservations = numObservations; + this.numMismatches = numMismatches; } //--------------------------------------------------------------------------------------------------------------- @@ -70,46 +66,40 @@ public class RecalDatumOptimized { // //--------------------------------------------------------------------------------------------------------------- - public synchronized final void increment(final long incObservations, final long incMismatches) { + synchronized void increment(final long incObservations, 
final long incMismatches) { numObservations += incObservations; numMismatches += incMismatches; } - public synchronized final void increment(final RecalDatumOptimized other) { - increment(other.numObservations, other.numMismatches); - } - - public synchronized final void increment(final List data) { - for (RecalDatumOptimized other : data) { - this.increment(other); - } - } - //--------------------------------------------------------------------------------------------------------------- // // methods to derive empirical quality score // //--------------------------------------------------------------------------------------------------------------- - public final double empiricalQualDouble(final int smoothing, final double maxQual) { - final double doubleMismatches = (double) (numMismatches + smoothing); - final double doubleObservations = (double) (numObservations + smoothing); - double empiricalQual = -10 * Math.log10(doubleMismatches / doubleObservations); - return Math.min(empiricalQual, maxQual); + double empiricalQualDouble() { + final double doubleMismatches = (double) (numMismatches + SMOOTHING_CONSTANT); + final double doubleObservations = (double) (numObservations + SMOOTHING_CONSTANT); + double empiricalQual = -10 * Math.log10(doubleMismatches / doubleObservations); + return Math.min(empiricalQual, (double) QualityUtils.MAX_RECALIBRATED_Q_SCORE); } - public final byte empiricalQualByte(final int smoothing) { - final double doubleMismatches = (double) (numMismatches + smoothing); - final double doubleObservations = (double) (numObservations + smoothing); - return QualityUtils.probToQual(1.0 - doubleMismatches / doubleObservations); // This is capped at Q40 + byte empiricalQualByte() { + final double doubleMismatches = (double) (numMismatches); + final double doubleObservations = (double) (numObservations); + return QualityUtils.probToQual(1.0 - doubleMismatches / doubleObservations); // This is capped at Q40 } - public final byte empiricalQualByte() { - 
return empiricalQualByte(0); // 'default' behavior is to use smoothing value of zero - } - - public final String outputToCSV() { + @Override + public String toString() { return String.format("%d,%d,%d", numObservations, numMismatches, (int) empiricalQualByte()); } + @Override + public boolean equals(Object o) { + if (!(o instanceof Datum)) + return false; + Datum other = (Datum) o; + return numMismatches == other.numMismatches && numObservations == other.numObservations; + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/EventType.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/EventType.java new file mode 100644 index 000000000..6d004edb1 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/EventType.java @@ -0,0 +1,43 @@ +package org.broadinstitute.sting.gatk.walkers.bqsr; + +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +public enum EventType { + BASE_SUBSTITUTION(0, "M"), + BASE_INSERTION(1, "I"), + BASE_DELETION(2, "D"); + + public final int index; + private final String representation; + + private EventType(int index, String representation) { + this.index = index; + this.representation = representation; + } + + public static EventType eventFrom(int index) { + switch (index) { + case 0: + return BASE_SUBSTITUTION; + case 1: + return BASE_INSERTION; + case 2: + return BASE_DELETION; + default: + throw new ReviewedStingException(String.format("Event %d does not exist.", index)); + } + } + + public static EventType eventFrom(String event) { + for (EventType eventType : EventType.values()) + if (eventType.representation.equals(event)) + return eventType; + + throw new ReviewedStingException(String.format("Event %s does not exist.", event)); + } + + @Override + public String toString() { + return representation; + } +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/QualityScoreCovariate.java 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/QualityScoreCovariate.java index 373210bdb..4100eb8bb 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/QualityScoreCovariate.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/QualityScoreCovariate.java @@ -1,7 +1,11 @@ package org.broadinstitute.sting.gatk.walkers.bqsr; +import org.broadinstitute.sting.utils.BitSetUtils; +import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import java.util.BitSet; + /* * Copyright (c) 2009 The Broad Institute * @@ -46,18 +50,18 @@ public class QualityScoreCovariate implements RequiredCovariate { public CovariateValues getValues(final GATKSAMRecord read) { int readLength = read.getReadLength(); - Integer [] mismatches = new Integer[readLength]; - Integer [] insertions = new Integer[readLength]; - Integer [] deletions = new Integer[readLength]; + BitSet[] mismatches = new BitSet[readLength]; + BitSet[] insertions = new BitSet[readLength]; + BitSet[] deletions = new BitSet[readLength]; - byte [] baseQualities = read.getBaseQualities(); - byte [] baseInsertionQualities = read.getBaseInsertionQualities(); - byte [] baseDeletionQualities = read.getBaseDeletionQualities(); + byte[] baseQualities = read.getBaseQualities(); + byte[] baseInsertionQualities = read.getBaseInsertionQualities(); + byte[] baseDeletionQualities = read.getBaseDeletionQualities(); - for (int i=0; i quantizedQuals; + private List empiricalQualCounts; + private int quantizationLevels; + + private QuantizationInfo(List quantizedQuals, List empiricalQualCounts, int quantizationLevels) { + this.quantizedQuals = quantizedQuals; + this.empiricalQualCounts = empiricalQualCounts; + this.quantizationLevels = quantizationLevels; + } + + public QuantizationInfo(List quantizedQuals, List empiricalQualCounts) { + this(quantizedQuals, empiricalQualCounts, calculateQuantizationLevels(quantizedQuals)); + } + + public 
QuantizationInfo(Map> keysAndTablesMap, int quantizationLevels) { + final Long [] qualHistogram = new Long[QualityUtils.MAX_QUAL_SCORE+1]; // create a histogram with the empirical quality distribution + for (int i = 0; i < qualHistogram.length; i++) + qualHistogram[i] = 0L; + + Map qualTable = null; // look for the quality score table + for (Map.Entry> entry : keysAndTablesMap.entrySet()) { + BQSRKeyManager keyManager = entry.getKey(); + if (keyManager.getRequiredCovariates().size() == 2) // it should be the only one with 2 required covaraites + qualTable = entry.getValue(); + } + + if (qualTable == null) + throw new ReviewedStingException("Could not find QualityScore table."); + + for (RecalDatum datum : qualTable.values()) { + int empiricalQual = (int) Math.round(datum.getEmpiricalQuality()); // convert the empirical quality to an integer ( it is already capped by MAX_QUAL ) + long nObservations = datum.numObservations; + qualHistogram[empiricalQual] += nObservations; // add the number of observations for every key + } + empiricalQualCounts = Arrays.asList(qualHistogram); // histogram with the number of observations of the empirical qualities + quantizeQualityScores(quantizationLevels); + + this.quantizationLevels = quantizationLevels; + } + + + public void quantizeQualityScores(int nLevels) { + QualQuantizer quantizer = new QualQuantizer(empiricalQualCounts, nLevels, QualityUtils.MIN_USABLE_Q_SCORE); // quantize the qualities to the desired number of levels + quantizedQuals = quantizer.getOriginalToQuantizedMap(); // map with the original to quantized qual map (using the standard number of levels in the RAC) + } + + public void noQuantization() { + this.quantizationLevels = QualityUtils.MAX_QUAL_SCORE; + for (int i = 0; i < this.quantizationLevels; i++) + quantizedQuals.set(i, (byte) i); + } + + public List getQuantizedQuals() { + return quantizedQuals; + } + + public int getQuantizationLevels() { + return quantizationLevels; + } + + public GATKReportTable 
generateReportTable() { + GATKReportTable quantizedTable = new GATKReportTable(RecalDataManager.QUANTIZED_REPORT_TABLE_TITLE, "Quality quantization map"); + quantizedTable.addPrimaryKey(RecalDataManager.QUALITY_SCORE_COLUMN_NAME); + quantizedTable.addColumn(RecalDataManager.QUANTIZED_COUNT_COLUMN_NAME, 0L); + quantizedTable.addColumn(RecalDataManager.QUANTIZED_VALUE_COLUMN_NAME, (byte) 0); + + for (int qual = 0; qual <= QualityUtils.MAX_QUAL_SCORE; qual++) { + quantizedTable.set(qual, RecalDataManager.QUANTIZED_COUNT_COLUMN_NAME, empiricalQualCounts.get(qual)); + quantizedTable.set(qual, RecalDataManager.QUANTIZED_VALUE_COLUMN_NAME, quantizedQuals.get(qual)); + } + return quantizedTable; + } + + private static int calculateQuantizationLevels(List quantizedQuals) { + byte lastByte = -1; + int quantizationLevels = 0; + for (byte q : quantizedQuals) { + if (q != lastByte) { + quantizationLevels++; + lastByte = q; + } + } + return quantizationLevels; + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadCovariates.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadCovariates.java new file mode 100644 index 000000000..74b759da5 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadCovariates.java @@ -0,0 +1,80 @@ +package org.broadinstitute.sting.gatk.walkers.bqsr; + +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.util.BitSet; + +/** + * The object temporarily held by a read that describes all of it's covariates. 
+ * + * In essence, this is an array of CovariateValues, but it also has some functionality to deal with the optimizations of the NestedHashMap + * + * @author Mauricio Carneiro + * @since 2/8/12 + */ +public class ReadCovariates { + private final BitSet[][] mismatchesKeySet; + private final BitSet[][] insertionsKeySet; + private final BitSet[][] deletionsKeySet; + + private int nextCovariateIndex; + + public ReadCovariates(int readLength, int numberOfCovariates) { + this.mismatchesKeySet = new BitSet[readLength][numberOfCovariates]; + this.insertionsKeySet = new BitSet[readLength][numberOfCovariates]; + this.deletionsKeySet = new BitSet[readLength][numberOfCovariates]; + this.nextCovariateIndex = 0; + } + + public void addCovariate(CovariateValues covariate) { + transposeCovariateValues(mismatchesKeySet, covariate.getMismatches()); + transposeCovariateValues(insertionsKeySet, covariate.getInsertions()); + transposeCovariateValues(deletionsKeySet, covariate.getDeletions()); + nextCovariateIndex++; + } + + public BitSet[] getKeySet(final int readPosition, final EventType errorModel) { + switch (errorModel) { + case BASE_SUBSTITUTION: + return getMismatchesKeySet(readPosition); + case BASE_INSERTION: + return getInsertionsKeySet(readPosition); + case BASE_DELETION: + return getDeletionsKeySet(readPosition); + default: + throw new ReviewedStingException("Unrecognized Base Recalibration type: " + errorModel); + } + } + + public BitSet[] getMismatchesKeySet(int readPosition) { + return mismatchesKeySet[readPosition]; + } + + public BitSet[] getInsertionsKeySet(int readPosition) { + return insertionsKeySet[readPosition]; + } + + public BitSet[] getDeletionsKeySet(int readPosition) { + return deletionsKeySet[readPosition]; + } + + private void transposeCovariateValues(BitSet[][] keySet, BitSet[] covariateValues) { + for (int i = 0; i < covariateValues.length; i++) + keySet[i][nextCovariateIndex] = covariateValues[i]; + } + + /** + * Testing routines + */ + protected 
BitSet[][] getMismatchesKeySet() { + return mismatchesKeySet; + } + + protected BitSet[][] getInsertionsKeySet() { + return insertionsKeySet; + } + + protected BitSet[][] getDeletionsKeySet() { + return deletionsKeySet; + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadGroupCovariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadGroupCovariate.java index aecdd3d4b..579643f56 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadGroupCovariate.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadGroupCovariate.java @@ -1,8 +1,11 @@ package org.broadinstitute.sting.gatk.walkers.bqsr; +import org.broadinstitute.sting.utils.BitSetUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.util.Arrays; +import java.util.BitSet; import java.util.HashMap; /* @@ -39,7 +42,7 @@ import java.util.HashMap; */ public class ReadGroupCovariate implements RequiredCovariate { - + private final HashMap readGroupLookupTable = new HashMap(); private final HashMap readGroupReverseLookupTable = new HashMap(); private short nextId = 0; @@ -52,30 +55,61 @@ public class ReadGroupCovariate implements RequiredCovariate { @Override public CovariateValues getValues(final GATKSAMRecord read) { final int l = read.getReadLength(); - final String readGroupId = read.getReadGroup().getReadGroupId(); + final String readGroupId = readGroupValueFromRG(read.getReadGroup()); + BitSet rg = bitSetForReadGroup(readGroupId); // All objects must output a BitSet, so we convert the "compressed" representation of the Read Group into a bitset + BitSet[] readGroups = new BitSet[l]; + Arrays.fill(readGroups, rg); + return new CovariateValues(readGroups, readGroups, readGroups); + } + + @Override + public final Object getValue(final String str) { + return str; + } + + @Override + public String keyFromBitSet(BitSet key) { + return 
decodeReadGroup((short) BitSetUtils.longFrom(key)); + } + + @Override + public BitSet bitSetFromKey(Object key) { + return bitSetForReadGroup((String) key); + } + + @Override + public int numberOfBits() { + return BitSetUtils.numberOfBitsToRepresent(Short.MAX_VALUE); + } + + private String decodeReadGroup(final short id) { + return readGroupReverseLookupTable.get(id); + } + + private BitSet bitSetForReadGroup(String readGroupId) { short shortId; - if (readGroupLookupTable.containsKey(readGroupId)) + if (readGroupLookupTable.containsKey(readGroupId)) shortId = readGroupLookupTable.get(readGroupId); else { shortId = nextId; readGroupLookupTable.put(readGroupId, nextId); readGroupReverseLookupTable.put(nextId, readGroupId); nextId++; - } - Short [] readGroups = new Short[l]; - Arrays.fill(readGroups, shortId); - return new CovariateValues(readGroups, readGroups, readGroups); + } + return BitSetUtils.bitSetFrom(shortId); } - // Used to get the covariate's value from input csv file during on-the-fly recalibration - @Override - public final Object getValue(final String str) { - return str; + /** + * If the sample has a PU tag annotation, return that. If not, return the read group id. + * + * @param rg the read group record + * @return platform unit or readgroup id + */ + private String readGroupValueFromRG(GATKSAMReadGroupRecord rg) { + String platformUnit = rg.getPlatformUnit(); + return platformUnit == null ? 
rg.getId() : platformUnit; } - public final String decodeReadGroup(final short id) { - return readGroupReverseLookupTable.get(id); - } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDataManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDataManager.java index cc60ac010..53e7c3f35 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDataManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDataManager.java @@ -25,22 +25,26 @@ package org.broadinstitute.sting.gatk.walkers.bqsr; -import net.sf.samtools.SAMUtils; -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.report.GATKReport; +import org.broadinstitute.sting.gatk.report.GATKReportTable; import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.R.RScriptExecutor; import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.collections.NestedHashMap; +import org.broadinstitute.sting.utils.classloader.PluginManager; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.exceptions.DynamicClassResolutionException; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.sam.AlignmentUtils; +import org.broadinstitute.sting.utils.io.Resource; import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.PrintStream; +import java.util.*; /** * Created by IntelliJ IDEA. 
@@ -53,24 +57,31 @@ import java.util.Map; */ public class RecalDataManager { - public final NestedHashMap nestedHashMap; // The full dataset - private final HashMap dataCollapsedReadGroup; // Table where everything except read group has been collapsed - private final HashMap dataCollapsedQualityScore; // Table where everything except read group and quality score has been collapsed - private final HashMap> dataCollapsedByCovariate; // Tables where everything except read group, quality score, and given covariate has been collapsed + public final static String ARGUMENT_REPORT_TABLE_TITLE = "Arguments"; + public final static String QUANTIZED_REPORT_TABLE_TITLE = "Quantized"; + public final static String READGROUP_REPORT_TABLE_TITLE = "RecalTable0"; + public final static String QUALITY_SCORE_REPORT_TABLE_TITLE = "RecalTable1"; + public final static String ALL_COVARIATES_REPORT_TABLE_TITLE = "RecalTable2"; - public final static String ORIGINAL_QUAL_ATTRIBUTE_TAG = "OQ"; // The tag that holds the original quality scores - public final static String COLOR_SPACE_QUAL_ATTRIBUTE_TAG = "CQ"; // The tag that holds the color space quality scores for SOLID bams - public final static String COLOR_SPACE_ATTRIBUTE_TAG = "CS"; // The tag that holds the color space for SOLID bams - public final static String COLOR_SPACE_INCONSISTENCY_TAG = "ZC"; // A new tag made up for the recalibrator which will hold an array of ints which say if this base is inconsistent with its color + public final static String ARGUMENT_VALUE_COLUMN_NAME = "Value"; + public final static String QUANTIZED_VALUE_COLUMN_NAME = "QuantizedScore"; + public static final String QUANTIZED_COUNT_COLUMN_NAME = "Count"; + public final static String READGROUP_COLUMN_NAME = "ReadGroup"; + public final static String EVENT_TYPE_COLUMN_NAME = "EventType"; + public final static String EMPIRICAL_QUALITY_COLUMN_NAME = "EmpiricalQuality"; + public final static String ESTIMATED_Q_REPORTED_COLUMN_NAME = "EstimatedQReported"; + public 
final static String QUALITY_SCORE_COLUMN_NAME = "QualityScore"; + public final static String COVARIATE_VALUE_COLUMN_NAME = "CovariateValue"; + public final static String COVARIATE_NAME_COLUMN_NAME = "CovariateName"; + public final static String NUMBER_OBSERVATIONS_COLUMN_NAME = "Observations"; + public final static String NUMBER_ERRORS_COLUMN_NAME = "Errors"; + + private final static String COLOR_SPACE_ATTRIBUTE_TAG = "CS"; // The tag that holds the color space for SOLID bams + private final static String COLOR_SPACE_INCONSISTENCY_TAG = "ZC"; // A new tag made up for the recalibrator which will hold an array of ints which say if this base is inconsistent with its color private static boolean warnUserNullPlatform = false; - private static final String COVARS_ATTRIBUTE = "COVARS"; // used to store covariates array as a temporary attribute inside GATKSAMRecord.\ + private static final String SCRIPT_FILE = "BQSR.R"; - public enum BaseRecalibrationType { - BASE_SUBSTITUTION, - BASE_INSERTION, - BASE_DELETION - } public enum SOLID_RECAL_MODE { /** @@ -88,7 +99,20 @@ public class RecalDataManager { /** * Look at the color quality scores and probabilistically decide to change the reference inserted base to be the base which is implied by the original color space instead of the reference. 
*/ - REMOVE_REF_BIAS + REMOVE_REF_BIAS; + + public static SOLID_RECAL_MODE recalModeFromString(String recalMode) { + if (recalMode.equals("DO_NOTHING")) + return SOLID_RECAL_MODE.DO_NOTHING; + if (recalMode.equals("SET_Q_ZERO")) + return SOLID_RECAL_MODE.SET_Q_ZERO; + if (recalMode.equals("SET_Q_ZERO_BASE_N")) + return SOLID_RECAL_MODE.SET_Q_ZERO_BASE_N; + if (recalMode.equals("REMOVE_REF_BIAS")) + return SOLID_RECAL_MODE.REMOVE_REF_BIAS; + + throw new UserException.BadArgumentValue(recalMode, "is not a valid SOLID_RECAL_MODE value"); + } } public enum SOLID_NOCALL_STRATEGY { @@ -103,175 +127,348 @@ public class RecalDataManager { /** * Mark these reads as failing vendor quality checks so they can be filtered out by downstream analyses. */ - PURGE_READ - } + PURGE_READ; - public RecalDataManager() { - nestedHashMap = new NestedHashMap(); - dataCollapsedReadGroup = null; - dataCollapsedQualityScore = null; - dataCollapsedByCovariate = null; - } + public static SOLID_NOCALL_STRATEGY nocallStrategyFromString(String nocallStrategy) { + if (nocallStrategy.equals("THROW_EXCEPTION")) + return SOLID_NOCALL_STRATEGY.THROW_EXCEPTION; + if (nocallStrategy.equals("LEAVE_READ_UNRECALIBRATED")) + return SOLID_NOCALL_STRATEGY.LEAVE_READ_UNRECALIBRATED; + if (nocallStrategy.equals("PURGE_READ")) + return SOLID_NOCALL_STRATEGY.PURGE_READ; - public RecalDataManager(final boolean createCollapsedTables, final int numCovariates) { - if (createCollapsedTables) { // Initialize all the collapsed tables, only used by on-the-fly recalibration - nestedHashMap = null; - dataCollapsedReadGroup = new HashMap(); - dataCollapsedQualityScore = new HashMap(); - dataCollapsedByCovariate = new HashMap>(); - for ( final BaseRecalibrationType errorModel : BaseRecalibrationType.values() ) { - dataCollapsedReadGroup.put(errorModel, new NestedHashMap()); - dataCollapsedQualityScore.put(errorModel, new NestedHashMap()); - dataCollapsedByCovariate.put(errorModel, new ArrayList()); - for (int iii = 0; iii < 
numCovariates - 2; iii++) { // readGroup and QualityScore aren't counted here, their tables are separate - dataCollapsedByCovariate.get(errorModel).add(new NestedHashMap()); - } - } - } - else { - nestedHashMap = new NestedHashMap(); - dataCollapsedReadGroup = null; - dataCollapsedQualityScore = null; - dataCollapsedByCovariate = null; + throw new UserException.BadArgumentValue(nocallStrategy, "is not a valid SOLID_NOCALL_STRATEGY value"); } } - public static CovariateKeySet getAllCovariateValuesFor(GATKSAMRecord read) { - return (CovariateKeySet) read.getTemporaryAttribute(COVARS_ATTRIBUTE); - } - + /** - * Add the given mapping to all of the collapsed hash tables + * Initializes the recalibration table -> key manager map * - * @param key The list of comparables that is the key for this mapping - * @param fullDatum The RecalDatum which is the data for this mapping - * @param PRESERVE_QSCORES_LESS_THAN The threshold in report quality for adding to the aggregate collapsed table + * @param requiredCovariates list of required covariates (in order) + * @param optionalCovariates list of optional covariates (in order) + * @return a map with each key manager and it's corresponding recalibration table properly initialized */ - public final void addToAllTables(final Object[] key, final RecalDatum fullDatum, final int PRESERVE_QSCORES_LESS_THAN, final BaseRecalibrationType errorModel ) { - - // The full dataset isn't actually ever used for anything because of the sequential calculation so no need to keep the full data HashMap around - //data.put(key, thisDatum); // add the mapping to the main table - - final int qualityScore = Integer.parseInt(key[1].toString()); - final Object[] readGroupCollapsedKey = new Object[1]; - final Object[] qualityScoreCollapsedKey = new Object[2]; - final Object[] covariateCollapsedKey = new Object[3]; - RecalDatum collapsedDatum; - - // Create dataCollapsedReadGroup, the table where everything except read group has been collapsed - if 
(qualityScore >= PRESERVE_QSCORES_LESS_THAN) { - readGroupCollapsedKey[0] = key[0]; // Make a new key with just the read group - collapsedDatum = (RecalDatum) dataCollapsedReadGroup.get(errorModel).get(readGroupCollapsedKey); - if (collapsedDatum == null) { - dataCollapsedReadGroup.get(errorModel).put(new RecalDatum(fullDatum), readGroupCollapsedKey); - } - else { - collapsedDatum.combine(fullDatum); // using combine instead of increment in order to calculate overall aggregateQReported - } + public static LinkedHashMap> initializeTables(ArrayList requiredCovariates, ArrayList optionalCovariates) { + final LinkedHashMap> tablesAndKeysMap = new LinkedHashMap>(); + ArrayList requiredCovariatesToAdd = new ArrayList(requiredCovariates.size() + 1); // incrementally add the covariates to create the recal tables with 1, 2 and 3 covariates. + ArrayList optionalCovariatesToAdd = new ArrayList(); // initialize an empty array of optional covariates to create the first few tables + for (Covariate covariate : requiredCovariates) { + requiredCovariatesToAdd.add(covariate); + final Map recalTable = new HashMap(); // initializing a new recal table for each required covariate (cumulatively) + final BQSRKeyManager keyManager = new BQSRKeyManager(requiredCovariatesToAdd, optionalCovariatesToAdd); // initializing it's corresponding key manager + tablesAndKeysMap.put(keyManager, recalTable); // adding the pair table+key to the map } + final Map recalTable = new HashMap(Short.MAX_VALUE); // initializing a new recal table to hold all optional covariates + final BQSRKeyManager keyManager = new BQSRKeyManager(requiredCovariates, optionalCovariates); // initializing it's corresponding key manager + tablesAndKeysMap.put(keyManager, recalTable); // adding the pair table+key to the map + return tablesAndKeysMap; + } - // Create dataCollapsedQuality, the table where everything except read group and quality score has been collapsed - qualityScoreCollapsedKey[0] = key[0]; // Make a new key with 
the read group ... - qualityScoreCollapsedKey[1] = key[1]; // and quality score - collapsedDatum = (RecalDatum) dataCollapsedQualityScore.get(errorModel).get(qualityScoreCollapsedKey); - if (collapsedDatum == null) { - dataCollapsedQualityScore.get(errorModel).put(new RecalDatum(fullDatum), qualityScoreCollapsedKey); - } - else { - collapsedDatum.increment(fullDatum); - } + /** + * Generates two lists : required covariates and optional covariates based on the user's requests. + * + * Performs the following tasks in order: + * 1. Adds all requierd covariates in order + * 2. Check if the user asked to use the standard covariates and adds them all if that's the case + * 3. Adds all covariates requested by the user that were not already added by the two previous steps + * + * @param argumentCollection the argument collection object for the recalibration walker + * @return a pair of ordered lists : required covariates (first) and optional covariates (second) + */ + public static Pair, ArrayList> initializeCovariates(RecalibrationArgumentCollection argumentCollection) { + final List> covariateClasses = new PluginManager(Covariate.class).getPlugins(); + final List> requiredClasses = new PluginManager(RequiredCovariate.class).getPlugins(); + final List> standardClasses = new PluginManager(StandardCovariate.class).getPlugins(); - // Create dataCollapsedByCovariate's, the tables where everything except read group, quality score, and given covariate has been collapsed - for (int iii = 0; iii < dataCollapsedByCovariate.get(errorModel).size(); iii++) { - covariateCollapsedKey[0] = key[0]; // Make a new key with the read group ... - covariateCollapsedKey[1] = key[1]; // and quality score ... 
- final Object theCovariateElement = key[iii + 2]; // and the given covariate - if (theCovariateElement != null) { - covariateCollapsedKey[2] = theCovariateElement; - collapsedDatum = (RecalDatum) dataCollapsedByCovariate.get(errorModel).get(iii).get(covariateCollapsedKey); - if (collapsedDatum == null) { - dataCollapsedByCovariate.get(errorModel).get(iii).put(new RecalDatum(fullDatum), covariateCollapsedKey); + ArrayList requiredCovariates = addRequiredCovariatesToList(requiredClasses); // add the required covariates + ArrayList optionalCovariates = new ArrayList(); + if (argumentCollection.USE_STANDARD_COVARIATES) + optionalCovariates = addStandardCovariatesToList(standardClasses); // add the standard covariates if -standard was specified by the user + + if (argumentCollection.COVARIATES != null) { // parse the -cov arguments that were provided, skipping over the ones already specified + for (String requestedCovariateString : argumentCollection.COVARIATES) { + boolean foundClass = false; + for (Class covClass : covariateClasses) { + if (requestedCovariateString.equalsIgnoreCase(covClass.getSimpleName())) { // -cov argument matches the class name for an implementing class + foundClass = true; + if (!requiredClasses.contains(covClass) && + (!argumentCollection.USE_STANDARD_COVARIATES || !standardClasses.contains(covClass))) { + try { + final Covariate covariate = covClass.newInstance(); // now that we've found a matching class, try to instantiate it + optionalCovariates.add(covariate); + } catch (Exception e) { + throw new DynamicClassResolutionException(covClass, e); + } + } + } } - else { - collapsedDatum.increment(fullDatum); + + if (!foundClass) { + throw new UserException.CommandLineException("The requested covariate type (" + requestedCovariateString + ") isn't a valid covariate option. 
Use --list to see possible covariates."); } } } + return new Pair, ArrayList>(requiredCovariates, optionalCovariates); + } + + public static void listAvailableCovariates(Logger logger) { + // Get a list of all available covariates + final List> covariateClasses = new PluginManager(Covariate.class).getPlugins(); + + // Print and exit if that's what was requested + logger.info("Available covariates:"); + for (Class covClass : covariateClasses) + logger.info(covClass.getSimpleName()); + logger.info(""); + } + + private static List generateReportTables(Map> keysAndTablesMap) { + List result = new LinkedList(); + int tableIndex = 0; + + final Pair covariateValue = new Pair(RecalDataManager.COVARIATE_VALUE_COLUMN_NAME, "%s"); + final Pair covariateName = new Pair(RecalDataManager.COVARIATE_NAME_COLUMN_NAME, "%s"); + final Pair eventType = new Pair(RecalDataManager.EVENT_TYPE_COLUMN_NAME, "%s"); + final Pair empiricalQuality = new Pair(RecalDataManager.EMPIRICAL_QUALITY_COLUMN_NAME, "%.4f"); + final Pair estimatedQReported = new Pair(RecalDataManager.ESTIMATED_Q_REPORTED_COLUMN_NAME, "%.4f"); + final Pair nObservations = new Pair(RecalDataManager.NUMBER_OBSERVATIONS_COLUMN_NAME, "%d"); + final Pair nErrors = new Pair(RecalDataManager.NUMBER_ERRORS_COLUMN_NAME, "%d"); + + for (Map.Entry> entry : keysAndTablesMap.entrySet()) { + BQSRKeyManager keyManager = entry.getKey(); + Map recalTable = entry.getValue(); + + boolean isReadGroupTable = tableIndex == 0; // special case for the read group table so we can print the extra column it needs. 
+ GATKReportTable reportTable = new GATKReportTable("RecalTable" + tableIndex++, ""); + + List requiredList = keyManager.getRequiredCovariates(); // ask the key manager what required covariates were used in this recal table + List optionalList = keyManager.getOptionalCovariates(); // ask the key manager what optional covariates were used in this recal table + + ArrayList> columnNames = new ArrayList>(); // initialize the array to hold the column names + + for (Covariate covariate : requiredList) { + String name = covariate.getClass().getSimpleName().split("Covariate")[0]; // get the covariate names and put them in order + columnNames.add(new Pair(name, "%s")); // save the required covariate name so we can reference it in the future + } + + if (optionalList.size() > 0) { + columnNames.add(covariateValue); + columnNames.add(covariateName); + } + + columnNames.add(eventType); // the order of these column names is important here + columnNames.add(empiricalQuality); + if (isReadGroupTable) + columnNames.add(estimatedQReported); // only the read group table needs the estimated Q reported + columnNames.add(nObservations); + columnNames.add(nErrors); + + reportTable.addPrimaryKey("PrimaryKey", false); // every table must have a primary key (hidden) + for (Pair columnName : columnNames) + reportTable.addColumn(columnName.getFirst(), true, columnName.getSecond()); // every table must have the event type + + long primaryKey = 0L; + + for (Map.Entry recalTableEntry : recalTable.entrySet()) { // create a map with column name => key value for all covariate keys + BitSet bitSetKey = recalTableEntry.getKey(); + Map columnData = new HashMap(columnNames.size()); + Iterator> iterator = columnNames.iterator(); + for (Object key : keyManager.keySetFrom(bitSetKey)) { + String columnName = iterator.next().getFirst(); + columnData.put(columnName, key); + } + RecalDatum datum = recalTableEntry.getValue(); + columnData.put(iterator.next().getFirst(), datum.getEmpiricalQuality()); + if 
(isReadGroupTable) + columnData.put(iterator.next().getFirst(), datum.getEstimatedQReported()); // we only add the estimated Q reported in the RG table + columnData.put(iterator.next().getFirst(), datum.numObservations); + columnData.put(iterator.next().getFirst(), datum.numMismatches); + + for (Map.Entry dataEntry : columnData.entrySet()) { + String columnName = dataEntry.getKey(); + Object value = dataEntry.getValue(); + reportTable.set(primaryKey, columnName, value.toString()); + } + primaryKey++; + } + result.add(reportTable); + } + return result; + } + + public static void outputRecalibrationReport(RecalibrationArgumentCollection RAC, QuantizationInfo quantizationInfo, Map> keysAndTablesMap, PrintStream outputFile) { + outputRecalibrationReport(RAC.generateReportTable(), quantizationInfo.generateReportTable(), generateReportTables(keysAndTablesMap), outputFile); + } + + public static void outputRecalibrationReport(GATKReportTable argumentTable, QuantizationInfo quantizationInfo, LinkedHashMap> keysAndTablesMap, PrintStream outputFile) { + outputRecalibrationReport(argumentTable, quantizationInfo.generateReportTable(), generateReportTables(keysAndTablesMap), outputFile); + } + + private static void outputRecalibrationReport(GATKReportTable argumentTable, GATKReportTable quantizationTable, List recalTables, PrintStream outputFile) { + GATKReport report = new GATKReport(); + report.addTable(argumentTable); + report.addTable(quantizationTable); + report.addTables(recalTables); + report.print(outputFile); + } + + private static Pair initializeRecalibrationPlot(File filename) { + final PrintStream deltaTableStream; + final File deltaTableFileName = new File(filename + ".csv"); + try { + deltaTableStream = new PrintStream(deltaTableFileName); + } catch (FileNotFoundException e) { + throw new UserException.CouldNotCreateOutputFile(deltaTableFileName, "File " + deltaTableFileName + " could not be created"); + } + return new Pair(deltaTableStream, deltaTableFileName); + 
} + + private static void outputRecalibrationPlot(Pair files, boolean keepIntermediates) { + final File csvFileName = files.getSecond(); + final File plotFileName = new File(csvFileName + ".pdf"); + files.getFirst().close(); + + RScriptExecutor executor = new RScriptExecutor(); + executor.addScript(new Resource(SCRIPT_FILE, RecalDataManager.class)); + executor.addArgs(csvFileName.getAbsolutePath()); + executor.addArgs(plotFileName.getAbsolutePath()); + executor.exec(); + + if (!keepIntermediates) + if (!csvFileName.delete()) + throw new ReviewedStingException("Could not find file " + csvFileName.getAbsolutePath()); + + } + + public static void generateRecalibrationPlot(File filename, LinkedHashMap> original, boolean keepIntermediates) { + Pair files = initializeRecalibrationPlot(filename); + writeCSV(files.getFirst(), original, "ORIGINAL", true); + outputRecalibrationPlot(files, keepIntermediates); + } + + public static void generateRecalibrationPlot(File filename, LinkedHashMap> original, LinkedHashMap> recalibrated, boolean keepIntermediates) { + Pair files = initializeRecalibrationPlot(filename); + writeCSV(files.getFirst(), recalibrated, "RECALIBRATED", true); + writeCSV(files.getFirst(), original, "ORIGINAL", false); + outputRecalibrationPlot(files, keepIntermediates); + } + + private static void writeCSV(PrintStream deltaTableFile, LinkedHashMap> map, String recalibrationMode, boolean printHeader) { + final int QUALITY_SCORE_COVARIATE_INDEX = 1; + final Map deltaTable = new HashMap(); + BQSRKeyManager deltaKeyManager = null; + + + for (Map.Entry> tableEntry : map.entrySet()) { + BQSRKeyManager keyManager = tableEntry.getKey(); + + if (keyManager.getOptionalCovariates().size() > 0) { // initialize with the 'all covariates' table + // create a key manager for the delta table + List requiredCovariates = keyManager.getRequiredCovariates().subList(0, 1); // include the read group covariate as the only required covariate + List optionalCovariates = 
keyManager.getRequiredCovariates().subList(1, 2); // include the quality score covariate as an optional covariate + optionalCovariates.addAll(keyManager.getOptionalCovariates()); // include all optional covariates + deltaKeyManager = new BQSRKeyManager(requiredCovariates, optionalCovariates); // initialize the key manager + } + } + + if (deltaKeyManager == null) + throw new ReviewedStingException ("Couldn't find the covariates table"); + + boolean readyToPrint = false; + for (Map.Entry> tableEntry : map.entrySet()) { + BQSRKeyManager keyManager = tableEntry.getKey(); + + if (keyManager.getRequiredCovariates().size() == 2 && keyManager.getOptionalCovariates().isEmpty()) { // look for the QualityScore table + Map table = tableEntry.getValue(); + + // add the quality score table to the delta table + for (Map.Entry entry : table.entrySet()) { // go through every element in the covariates table to create the delta table + RecalDatum recalDatum = entry.getValue(); // the current element (recal datum) + + List covs = keyManager.keySetFrom(entry.getKey()); // extract the key objects from the bitset key + List newCovs = new ArrayList(4); + newCovs.add(0, covs.get(0)); // replace the covariate value with the quality score + newCovs.add(1, covs.get(1)); + newCovs.add(2, "QualityScore"); // replace the covariate name with QualityScore (for the QualityScore covariate) + newCovs.add(3, covs.get(2)); + BitSet deltaKey = deltaKeyManager.bitSetFromKey(newCovs.toArray()); // create a new bitset key for the delta table + addToDeltaTable(deltaTable, deltaKey, recalDatum); // add this covariate to the delta table + } + } + + else if (keyManager.getOptionalCovariates().size() > 0) { // look for the optional covariates table + Map table = tableEntry.getValue(); + + // add the optional covariates to the delta table + for (Map.Entry entry : table.entrySet()) { // go through every element in the covariates table to create the delta table + RecalDatum recalDatum = entry.getValue(); // the 
current element (recal datum) + + List covs = keyManager.keySetFrom(entry.getKey()); // extract the key objects from the bitset key + covs.remove(QUALITY_SCORE_COVARIATE_INDEX); // reset the quality score covariate to 0 from the keyset (so we aggregate all rows regardless of QS) + BitSet deltaKey = deltaKeyManager.bitSetFromKey(covs.toArray()); // create a new bitset key for the delta table + addToDeltaTable(deltaTable, deltaKey, recalDatum); // add this covariate to the delta table + } + readyToPrint = true; + } + + // output the csv file + if (readyToPrint) { + + if (printHeader) { + List header = new LinkedList(); + header.add("ReadGroup"); + header.add("CovariateValue"); + header.add("CovariateName"); + header.add("EventType"); + header.add("Observations"); + header.add("Errors"); + header.add("EmpiricalQuality"); + header.add("AverageReportedQuality"); + header.add("Accuracy"); + header.add("Recalibration"); + deltaTableFile.println(Utils.join(",", header)); + } + + // print each data line + for(Map.Entry deltaEntry : deltaTable.entrySet()) { + List deltaKeys = deltaKeyManager.keySetFrom(deltaEntry.getKey()); + RecalDatum deltaDatum = deltaEntry.getValue(); + deltaTableFile.print(Utils.join(",", deltaKeys)); + deltaTableFile.print("," + deltaDatum.stringForCSV()); + deltaTableFile.println("," + recalibrationMode); + } + + } + + } } /** - * Loop over all the collapsed tables and turn the recalDatums found there into an empirical quality score - * that will be used in the sequential calculation in TableRecalibrationWalker + * Updates the current RecalDatum element in the delta table. 
* - * @param smoothing The smoothing parameter that goes into empirical quality score calculation - * @param maxQual At which value to cap the quality scores - */ - public final void generateEmpiricalQualities(final int smoothing, final int maxQual) { - - for( final BaseRecalibrationType errorModel : BaseRecalibrationType.values() ) { - recursivelyGenerateEmpiricalQualities(dataCollapsedReadGroup.get(errorModel).data, smoothing, maxQual); - recursivelyGenerateEmpiricalQualities(dataCollapsedQualityScore.get(errorModel).data, smoothing, maxQual); - for (NestedHashMap map : dataCollapsedByCovariate.get(errorModel)) { - recursivelyGenerateEmpiricalQualities(map.data, smoothing, maxQual); - checkForSingletons(map.data); - } - } - } - - private void recursivelyGenerateEmpiricalQualities(final Map data, final int smoothing, final int maxQual) { - - for (Object comp : data.keySet()) { - final Object val = data.get(comp); - if (val instanceof RecalDatum) { // We are at the end of the nested hash maps - ((RecalDatum) val).calcCombinedEmpiricalQuality(smoothing, maxQual); - } - else { // Another layer in the nested hash map - recursivelyGenerateEmpiricalQualities((Map) val, smoothing, maxQual); - } - } - } - - private void checkForSingletons(final Map data) { - // todo -- this looks like it's better just as a data.valueSet() call? - for (Object comp : data.keySet()) { - final Object val = data.get(comp); - if (val instanceof RecalDatum) { // We are at the end of the nested hash maps - if (data.keySet().size() == 1) { - data.clear(); // don't TableRecalibrate a non-required covariate if it only has one element because that correction has already been done ... 
- // in a previous step of the sequential calculation model - } - } - else { // Another layer in the nested hash map - checkForSingletons((Map) val); - } - } - } - - /** - * Get the appropriate collapsed table out of the set of all the tables held by this Object + * If it doesn't have an element yet, it creates an RecalDatum element and adds it to the delta table. * - * @param covariate Which covariate indexes the desired collapsed HashMap - * @return The desired collapsed HashMap + * @param deltaTable the delta table + * @param deltaKey the key to the table + * @param recalDatum the recal datum to combine with the accuracyDatum element in the table */ - public final NestedHashMap getCollapsedTable(final int covariate, final BaseRecalibrationType errorModel) { - if (covariate == 0) { - return dataCollapsedReadGroup.get(errorModel); // Table where everything except read group has been collapsed - } - else if (covariate == 1) { - return dataCollapsedQualityScore.get(errorModel); // Table where everything except read group and quality score has been collapsed - } - else { - return dataCollapsedByCovariate.get(errorModel).get(covariate - 2); // Table where everything except read group, quality score, and given covariate has been collapsed - } + private static void addToDeltaTable(Map deltaTable, BitSet deltaKey, RecalDatum recalDatum) { + RecalDatum deltaDatum = deltaTable.get(deltaKey); // check if we already have a RecalDatum for this key + if (deltaDatum == null) + deltaTable.put(deltaKey, new RecalDatum(recalDatum)); // if we don't have a key yet, create a new one with the same values as the curent datum + else + deltaDatum.combine(recalDatum); // if we do have a datum, combine it with this one. 
} + /** * Section of code shared between the two recalibration walkers which uses the command line arguments to adjust attributes of the read such as quals or platform string * * @param read The read to adjust * @param RAC The list of shared command line arguments */ - public static void parseSAMRecord(final GATKSAMRecord read, final RecalibrationArgumentCollection RAC) { + public static void parsePlatformForRead(final GATKSAMRecord read, final RecalibrationArgumentCollection RAC) { GATKSAMReadGroupRecord readGroup = read.getReadGroup(); if (RAC.FORCE_PLATFORM != null && (readGroup.getPlatform() == null || !readGroup.getPlatform().equals(RAC.FORCE_PLATFORM))) { @@ -295,262 +492,53 @@ public class RecalDataManager { } /** - * Parse through the color space of the read and add a new tag to the SAMRecord that says which bases are inconsistent with the color space + * Parse through the color space of the read and add a new tag to the SAMRecord that says which bases are + * inconsistent with the color space. If there is no call in the color space, this method returns true meaning + * this read should be skipped * - * @param read The SAMRecord to parse + * @param strategy the strategy used for SOLID no calls + * @param read The SAMRecord to parse + * @return whether or not this read should be skipped */ - public static void parseColorSpace(final GATKSAMRecord read) { - - // If this is a SOLID read then we have to check if the color space is inconsistent. This is our only sign that SOLID has inserted the reference base - if (ReadUtils.isSOLiDRead(read)) { - if (read.getAttribute(RecalDataManager.COLOR_SPACE_INCONSISTENCY_TAG) == null) { // Haven't calculated the inconsistency array yet for this read + public static boolean isColorSpaceConsistent(final SOLID_NOCALL_STRATEGY strategy, final GATKSAMRecord read) { + if (ReadUtils.isSOLiDRead(read)) { // If this is a SOLID read then we have to check if the color space is inconsistent. 
This is our only sign that SOLID has inserted the reference base + if (read.getAttribute(RecalDataManager.COLOR_SPACE_INCONSISTENCY_TAG) == null) { // Haven't calculated the inconsistency array yet for this read final Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG); if (attr != null) { byte[] colorSpace; - if (attr instanceof String) { + if (attr instanceof String) colorSpace = ((String) attr).getBytes(); - } - else { + else throw new UserException.MalformedBAM(read, String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG, read.getReadName())); - } - - // Loop over the read and calculate first the inferred bases from the color and then check if it is consistent with the read - byte[] readBases = read.getReadBases(); - if (read.getReadNegativeStrandFlag()) { + + byte[] readBases = read.getReadBases(); // Loop over the read and calculate first the inferred bases from the color and then check if it is consistent with the read + if (read.getReadNegativeStrandFlag()) readBases = BaseUtils.simpleReverseComplement(read.getReadBases()); - } + final byte[] inconsistency = new byte[readBases.length]; - int iii; - byte prevBase = colorSpace[0]; // The sentinel - for (iii = 0; iii < readBases.length; iii++) { - final byte thisBase = getNextBaseFromColor(read, prevBase, colorSpace[iii + 1]); - inconsistency[iii] = (byte) (thisBase == readBases[iii] ? 0 : 1); - prevBase = readBases[iii]; + int i; + byte prevBase = colorSpace[0]; // The sentinel + for (i = 0; i < readBases.length; i++) { + final byte thisBase = getNextBaseFromColor(read, prevBase, colorSpace[i + 1]); + inconsistency[i] = (byte) (thisBase == readBases[i] ? 
0 : 1); + prevBase = readBases[i]; } read.setAttribute(RecalDataManager.COLOR_SPACE_INCONSISTENCY_TAG, inconsistency); + } + else if (strategy == SOLID_NOCALL_STRATEGY.THROW_EXCEPTION) // if the strategy calls for an exception, throw it + throw new UserException.MalformedBAM(read, "Unable to find color space information in SOLiD read. First observed at read with name = " + read.getReadName() + " Unfortunately this .bam file can not be recalibrated without color space information because of potential reference bias."); - } - else { - throw new UserException.MalformedBAM(read, "Unable to find color space information in SOLiD read. First observed at read with name = " + read.getReadName() + - " Unfortunately this .bam file can not be recalibrated without color space information because of potential reference bias."); - } + else + return true; // otherwise, just skip the read } } - } - - /** - * Parse through the color space of the read and apply the desired --solid_recal_mode correction to the bases - * This method doesn't add the inconsistent tag to the read like parseColorSpace does - * - * @param read The SAMRecord to parse - * @param originalQualScores The array of original quality scores to modify during the correction - * @param solidRecalMode Which mode of solid recalibration to apply - * @param refBases The reference for this read - * @return A new array of quality scores that have been ref bias corrected - */ - public static byte[] calcColorSpace(final GATKSAMRecord read, byte[] originalQualScores, final SOLID_RECAL_MODE solidRecalMode, final byte[] refBases) { - - final Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG); - if (attr != null) { - byte[] colorSpace; - if (attr instanceof String) { - colorSpace = ((String) attr).getBytes(); - } - else { - throw new ReviewedStingException(String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG, read.getReadName())); - } - - // Loop over the 
read and calculate first the inferred bases from the color and then check if it is consistent with the read - byte[] readBases = read.getReadBases(); - final byte[] colorImpliedBases = readBases.clone(); - byte[] refBasesDirRead = AlignmentUtils.alignmentToByteArray(read.getCigar(), read.getReadBases(), refBases); //BUGBUG: This needs to change when read walkers are changed to give the aligned refBases - if (read.getReadNegativeStrandFlag()) { - readBases = BaseUtils.simpleReverseComplement(read.getReadBases()); - refBasesDirRead = BaseUtils.simpleReverseComplement(refBasesDirRead.clone()); - } - final int[] inconsistency = new int[readBases.length]; - byte prevBase = colorSpace[0]; // The sentinel - for (int iii = 0; iii < readBases.length; iii++) { - final byte thisBase = getNextBaseFromColor(read, prevBase, colorSpace[iii + 1]); - colorImpliedBases[iii] = thisBase; - inconsistency[iii] = (thisBase == readBases[iii] ? 0 : 1); - prevBase = readBases[iii]; - } - - // Now that we have the inconsistency array apply the desired correction to the inconsistent bases - if (solidRecalMode == SOLID_RECAL_MODE.SET_Q_ZERO) { // Set inconsistent bases and the one before it to Q0 - final boolean setBaseN = false; - originalQualScores = solidRecalSetToQZero(read, readBases, inconsistency, originalQualScores, refBasesDirRead, setBaseN); - } - else if (solidRecalMode == SOLID_RECAL_MODE.SET_Q_ZERO_BASE_N) { - final boolean setBaseN = true; - originalQualScores = solidRecalSetToQZero(read, readBases, inconsistency, originalQualScores, refBasesDirRead, setBaseN); - } - else if (solidRecalMode == SOLID_RECAL_MODE.REMOVE_REF_BIAS) { // Use the color space quality to probabilistically remove ref bases at inconsistent color space bases - solidRecalRemoveRefBias(read, readBases, inconsistency, colorImpliedBases, refBasesDirRead); - } - - } - else { - throw new UserException.MalformedBAM(read, "Unable to find color space information in SOLiD read. 
First observed at read with name = " + read.getReadName() + - " Unfortunately this .bam file can not be recalibrated without color space information because of potential reference bias."); - } - - return originalQualScores; - } - - public static boolean checkNoCallColorSpace(final GATKSAMRecord read) { - if (ReadUtils.isSOLiDRead(read)) { - final Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG); - if (attr != null) { - byte[] colorSpace; - if (attr instanceof String) { - colorSpace = ((String) attr).substring(1).getBytes(); // trim off the Sentinel - } - else { - throw new ReviewedStingException(String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG, read.getReadName())); - } - - for (byte color : colorSpace) { - if (color != (byte) '0' && color != (byte) '1' && color != (byte) '2' && color != (byte) '3') { - return true; // There is a bad color in this SOLiD read and the user wants to skip over it - } - } - - } - else { - throw new UserException.MalformedBAM(read, "Unable to find color space information in SOLiD read. First observed at read with name = " + read.getReadName() + - " Unfortunately this .bam file can not be recalibrated without color space information because of potential reference bias."); - } - } - - return false; // There aren't any color no calls in this SOLiD read - } - - /** - * Perform the SET_Q_ZERO solid recalibration. 
Inconsistent color space bases and their previous base are set to quality zero - * - * @param read The SAMRecord to recalibrate - * @param readBases The bases in the read which have been RC'd if necessary - * @param inconsistency The array of 1/0 that says if this base is inconsistent with its color - * @param originalQualScores The array of original quality scores to set to zero if needed - * @param refBases The reference which has been RC'd if necessary - * @param setBaseN Should we also set the base to N as well as quality zero in order to visualize in IGV or something similar - * @return The byte array of original quality scores some of which might have been set to zero - */ - private static byte[] solidRecalSetToQZero(final GATKSAMRecord read, byte[] readBases, final int[] inconsistency, final byte[] originalQualScores, final byte[] refBases, final boolean setBaseN) { - - final boolean negStrand = read.getReadNegativeStrandFlag(); - for (int iii = 1; iii < originalQualScores.length; iii++) { - if (inconsistency[iii] == 1) { - if (readBases[iii] == refBases[iii]) { - if (negStrand) { - originalQualScores[originalQualScores.length - (iii + 1)] = (byte) 0; - } - else { - originalQualScores[iii] = (byte) 0; - } - if (setBaseN) { - readBases[iii] = (byte) 'N'; - } - } - // Set the prev base to Q0 as well - if (readBases[iii - 1] == refBases[iii - 1]) { - if (negStrand) { - originalQualScores[originalQualScores.length - iii] = (byte) 0; - } - else { - originalQualScores[iii - 1] = (byte) 0; - } - if (setBaseN) { - readBases[iii - 1] = (byte) 'N'; - } - } - } - } - if (negStrand) { - readBases = BaseUtils.simpleReverseComplement(readBases.clone()); // Put the bases back in reverse order to stuff them back in the read - } - read.setReadBases(readBases); - - return originalQualScores; - } - - /** - * Peform the REMOVE_REF_BIAS solid recalibration. 
Look at the color space qualities and probabilistically decide if the base should be change to match the color or left as reference - * - * @param read The SAMRecord to recalibrate - * @param readBases The bases in the read which have been RC'd if necessary - * @param inconsistency The array of 1/0 that says if this base is inconsistent with its color - * @param colorImpliedBases The bases implied by the color space, RC'd if necessary - * @param refBases The reference which has been RC'd if necessary - */ - private static void solidRecalRemoveRefBias(final GATKSAMRecord read, byte[] readBases, final int[] inconsistency, final byte[] colorImpliedBases, final byte[] refBases) { - - final Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG); - if (attr != null) { - byte[] colorSpaceQuals; - if (attr instanceof String) { - String x = (String) attr; - colorSpaceQuals = x.getBytes(); - SAMUtils.fastqToPhred(colorSpaceQuals); - } - else { - throw new ReviewedStingException(String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG, read.getReadName())); - } - - for (int iii = 1; iii < inconsistency.length - 1; iii++) { - if (inconsistency[iii] == 1) { - for (int jjj = iii - 1; jjj <= iii; jjj++) { // Correct this base and the one before it along the direction of the read - if (jjj == iii || inconsistency[jjj] == 0) { // Don't want to correct the previous base a second time if it was already corrected in the previous step - if (readBases[jjj] == refBases[jjj]) { - if (colorSpaceQuals[jjj] == colorSpaceQuals[jjj + 1]) { // Equal evidence for the color implied base and the reference base, so flip a coin - final int rand = GenomeAnalysisEngine.getRandomGenerator().nextInt(2); - if (rand == 0) { // The color implied base won the coin flip - readBases[jjj] = colorImpliedBases[jjj]; - } - } - else { - final int maxQuality = Math.max((int) colorSpaceQuals[jjj], (int) colorSpaceQuals[jjj + 1]); - final int 
minQuality = Math.min((int) colorSpaceQuals[jjj], (int) colorSpaceQuals[jjj + 1]); - int diffInQuality = maxQuality - minQuality; - int numLow = minQuality; - if (numLow == 0) { - numLow++; - diffInQuality++; - } - final int numHigh = Math.round(numLow * (float) Math.pow(10.0f, (float) diffInQuality / 10.0f)); // The color with higher quality is exponentially more likely - final int rand = GenomeAnalysisEngine.getRandomGenerator().nextInt(numLow + numHigh); - if (rand >= numLow) { // higher q score won - if (maxQuality == (int) colorSpaceQuals[jjj]) { - readBases[jjj] = colorImpliedBases[jjj]; - } // else ref color had higher q score, and won out, so nothing to do here - } - else { // lower q score won - if (minQuality == (int) colorSpaceQuals[jjj]) { - readBases[jjj] = colorImpliedBases[jjj]; - } // else ref color had lower q score, and won out, so nothing to do here - } - } - } - } - } - } - } - - if (read.getReadNegativeStrandFlag()) { - readBases = BaseUtils.simpleReverseComplement(readBases.clone()); // Put the bases back in reverse order to stuff them back in the read - } - read.setReadBases(readBases); - } - else { // No color space quality tag in file - throw new UserException.MalformedBAM(read, "REMOVE_REF_BIAS recal mode requires color space qualities but they can't be found for read: " + read.getReadName()); - } + return false; } /** * Given the base and the color calculate the next base in the sequence * + * @param read the read * @param prevBase The base * @param color The color * @return The next base in the sequence @@ -578,16 +566,16 @@ public class RecalDataManager { * @param offset The offset in the read at which to check * @return Returns true if the base was inconsistent with the color space */ - public static boolean isInconsistentColorSpace(final GATKSAMRecord read, final int offset) { + public static boolean isColorSpaceConsistent(final GATKSAMRecord read, final int offset) { final Object attr = 
read.getAttribute(RecalDataManager.COLOR_SPACE_INCONSISTENCY_TAG); if (attr != null) { final byte[] inconsistency = (byte[]) attr; // NOTE: The inconsistency array is in the direction of the read, not aligned to the reference! if (read.getReadNegativeStrandFlag()) { // Negative direction - return inconsistency[inconsistency.length - offset - 1] != (byte) 0; + return inconsistency[inconsistency.length - offset - 1] == (byte) 0; } else { // Forward direction - return inconsistency[offset] != (byte) 0; + return inconsistency[offset] == (byte) 0; } // This block of code is for if you want to check both the offset and the next base for color space inconsistency @@ -607,7 +595,7 @@ public class RecalDataManager { } else { // No inconsistency array, so nothing is inconsistent - return false; + return true; } } @@ -615,22 +603,24 @@ public class RecalDataManager { * Computes all requested covariates for every offset in the given read * by calling covariate.getValues(..). * + * It populates an array of covariate values where result[i][j] is the covariate + * value for the ith position in the read and the jth covariate in + * reqeustedCovariates list. + * * @param read The read for which to compute covariate values. * @param requestedCovariates The list of requested covariates. - * @return An array of covariate values where result[i][j] is the covariate - * value for the ith position in the read and the jth covariate in - * reqeustedCovariates list. 
+ * @return a matrix with all the covariates calculated for every base in the read */ - public static void computeCovariates(final GATKSAMRecord read, final List requestedCovariates) { + public static ReadCovariates computeCovariates(final GATKSAMRecord read, final List requestedCovariates) { final int numRequestedCovariates = requestedCovariates.size(); final int readLength = read.getReadLength(); - final CovariateKeySet covariateKeySet = new CovariateKeySet(readLength, numRequestedCovariates); + final ReadCovariates readCovariates = new ReadCovariates(readLength, numRequestedCovariates); // Loop through the list of requested covariates and compute the values of each covariate for all positions in this read for (Covariate covariate : requestedCovariates) - covariateKeySet.addCovariate(covariate.getValues(read)); + readCovariates.addCovariate(covariate.getValues(read)); - read.setTemporaryAttribute(COVARS_ATTRIBUTE, covariateKeySet); + return readCovariates; } /** @@ -707,4 +697,42 @@ public class RecalDataManager { return base; } } + + + /** + * Adds the required covariates to a covariate list + * + * Note: this method really only checks if the classes object has the expected number of required covariates, then add them by hand. + * + * @param classes list of classes to add to the covariate list + * @return the covariate list + */ + private static ArrayList addRequiredCovariatesToList(List> classes) { + ArrayList dest = new ArrayList(classes.size()); + if (classes.size() != 2) + throw new ReviewedStingException("The number of required covariates has changed, this is a hard change in the code and needs to be inspected"); + + dest.add(new ReadGroupCovariate()); // enforce the order with RG first and QS next. 
+ dest.add(new QualityScoreCovariate()); + return dest; + } + + /** + * Adds the standard covariates to a covariate list + * + * @param classes list of classes to add to the covariate list + * @return the covariate list + */ + private static ArrayList addStandardCovariatesToList(List> classes) { + ArrayList dest = new ArrayList(classes.size()); + for (Class covClass : classes) { + try { + final Covariate covariate = (Covariate) covClass.newInstance(); + dest.add(covariate); + } catch (Exception e) { + throw new DynamicClassResolutionException(covClass, e); + } + } + return dest; + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDatum.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDatum.java index 91f865180..3eb3a3981 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDatum.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDatum.java @@ -25,6 +25,10 @@ package org.broadinstitute.sting.gatk.walkers.bqsr; * OTHER DEALINGS IN THE SOFTWARE. */ +import org.broadinstitute.sting.utils.MathUtils; + +import java.util.Random; + /** * Created by IntelliJ IDEA. * User: rpoplin @@ -33,10 +37,11 @@ package org.broadinstitute.sting.gatk.walkers.bqsr; * An individual piece of recalibration data. Each bin counts up the number of observations and the number of reference mismatches seen for that combination of covariates. 
*/ -public class RecalDatum extends RecalDatumOptimized { +public class RecalDatum extends Datum { + + private double estimatedQReported; // estimated reported quality score based on combined data's individual q-reporteds and number of observations + private double empiricalQuality; // the empirical quality for datums that have been collapsed together (by read group and reported quality, for example) - private double estimatedQReported; // estimated reported quality score based on combined data's individual q-reporteds and number of observations - private double empiricalQuality; // the empirical quality for datums that have been collapsed together (by read group and reported quality, for example) //--------------------------------------------------------------------------------------------------------------- // @@ -48,7 +53,7 @@ public class RecalDatum extends RecalDatumOptimized { numObservations = 0L; numMismatches = 0L; estimatedQReported = 0.0; - empiricalQuality = 0.0; + empiricalQuality = -1.0; } public RecalDatum(final long _numObservations, final long _numMismatches, final double _estimatedQReported, final double _empiricalQuality) { @@ -65,48 +70,81 @@ public class RecalDatum extends RecalDatumOptimized { this.empiricalQuality = copy.empiricalQuality; } - //--------------------------------------------------------------------------------------------------------------- - // - // increment methods - // - //--------------------------------------------------------------------------------------------------------------- - - public final void combine(final RecalDatum other) { + public void combine(final RecalDatum other) { final double sumErrors = this.calcExpectedErrors() + other.calcExpectedErrors(); this.increment(other.numObservations, other.numMismatches); - this.estimatedQReported = -10 * Math.log10(sumErrors / (double) this.numObservations); - //if( this.estimatedQReported > QualityUtils.MAX_REASONABLE_Q_SCORE ) { this.estimatedQReported = 
QualityUtils.MAX_REASONABLE_Q_SCORE; } + this.estimatedQReported = -10 * Math.log10(sumErrors / this.numObservations); + this.empiricalQuality = -1.0; // reset the empirical quality calculation so we never have a wrongly calculated empirical quality stored } - //--------------------------------------------------------------------------------------------------------------- - // - // methods to derive empirical quality score - // - //--------------------------------------------------------------------------------------------------------------- - - public final void calcCombinedEmpiricalQuality(final int smoothing, final int maxQual) { - this.empiricalQuality = empiricalQualDouble(smoothing, maxQual); // cache the value so we don't call log over and over again + public final void calcCombinedEmpiricalQuality() { + this.empiricalQuality = empiricalQualDouble(); // cache the value so we don't call log over and over again + } + + public final void calcEstimatedReportedQuality() { + this.estimatedQReported = -10 * Math.log10(calcExpectedErrors() / numObservations); } - - //--------------------------------------------------------------------------------------------------------------- - // - // misc. methods - // - //--------------------------------------------------------------------------------------------------------------- public final double getEstimatedQReported() { return estimatedQReported; } public final double getEmpiricalQuality() { + if (empiricalQuality < 0) + calcCombinedEmpiricalQuality(); return empiricalQuality; } - private double calcExpectedErrors() { + /** + * Makes a hard copy of the recal datum element + * + * @return a new recal datum object with the same contents of this datum. 
+ */ + public RecalDatum copy() { + return new RecalDatum(numObservations, numMismatches, estimatedQReported, empiricalQuality); + } + + @Override + public String toString() { + return String.format("%d,%d,%d", numObservations, numMismatches, (byte) Math.floor(getEmpiricalQuality())); + } + + public String stringForCSV() { + return String.format("%s,%d,%.2f", toString(), (byte) Math.floor(getEstimatedQReported()), getEmpiricalQuality() - getEstimatedQReported()); + } + + + private double calcExpectedErrors() { return (double) this.numObservations * qualToErrorProb(estimatedQReported); } private double qualToErrorProb(final double qual) { return Math.pow(10.0, qual / -10.0); } + + public static RecalDatum createRandomRecalDatum(int maxObservations, int maxErrors) { + Random random = new Random(); + int nObservations = random.nextInt(maxObservations); + int nErrors = random.nextInt(maxErrors); + Datum datum = new Datum(nObservations, nErrors); + double empiricalQuality = datum.empiricalQualDouble(); + double estimatedQReported = empiricalQuality + ((10 * random.nextDouble()) - 5); // empirical quality +/- 5. + return new RecalDatum(nObservations, nErrors, estimatedQReported, empiricalQuality); + } + + /** + * We don't compare the estimated quality reported because it may be different when read from + * report tables. + * + * @param o the other recal datum + * @return true if the two recal datums have the same number of observations, errors and empirical quality. 
+ */ + @Override + public boolean equals(Object o) { + if (!(o instanceof RecalDatum)) + return false; + RecalDatum other = (RecalDatum) o; + return super.equals(o) && + MathUtils.compareDoubles(this.empiricalQuality, other.empiricalQuality, 0.001) == 0; + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java index cc6f67cc9..598312916 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java @@ -27,10 +27,10 @@ package org.broadinstitute.sting.gatk.walkers.bqsr; import org.broad.tribble.Feature; import org.broadinstitute.sting.commandline.*; -import org.broadinstitute.sting.gatk.walkers.recalibration.CountCovariatesGatherer; +import org.broadinstitute.sting.gatk.report.GATKReportTable; +import org.broadinstitute.sting.utils.Utils; -import java.io.PrintStream; -import java.util.ArrayList; +import java.io.File; import java.util.Collections; import java.util.List; @@ -52,7 +52,7 @@ public class RecalibrationArgumentCollection { * Please note however that the statistics reported by the tool will not accurately reflected those sites skipped by the -XL argument. */ @Input(fullName = "knownSites", shortName = "knownSites", doc = "A database of known polymorphic sites to skip over in the recalibration algorithm", required = false) - protected List> knownSites = Collections.emptyList(); + public List> knownSites = Collections.emptyList(); /** * After the header, data records occur one per line until the end of the file. 
The first several items on a line are the @@ -60,27 +60,27 @@ public class RecalibrationArgumentCollection { * three items are the data- that is, number of observations for this combination of covariates, number of reference mismatches, * and the raw empirical quality score calculated by phred-scaling the mismatch rate. */ - @Gather(CountCovariatesGatherer.class) + @Gather(BQSRGatherer.class) @Output - protected PrintStream RECAL_FILE; + public File RECAL_FILE; /** * List all implemented covariates. */ @Argument(fullName = "list", shortName = "ls", doc = "List the available covariates and exit", required = false) - protected boolean LIST_ONLY = false; + public boolean LIST_ONLY = false; /** * Covariates to be used in the recalibration. Each covariate is given as a separate cov parameter. ReadGroup and ReportedQuality are required covariates and are already added for you. See the list of covariates with -list. */ @Argument(fullName = "covariate", shortName = "cov", doc = "Covariates to be used in the recalibration. Each covariate is given as a separate cov parameter. 
ReadGroup and ReportedQuality are required covariates and are already added for you.", required = false) - protected String[] COVARIATES = null; + public String[] COVARIATES = null; /* * Use the standard set of covariates in addition to the ones listed using the -cov argument */ @Argument(fullName = "standard_covs", shortName = "standard", doc = "Use the standard set of covariates in addition to the ones listed using the -cov argument", required = false) - protected boolean USE_STANDARD_COVARIATES = true; + public boolean USE_STANDARD_COVARIATES = true; ///////////////////////////// // Debugging-only Arguments @@ -90,17 +90,7 @@ public class RecalibrationArgumentCollection { */ @Hidden @Argument(fullName = "run_without_dbsnp_potentially_ruining_quality", shortName = "run_without_dbsnp_potentially_ruining_quality", required = false, doc = "If specified, allows the recalibrator to be used without a dbsnp rod. Very unsafe and for expert users only.") - protected boolean RUN_WITHOUT_DBSNP = false; - - ///////////////////////////// - // protected Member Variables - ///////////////////////////// - protected final RecalDataManager dataManager = new RecalDataManager(); // Holds the data HashMap used to create collapsed data hashmaps (delta delta tables) - protected final ArrayList requestedCovariates = new ArrayList();// A list to hold the covariate objects that were requested - - protected final String SKIP_RECORD_ATTRIBUTE = "SKIP"; // used to label reads that should be skipped. - protected final String SEEN_ATTRIBUTE = "SEEN"; // used to label reads as processed. 
- + public boolean RUN_WITHOUT_DBSNP = false; /** * CountCovariates and TableRecalibration accept a --solid_recal_mode flag which governs how the recalibrator handles the @@ -153,6 +143,19 @@ public class RecalibrationArgumentCollection { @Argument(fullName = "deletions_default_quality", shortName = "ddq", doc = "default quality for the base deletions covariate", required = false) public byte DELETIONS_DEFAULT_QUALITY = 45; + /** + * Reads with low quality bases on either tail (beginning or end) will not be considered in the context. This parameter defines the quality below which (inclusive) a tail is considered low quality + */ + @Argument(fullName = "low_quality_tail", shortName = "lqt", doc = "minimum quality for the bases in the tail of the reads to be considered", required = false) + public byte LOW_QUAL_TAIL = 2; + + /** + * BQSR generates a quantization table for quick quantization later by subsequent tools. BQSR does not quantize the base qualities, this is done by the engine with the -qq or -BQSR options. + * This parameter tells BQSR the number of levels of quantization to use to build the quantization table. + */ + @Argument(fullName = "quantizing_levels", shortName = "ql", required = false, doc = "number of distinct quality scores in the quantized output") + public int QUANTIZING_LEVELS = 16; + @Hidden @Argument(fullName = "default_platform", shortName = "dP", required = false, doc = "If a read has no platform then default to the provided String. Valid options are illumina, 454, and solid.") @@ -160,6 +163,37 @@ public class RecalibrationArgumentCollection { @Hidden @Argument(fullName = "force_platform", shortName = "fP", required = false, doc = "If provided, the platform of EVERY read will be forced to be the provided String. 
Valid options are illumina, 454, and solid.") public String FORCE_PLATFORM = null; + @Hidden + @Argument(fullName = "keep_intermediate_files", shortName = "k", required = false, doc ="does not remove the temporary csv file created to generate the plots") + public boolean KEEP_INTERMEDIATE_FILES = false; + @Hidden + @Argument(fullName = "no_plots", shortName = "np", required = false, doc = "does not generate any plots -- useful for queue scatter/gathering") + public boolean NO_PLOTS = false; + public File recalibrationReport = null; + + public GATKReportTable generateReportTable() { + GATKReportTable argumentsTable = new GATKReportTable("Arguments", "Recalibration argument collection values used in this run"); + argumentsTable.addPrimaryKey("Argument"); + argumentsTable.addColumn(RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, "null"); + argumentsTable.set("covariate", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, (COVARIATES == null) ? "null" : Utils.join(",", COVARIATES)); + argumentsTable.set("standard_covs", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, USE_STANDARD_COVARIATES); + argumentsTable.set("run_without_dbsnp", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, RUN_WITHOUT_DBSNP); + argumentsTable.set("solid_recal_mode", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, SOLID_RECAL_MODE); + argumentsTable.set("solid_nocall_strategy", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, SOLID_NOCALL_STRATEGY); + argumentsTable.set("mismatches_context_size", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, MISMATCHES_CONTEXT_SIZE); + argumentsTable.set("insertions_context_size", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, INSERTIONS_CONTEXT_SIZE); + argumentsTable.set("deletions_context_size", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, DELETIONS_CONTEXT_SIZE); + argumentsTable.set("mismatches_default_quality", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, MISMATCHES_DEFAULT_QUALITY); + argumentsTable.set("insertions_default_quality", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, 
INSERTIONS_DEFAULT_QUALITY); + argumentsTable.set("low_quality_tail", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, LOW_QUAL_TAIL); + argumentsTable.set("default_platform", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, DEFAULT_PLATFORM); + argumentsTable.set("force_platform", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, FORCE_PLATFORM); + argumentsTable.set("quantizing_levels", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, QUANTIZING_LEVELS); + argumentsTable.set("keep_intermediate_files", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, KEEP_INTERMEDIATE_FILES); + argumentsTable.set("no_plots", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, NO_PLOTS); + argumentsTable.set("recalibration_report", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, recalibrationReport == null ? "null" : recalibrationReport.getAbsolutePath()); + return argumentsTable; + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationReport.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationReport.java new file mode 100644 index 000000000..febbc1280 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationReport.java @@ -0,0 +1,359 @@ +package org.broadinstitute.sting.gatk.walkers.bqsr; + +import org.broadinstitute.sting.gatk.report.GATKReport; +import org.broadinstitute.sting.gatk.report.GATKReportTable; +import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.io.File; +import java.io.PrintStream; +import java.util.*; + +/** + * This class has all the static functionality for reading a recalibration report file into memory. 
+ * + * @author carneiro + * @since 3/26/12 + */ +public class RecalibrationReport { + private QuantizationInfo quantizationInfo; // histogram containing the counts for qual quantization (calculated after recalibration is done) + private final LinkedHashMap> keysAndTablesMap; // quick access reference to the read group table and its key manager + private final ArrayList requestedCovariates = new ArrayList(); // list of all covariates to be used in this calculation + + private final GATKReportTable argumentTable; // keep the argument table untouched just for output purposes + private final RecalibrationArgumentCollection RAC; // necessary for quantizing qualities with the same parameter + + public RecalibrationReport(final File RECAL_FILE) { + GATKReport report = new GATKReport(RECAL_FILE); + + argumentTable = report.getTable(RecalDataManager.ARGUMENT_REPORT_TABLE_TITLE); + RAC = initializeArgumentCollectionTable(argumentTable); + + GATKReportTable quantizedTable = report.getTable(RecalDataManager.QUANTIZED_REPORT_TABLE_TITLE); + quantizationInfo = initializeQuantizationTable(quantizedTable); + + Pair, ArrayList> covariates = RecalDataManager.initializeCovariates(RAC); // initialize the required and optional covariates + ArrayList requiredCovariates = covariates.getFirst(); + ArrayList optionalCovariates = covariates.getSecond(); + requestedCovariates.addAll(requiredCovariates); // add all required covariates to the list of requested covariates + requestedCovariates.addAll(optionalCovariates); // add all optional covariates to the list of requested covariates + + for (Covariate cov : requestedCovariates) + cov.initialize(RAC); // initialize any covariate member variables using the shared argument collection + + keysAndTablesMap = new LinkedHashMap>(); + ArrayList requiredCovariatesToAdd = new ArrayList(requiredCovariates.size()); // incrementally add the covariates to create the recal tables with 1, 2 and 3 covariates. 
+ ArrayList optionalCovariatesToAdd = new ArrayList(); // initialize an empty array of optional covariates to create the first few tables + for (Covariate covariate : requiredCovariates) { + requiredCovariatesToAdd.add(covariate); + final Map table; // initializing a new recal table for each required covariate (cumulatively) + final BQSRKeyManager keyManager = new BQSRKeyManager(requiredCovariatesToAdd, optionalCovariatesToAdd); // initializing its corresponding key manager + + int nRequiredCovariates = requiredCovariatesToAdd.size(); // the number of required covariates defines which table we are looking at (RG, QUAL or ALL_COVARIATES) + final String UNRECOGNIZED_REPORT_TABLE_EXCEPTION = "Unrecognized table. Did you add an extra required covariate? This is a hard check."; + if (nRequiredCovariates == 1) { // if there is only one required covariate, this is the read group table + final GATKReportTable reportTable = report.getTable(RecalDataManager.READGROUP_REPORT_TABLE_TITLE); + table = parseReadGroupTable(keyManager, reportTable); + } + else if (nRequiredCovariates == 2 && optionalCovariatesToAdd.isEmpty()) { // when we have both required covariates and no optional covariates we're at the QUAL table + final GATKReportTable reportTable = report.getTable(RecalDataManager.QUALITY_SCORE_REPORT_TABLE_TITLE); + table = parseQualityScoreTable(keyManager, reportTable); + } + else + throw new ReviewedStingException(UNRECOGNIZED_REPORT_TABLE_EXCEPTION); + + keysAndTablesMap.put(keyManager, table); // adding the pair key+table to the map + } + + + final BQSRKeyManager keyManager = new BQSRKeyManager(requiredCovariates, optionalCovariates); // initializing its corresponding key manager + final GATKReportTable reportTable = report.getTable(RecalDataManager.ALL_COVARIATES_REPORT_TABLE_TITLE); + final Map table = parseAllCovariatesTable(keyManager, reportTable); + keysAndTablesMap.put(keyManager, table); + } + + protected RecalibrationReport(QuantizationInfo quantizationInfo, 
LinkedHashMap> keysAndTablesMap, GATKReportTable argumentTable, RecalibrationArgumentCollection RAC) { + this.quantizationInfo = quantizationInfo; + this.keysAndTablesMap = keysAndTablesMap; + this.argumentTable = argumentTable; + this.RAC = RAC; + } + + /** + * Combines two recalibration reports by adding all observations and errors + * + * Note: This method DOES NOT recalculate the empirical qualities and quantized qualities. You have to recalculate + * them after combining. The reason for not calculating it is because this function is intended for combining a + * series of recalibration reports, and it only makes sense to calculate the empirical qualities and quantized + * qualities after all the recalibration reports have been combined. Having the user recalculate when appropriate, + * makes this method faster + * + * Note2: The empirical quality reported, however, is recalculated given its simplicity. + * + * @param other the recalibration report to combine with this one + */ + public void combine(RecalibrationReport other) { + Iterator>> thisIterator = keysAndTablesMap.entrySet().iterator(); + + for (Map.Entry> otherEntry : other.getKeysAndTablesMap().entrySet()) { + Map.Entry> thisEntry = thisIterator.next(); + + Map thisTable = thisEntry.getValue(); + BQSRKeyManager thisKeyManager = thisEntry.getKey(); + BQSRKeyManager otherKeyManager = otherEntry.getKey(); + + for (Map.Entry otherTableEntry : otherEntry.getValue().entrySet()) { + RecalDatum otherDatum = otherTableEntry.getValue(); + BitSet otherBitKey = otherTableEntry.getKey(); + List otherObjectKey = otherKeyManager.keySetFrom(otherBitKey); + + BitSet thisBitKey = thisKeyManager.bitSetFromKey(otherObjectKey.toArray()); + RecalDatum thisDatum = thisTable.get(thisBitKey); + + if (thisDatum == null) + thisTable.put(thisBitKey, otherDatum); + else + thisDatum.combine(otherDatum); + } + } + } + + public QuantizationInfo getQuantizationInfo() { + return quantizationInfo; + } + + public LinkedHashMap> 
getKeysAndTablesMap() { + return keysAndTablesMap; + } + + public ArrayList getRequestedCovariates() { + return requestedCovariates; + } + + /** + * Compiles the list of keys for the Covariates table and uses the shared parsing utility to produce the actual table + * + * @param keyManager the key manager for this table + * @param reportTable the GATKReport table containing data for this table + * @return a lookup table indexed by bitsets containing the empirical quality and estimated quality reported for every key. + */ + private Map parseAllCovariatesTable(BQSRKeyManager keyManager, GATKReportTable reportTable) { + ArrayList columnNamesOrderedList = new ArrayList(5); + columnNamesOrderedList.add(RecalDataManager.READGROUP_COLUMN_NAME); + columnNamesOrderedList.add(RecalDataManager.QUALITY_SCORE_COLUMN_NAME); + columnNamesOrderedList.add(RecalDataManager.COVARIATE_VALUE_COLUMN_NAME); + columnNamesOrderedList.add(RecalDataManager.COVARIATE_NAME_COLUMN_NAME); + columnNamesOrderedList.add(RecalDataManager.EVENT_TYPE_COLUMN_NAME); + return genericRecalTableParsing(keyManager, reportTable, columnNamesOrderedList, false); + } + + /** + * + * Compiles the list of keys for the QualityScore table and uses the shared parsing utility to produce the actual table + * @param keyManager the key manager for this table + * @param reportTable the GATKReport table containing data for this table + * @return a lookup table indexed by bitsets containing the empirical quality and estimated quality reported for every key. 
+ */ + private Map parseQualityScoreTable(BQSRKeyManager keyManager, GATKReportTable reportTable) { + ArrayList columnNamesOrderedList = new ArrayList(3); + columnNamesOrderedList.add(RecalDataManager.READGROUP_COLUMN_NAME); + columnNamesOrderedList.add(RecalDataManager.QUALITY_SCORE_COLUMN_NAME); + columnNamesOrderedList.add(RecalDataManager.EVENT_TYPE_COLUMN_NAME); + return genericRecalTableParsing(keyManager, reportTable, columnNamesOrderedList, false); + } + + /** + * Compiles the list of keys for the ReadGroup table and uses the shared parsing utility to produce the actual table + * + * @param keyManager the key manager for this table + * @param reportTable the GATKReport table containing data for this table + * @return a lookup table indexed by bitsets containing the empirical quality and estimated quality reported for every key. + */ + private Map parseReadGroupTable(BQSRKeyManager keyManager, GATKReportTable reportTable) { + ArrayList columnNamesOrderedList = new ArrayList(2); + columnNamesOrderedList.add(RecalDataManager.READGROUP_COLUMN_NAME); + columnNamesOrderedList.add(RecalDataManager.EVENT_TYPE_COLUMN_NAME); + return genericRecalTableParsing(keyManager, reportTable, columnNamesOrderedList, true); + } + + /** + * Shared parsing functionality for all tables. + * + * @param keyManager the key manager for this table + * @param reportTable the GATKReport table containing data for this table + * @param columnNamesOrderedList a list of columns to read from the report table and build as key for this particular table + * @return a lookup table indexed by bitsets containing the empirical quality and estimated quality reported for every key. 
+ */ + private Map genericRecalTableParsing(BQSRKeyManager keyManager, GATKReportTable reportTable, ArrayList columnNamesOrderedList, boolean hasEstimatedQReportedColumn) { + Map result = new HashMap(reportTable.getNumRows()*2); + + for (Object primaryKey : reportTable.getPrimaryKeys()) { + int nKeys = columnNamesOrderedList.size(); + Object [] keySet = new Object[nKeys]; + for (int i = 0; i < nKeys; i++) + keySet[i] = reportTable.get(primaryKey, columnNamesOrderedList.get(i)); // all these objects are okay in String format, the key manager will handle them correctly (except for the event type, see below) + keySet[keySet.length-1] = EventType.eventFrom((String) keySet[keySet.length-1]); // the last key is always the event type. We convert the string ("M", "I" or "D") to an enum object (necessary for the key manager). + BitSet bitKey = keyManager.bitSetFromKey(keySet); + + long nObservations = (Long) reportTable.get(primaryKey, RecalDataManager.NUMBER_OBSERVATIONS_COLUMN_NAME); + long nErrors = (Long) reportTable.get(primaryKey, RecalDataManager.NUMBER_ERRORS_COLUMN_NAME); + double empiricalQuality = (Double) reportTable.get(primaryKey, RecalDataManager.EMPIRICAL_QUALITY_COLUMN_NAME); + + double estimatedQReported = hasEstimatedQReportedColumn ? 
// the estimatedQreported column only exists in the ReadGroup table + (Double) reportTable.get(primaryKey, RecalDataManager.ESTIMATED_Q_REPORTED_COLUMN_NAME) : // we get it if we are in the read group table + Byte.parseByte((String) reportTable.get(primaryKey, RecalDataManager.QUALITY_SCORE_COLUMN_NAME)); // or we use the reported quality if we are in any other table + + RecalDatum recalDatum = new RecalDatum(nObservations, nErrors, estimatedQReported, empiricalQuality); + result.put(bitKey, recalDatum); + } + return result; + } + + /** + * Parses the quantization table from the GATK Report and turns it into a map of original => quantized quality scores + * + * @param table the GATKReportTable containing the quantization mappings + * @return an ArrayList with the quantization mappings from 0 to MAX_QUAL_SCORE + */ + private QuantizationInfo initializeQuantizationTable(GATKReportTable table) { + Byte[] quals = new Byte[QualityUtils.MAX_QUAL_SCORE + 1]; + Long[] counts = new Long[QualityUtils.MAX_QUAL_SCORE + 1]; + for (Object primaryKey : table.getPrimaryKeys()) { + Object quantizedObject = table.get(primaryKey, RecalDataManager.QUANTIZED_VALUE_COLUMN_NAME); + Object countObject = table.get(primaryKey, RecalDataManager.QUANTIZED_COUNT_COLUMN_NAME); + byte originalQual = Byte.parseByte(primaryKey.toString()); + byte quantizedQual = Byte.parseByte(quantizedObject.toString()); + long quantizedCount = Long.parseLong(countObject.toString()); + quals[originalQual] = quantizedQual; + counts[originalQual] = quantizedCount; + } + return new QuantizationInfo(Arrays.asList(quals), Arrays.asList(counts)); + } + + /** + * Parses the arguments table from the GATK Report and creates a RAC object with the proper initialization values + * + * @param table the GATKReportTable containing the arguments and its corresponding values + * @return a RAC object properly initialized with all the objects in the table + */ + private RecalibrationArgumentCollection 
initializeArgumentCollectionTable(GATKReportTable table) { + RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection(); + + for (Object primaryKey : table.getPrimaryKeys()) { + Object value = table.get(primaryKey, RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME); + if (value.equals("null")) + value = null; // generic translation of null values that were printed out as strings | todo -- add this capability to the GATKReport + + if (primaryKey.equals("covariate") && value != null) + RAC.COVARIATES = value.toString().split(","); + + else if (primaryKey.equals("standard_covs")) + RAC.USE_STANDARD_COVARIATES = Boolean.parseBoolean((String) value); + + else if (primaryKey.equals("solid_recal_mode")) + RAC.SOLID_RECAL_MODE = RecalDataManager.SOLID_RECAL_MODE.recalModeFromString((String) value); + + else if (primaryKey.equals("solid_nocall_strategy")) + RAC.SOLID_NOCALL_STRATEGY = RecalDataManager.SOLID_NOCALL_STRATEGY.nocallStrategyFromString((String) value); + + else if (primaryKey.equals("mismatches_context_size")) + RAC.MISMATCHES_CONTEXT_SIZE = Integer.parseInt((String) value); + + else if (primaryKey.equals("insertions_context_size")) + RAC.INSERTIONS_CONTEXT_SIZE = Integer.parseInt((String) value); + + else if (primaryKey.equals("deletions_context_size")) + RAC.DELETIONS_CONTEXT_SIZE = Integer.parseInt((String) value); + + else if (primaryKey.equals("mismatches_default_quality")) + RAC.MISMATCHES_DEFAULT_QUALITY = Byte.parseByte((String) value); + + else if (primaryKey.equals("insertions_default_quality")) + RAC.INSERTIONS_DEFAULT_QUALITY = Byte.parseByte((String) value); + + else if (primaryKey.equals("deletions_default_quality")) + RAC.DELETIONS_DEFAULT_QUALITY = Byte.parseByte((String) value); + + else if (primaryKey.equals("low_quality_tail")) + RAC.LOW_QUAL_TAIL = Byte.parseByte((String) value); + + else if (primaryKey.equals("default_platform")) + RAC.DEFAULT_PLATFORM = (String) value; + + else if (primaryKey.equals("force_platform")) + 
RAC.FORCE_PLATFORM = (String) value; + + else if (primaryKey.equals("quantizing_levels")) + RAC.QUANTIZING_LEVELS = Integer.parseInt((String) value); + + else if (primaryKey.equals("keep_intermediate_files")) + RAC.KEEP_INTERMEDIATE_FILES = Boolean.parseBoolean((String) value); + + else if (primaryKey.equals("no_plots")) + RAC.NO_PLOTS = Boolean.parseBoolean((String) value); + + else if (primaryKey.equals("recalibration_report")) + RAC.recalibrationReport = (value == null) ? null : new File((String) value); + } + + return RAC; + } + + /** + * this functionality avoids recalculating the empirical qualities, estimated reported quality + * and quantization of the quality scores during every call of combine(). Very useful for the BQSRGatherer. + */ + public void calculateEmpiricalAndQuantizedQualities() { + for (Map table : keysAndTablesMap.values()) + for (RecalDatum datum : table.values()) + datum.calcCombinedEmpiricalQuality(); + + quantizationInfo = new QuantizationInfo(keysAndTablesMap, RAC.QUANTIZING_LEVELS); + } + + public void output(PrintStream output) { + RecalDataManager.outputRecalibrationReport(argumentTable, quantizationInfo, keysAndTablesMap, output); + } + + public RecalibrationArgumentCollection getRAC() { + return RAC; + } + + @Override + public boolean equals(Object o) { + if (!(o instanceof RecalibrationReport)) + return false; + RecalibrationReport other = (RecalibrationReport) o; + if (this == o) + return true; + return isEqualTable(this.keysAndTablesMap, other.keysAndTablesMap); + } + + private boolean isEqualTable(LinkedHashMap> t1, LinkedHashMap> t2) { + if (t1.size() != t2.size()) + return false; + + Iterator>> t1Iterator = t1.entrySet().iterator(); + Iterator>> t2Iterator = t2.entrySet().iterator(); + + while (t1Iterator.hasNext() && t2Iterator.hasNext()) { + Map.Entry> t1MapEntry = t1Iterator.next(); + Map.Entry> t2MapEntry = t2Iterator.next(); + + if (!(t1MapEntry.getKey().equals(t2MapEntry.getKey()))) + return false; + + Map table2 = 
t2MapEntry.getValue(); + for (Map.Entry t1TableEntry : t1MapEntry.getValue().entrySet()) { + BitSet t1Key = t1TableEntry.getKey(); + if (!table2.containsKey(t1Key)) + return false; + RecalDatum t1Datum = t1TableEntry.getValue(); + if (!t1Datum.equals(table2.get(t1Key))) + return false; + } + } + return true; + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalker.java index 1dfc6fea0..2a8940de0 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalker.java @@ -1,23 +1,25 @@ /* - * Copyright (c) 2009 The Broad Institute - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * * OTHER DEALINGS IN THE SOFTWARE. 
+ * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. */ package org.broadinstitute.sting.gatk.walkers.coverage; @@ -42,40 +44,40 @@ import java.io.PrintStream; /** * Emits a data file containing information about callable, uncallable, poorly mapped, and other parts of the genome - * + *

*

* A very common question about a NGS set of reads is what areas of the genome are considered callable. The system * considers the coverage at each locus and emits either a per base state or a summary interval BED file that * partitions the genomic intervals into the following callable states: *

- *
REF_N
- *
the reference base was an N, which is not considered callable the GATK
- *
CALLABLE
- *
the base satisfied the min. depth for calling but had less than maxDepth to avoid having EXCESSIVE_COVERAGE
- *
NO_COVERAGE
- *
absolutely no reads were seen at this locus, regardless of the filtering parameters
- *
LOW_COVERAGE
- *
there were less than min. depth bases at the locus, after applying filters
- *
EXCESSIVE_COVERAGE
- *
more than -maxDepth read at the locus, indicating some sort of mapping problem
- *
POOR_MAPPING_QUALITY
- *
more than --maxFractionOfReadsWithLowMAPQ at the locus, indicating a poor mapping quality of the reads
+ *
REF_N
+ *
the reference base was an N, which is not considered callable by the GATK
+ *
PASS
+ *
the base satisfied the min. depth for calling but had less than maxDepth to avoid having EXCESSIVE_COVERAGE
+ *
NO_COVERAGE
+ *
absolutely no reads were seen at this locus, regardless of the filtering parameters
+ *
LOW_COVERAGE
+ *
there were less than min. depth bases at the locus, after applying filters
+ *
EXCESSIVE_COVERAGE
+ *
more than -maxDepth reads at the locus, indicating some sort of mapping problem
+ *
POOR_MAPPING_QUALITY
+ *
more than --maxFractionOfReadsWithLowMAPQ at the locus, indicating a poor mapping quality of the reads
*
*

- * + *

*

Input

*

- * A BAM file containing exactly one sample. + * A BAM file containing exactly one sample. *

- * + *

*

Output

*

*

    - *
  • -o: a OutputFormatted (recommended BED) file with the callable status covering each base
  • - *
  • -summary: a table of callable status x count of all examined bases
  • + *
  • -o: a OutputFormatted (recommended BED) file with the callable status covering each base
  • + *
  • -summary: a table of callable status x count of all examined bases
  • *
*

- * + *

*

Examples

*
  *     -T CallableLociWalker \
@@ -83,31 +85,31 @@ import java.io.PrintStream;
  *     -summary my.summary \
  *     -o my.bed
  * 
- * + *

* would produce a BED file (my.bed) that looks like: - * + *

*

- *     20 10000000 10000864 CALLABLE
+ *     20 10000000 10000864 PASS
  *     20 10000865 10000985 POOR_MAPPING_QUALITY
- *     20 10000986 10001138 CALLABLE
+ *     20 10000986 10001138 PASS
  *     20 10001139 10001254 POOR_MAPPING_QUALITY
- *     20 10001255 10012255 CALLABLE
+ *     20 10001255 10012255 PASS
  *     20 10012256 10012259 POOR_MAPPING_QUALITY
- *     20 10012260 10012263 CALLABLE
+ *     20 10012260 10012263 PASS
  *     20 10012264 10012328 POOR_MAPPING_QUALITY
- *     20 10012329 10012550 CALLABLE
+ *     20 10012329 10012550 PASS
  *     20 10012551 10012551 LOW_COVERAGE
- *     20 10012552 10012554 CALLABLE
+ *     20 10012552 10012554 PASS
  *     20 10012555 10012557 LOW_COVERAGE
- *     20 10012558 10012558 CALLABLE
+ *     20 10012558 10012558 PASS
  *     et cetera...
  * 
* as well as a summary table that looks like: - * + *

*

  *                        state nBases
  *                        REF_N 0
- *                     CALLABLE 996046
+ *                     PASS 996046
  *                  NO_COVERAGE 121
  *                 LOW_COVERAGE 928
  *           EXCESSIVE_COVERAGE 0
@@ -139,21 +141,21 @@ public class CallableLociWalker extends LocusWalker minMappingQuality are treated as usable for variation detection, contributing to the CALLABLE
+     * Reads with MAPQ > minMappingQuality are treated as usable for variation detection, contributing to the PASS
      * state.
      */
     @Argument(fullName = "minMappingQuality", shortName = "mmq", doc = "Minimum mapping quality of reads to count towards depth.", required = false)
     byte minMappingQuality = 10;
 
     /**
-     * Bases with less than minBaseQuality are viewed as not sufficiently high quality to contribute to the CALLABLE state
+     * Bases with less than minBaseQuality are viewed as not sufficiently high quality to contribute to the PASS state
      */
     @Argument(fullName = "minBaseQuality", shortName = "mbq", doc = "Minimum quality of bases to count towards depth.", required = false)
     byte minBaseQuality = 20;
 
     /**
      * If the number of QC+ bases (on reads with MAPQ > minMappingQuality and with base quality > minBaseQuality) exceeds this
-     * value and is less than maxDepth the site is considered CALLABLE.
+     * value and is less than maxDepth the site is considered PASS.
      */
     @Advanced
     @Argument(fullName = "minDepth", shortName = "minDepth", doc = "Minimum QC+ read depth before a locus is considered callable", required = false)
@@ -191,7 +193,7 @@ public class CallableLociWalker extends LocusWalker= minMappingQuality && ( e.getQual() >= minBaseQuality || e.isDeletion() ) ) {
+                if (e.getMappingQual() >= minMappingQuality && (e.getQual() >= minBaseQuality || e.isDeletion())) {
                     QCDepth++;
                 }
             }
 
             //System.out.printf("%s rawdepth = %d QCDepth = %d lowMAPQ = %d%n", context.getLocation(), rawDepth, QCDepth, lowMAPQDepth);
-            if ( rawDepth == 0 ) {
+            if (rawDepth == 0) {
                 state = CalledState.NO_COVERAGE;
-            } else if ( rawDepth >= minDepthLowMAPQ && MathUtils.ratio( lowMAPQDepth, rawDepth ) >= maxLowMAPQFraction ) {
+            } else if (rawDepth >= minDepthLowMAPQ && MathUtils.ratio(lowMAPQDepth, rawDepth) >= maxLowMAPQFraction) {
                 state = CalledState.POOR_MAPPING_QUALITY;
-            } else if ( QCDepth < minDepth ) {
+            } else if (QCDepth < minDepth) {
                 state = CalledState.LOW_COVERAGE;
-            } else if ( rawDepth >= maxDepth && maxDepth != -1 ) {
+            } else if (rawDepth >= maxDepth && maxDepth != -1) {
                 state = CalledState.EXCESSIVE_COVERAGE;
             } else {
                 state = CalledState.CALLABLE;
             }
         }
 
-        return new CallableBaseState(getToolkit().getGenomeLocParser(),context.getLocation(), state);
+        return new CallableBaseState(getToolkit().getGenomeLocParser(), context.getLocation(), state);
     }
 
     @Override
@@ -328,15 +345,15 @@ public class CallableLociWalker extends LocusWalker upper - lower || lower < 1 ) {
-            throw new IllegalArgumentException("Illegal argument to calculateBinEndpoints; "+
-                    "lower bound must be at least 1, and number of bins may not exceed stop - start");
+            throw new UserException.BadInput("the start must be at least 1 and the number of bins may not exceed stop - start");
         }
 
         int[] binLeftEndpoints = new int[bins+1];
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/GCContentByIntervalWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/GCContentByIntervalWalker.java
index 17b17764b..124be2eb4 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/GCContentByIntervalWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/GCContentByIntervalWalker.java
@@ -74,10 +74,6 @@ public class GCContentByIntervalWalker extends LocusWalker {
     public void initialize() {
     }
 
-    public boolean generateExtendedEvents() {
-        return false;
-    }
-
     public Long reduceInit() {
         return 0L;
     }
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ErrorRatePerCycle.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ErrorRatePerCycle.java
index e7a2f74e2..10ac523e6 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ErrorRatePerCycle.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ErrorRatePerCycle.java
@@ -91,6 +91,25 @@ public class ErrorRatePerCycle extends LocusWalker {
             this.cycle = cycle;
         }
 
+        // Must override hashCode and equals to properly work with GATKReportColumn
+        @Override
+        public int hashCode() {
+            return readGroup.hashCode() + 33 * cycle;
+        }
+
+        @Override
+        public boolean equals(final Object o) {
+            if (this == o) return true;
+            if (o == null || getClass() != o.getClass()) return false;
+
+            final TableKey oKey = (TableKey) o;
+
+            if ( cycle != oKey.cycle ) return false;
+            if ( !readGroup.equals(oKey.readGroup) ) return false;
+
+            return true;
+        }
+
         @Override
         public int compareTo(final TableKey tableKey) {
             final int scmp = readGroup.compareTo(tableKey.readGroup);
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/CallableStatus.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/CallableStatus.java
index 60f20074a..5d1685557 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/CallableStatus.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/CallableStatus.java
@@ -1,3 +1,27 @@
+/*
+ * Copyright (c) 2012, The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
 package org.broadinstitute.sting.gatk.walkers.diagnostics.targets;
 
 /**
@@ -7,16 +31,40 @@ package org.broadinstitute.sting.gatk.walkers.diagnostics.targets;
  * @since 2/1/12
  */
 public enum CallableStatus {
-    /** the reference base was an N, which is not considered callable the GATK */
-    REF_N,
-    /** the base satisfied the min. depth for calling but had less than maxDepth to avoid having EXCESSIVE_COVERAGE */
-    CALLABLE,
-    /** absolutely no reads were seen at this locus, regardless of the filtering parameters */
-    NO_COVERAGE,
-    /** there were less than min. depth bases at the locus, after applying filters */
-    LOW_COVERAGE,
-    /** more than -maxDepth read at the locus, indicating some sort of mapping problem */
-    EXCESSIVE_COVERAGE,
-    /** more than --maxFractionOfReadsWithLowMAPQ at the locus, indicating a poor mapping quality of the reads */
-    POOR_QUALITY
+    /**
+     * the reference base was an N, which is not considered callable by the GATK
+     */
+    // todo -- implement this status
+    REF_N("the reference base was an N, which is not considered callable the GATK"),
+    /**
+     * the base satisfied the min. depth for calling but had less than maxDepth to avoid having EXCESSIVE_COVERAGE
+     */
+    PASS("the base satisfied the min. depth for calling but had less than maxDepth to avoid having EXCESSIVE_COVERAGE"),
+    /**
+     * absolutely no reads were seen at this locus, regardless of the filtering parameters
+     */
+    NO_COVERAGE("absolutely no reads were seen at this locus, regardless of the filtering parameters"),
+    /**
+     * there were less than min. depth bases at the locus, after applying filters
+     */
+    LOW_COVERAGE("there were less than min. depth bases at the locus, after applying filters"),
+    /**
+     * more than -maxDepth reads at the locus, indicating some sort of mapping problem
+     */
+    EXCESSIVE_COVERAGE("more than -maxDepth read at the locus, indicating some sort of mapping problem"),
+    /**
+     * more than --maxFractionOfReadsWithLowMAPQ at the locus, indicating a poor mapping quality of the reads
+     */
+    POOR_QUALITY("more than --maxFractionOfReadsWithLowMAPQ at the locus, indicating a poor mapping quality of the reads"),
+
+    BAD_MATE(""),
+
+    INCONSISTENT_COVERAGE("");
+
+
+    public String description;
+
+    private CallableStatus(String description) {
+        this.description = description;
+    }
 }
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargets.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargets.java
index 979fb665f..d73b22664 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargets.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargets.java
@@ -1,45 +1,66 @@
+/*
+ * Copyright (c) 2012, The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
 package org.broadinstitute.sting.gatk.walkers.diagnostics.targets;
 
+import net.sf.picard.util.PeekableIterator;
 import org.broad.tribble.Feature;
-import org.broadinstitute.sting.commandline.Argument;
-import org.broadinstitute.sting.commandline.Input;
-import org.broadinstitute.sting.commandline.IntervalBinding;
-import org.broadinstitute.sting.commandline.Output;
+import org.broadinstitute.sting.commandline.*;
 import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
 import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
 import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
-import org.broadinstitute.sting.gatk.walkers.By;
-import org.broadinstitute.sting.gatk.walkers.DataSource;
-import org.broadinstitute.sting.gatk.walkers.LocusWalker;
+import org.broadinstitute.sting.gatk.walkers.*;
+import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker;
 import org.broadinstitute.sting.utils.GenomeLoc;
-import org.broadinstitute.sting.utils.GenomeLocComparator;
-import org.broadinstitute.sting.utils.GenomeLocParser;
+import org.broadinstitute.sting.utils.SampleUtils;
+import org.broadinstitute.sting.utils.codecs.vcf.*;
 import org.broadinstitute.sting.utils.exceptions.UserException;
+import org.broadinstitute.sting.utils.variantcontext.Allele;
+import org.broadinstitute.sting.utils.variantcontext.Genotype;
+import org.broadinstitute.sting.utils.variantcontext.VariantContext;
+import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder;
 
-import java.io.PrintStream;
-import java.util.HashMap;
-import java.util.Iterator;
-import java.util.List;
-import java.util.TreeSet;
+import java.util.*;
 
 /**
  * Short one line description of the walker.
- *
+ * 

*

* [Long description of the walker] *

- * - * + *

+ *

*

Input

*

* [Description of the Input] *

- * + *

*

Output

*

* [Description of the Output] *

- * + *

*

Examples

*
  *    java
@@ -51,15 +72,13 @@ import java.util.TreeSet;
  * @since 2/1/12
  */
 @By(value = DataSource.READS)
-public class DiagnoseTargets extends LocusWalker {
+@PartitionBy(PartitionType.INTERVAL)
+public class DiagnoseTargets extends LocusWalker implements AnnotatorCompatibleWalker {
     @Input(fullName = "interval_track", shortName = "int", doc = "", required = true)
     private IntervalBinding intervalTrack = null;
 
-    @Output
-    private PrintStream out = System.out;
-
-    @Argument(fullName = "expand_interval", shortName = "exp", doc = "", required = false)
-    private int expandInterval = 50;
+    @Output(doc = "File to which variants should be written", required = true)
+    private VCFWriter vcfWriter = null;
 
     @Argument(fullName = "minimum_base_quality", shortName = "mbq", doc = "", required = false)
     private int minimumBaseQuality = 20;
@@ -73,13 +92,11 @@ public class DiagnoseTargets extends LocusWalker {
     @Argument(fullName = "maximum_coverage", shortName = "maxcov", doc = "", required = false)
     private int maximumCoverage = 700;
 
-    private TreeSet intervalList = null;                     // The list of intervals of interest (plus expanded intervals if user wants them)
-    private HashMap intervalMap = null;  // interval => statistics
-    private Iterator intervalListIterator;                   // An iterator to go over all the intervals provided as we traverse the genome
-    private GenomeLoc currentInterval = null;                           // The "current" interval loaded and being filled with statistics
-    private IntervalStatistics currentIntervalStatistics = null;                 // The "current" interval loaded and being filled with statistics
+    private HashMap intervalMap = null;                                                  // interval => statistics
+    private PeekableIterator intervalListIterator;                                                           // an iterator to go over all the intervals provided as we traverse the genome
+    private Set samples = null;                                                                                 // all the samples being processed
 
-    private GenomeLocParser parser;                                     // just an object to allow us to create genome locs (for the expanded intervals)
+    private final Allele SYMBOLIC_ALLELE = Allele.create("
", false); // avoid creating the symbolic allele multiple times @Override public void initialize() { @@ -88,38 +105,22 @@ public class DiagnoseTargets extends LocusWalker { if (intervalTrack == null) throw new UserException("This tool currently only works if you provide an interval track"); - parser = new GenomeLocParser(getToolkit().getMasterSequenceDictionary()); // Important to initialize the parser before creating the intervals below + intervalMap = new HashMap(); + intervalListIterator = new PeekableIterator(intervalTrack.getIntervals(getToolkit()).listIterator()); - List originalList = intervalTrack.getIntervals(getToolkit()); // The original list of targets provided by the user that will be expanded or not depending on the options provided - intervalList = new TreeSet(new GenomeLocComparator()); - intervalMap = new HashMap(originalList.size() * 2); - for (GenomeLoc interval : originalList) - addAndExpandIntervalToLists(interval); - - intervalListIterator = intervalList.iterator(); + samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()); // get all of the unique sample names for the VCF Header + vcfWriter.writeHeader(new VCFHeader(getHeaderInfo(), samples)); // initialize the VCF header } @Override public Long map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { GenomeLoc refLocus = ref.getLocus(); - while (currentInterval == null || currentInterval.isBefore(refLocus)) { - if (!intervalListIterator.hasNext()) - return 0L; - currentInterval = intervalListIterator.next(); - currentIntervalStatistics = intervalMap.get(currentInterval); - } + removePastIntervals(refLocus, ref.getBase()); // process and remove any intervals in the map that are don't overlap the current locus anymore + addNewOverlappingIntervals(refLocus); // add all new intervals that may overlap this reference locus - if (currentInterval.isPast(refLocus)) - return 0L; - - byte[] mappingQualities = context.getBasePileup().getMappingQuals(); - byte[] 
baseQualities = context.getBasePileup().getQuals(); - int coverage = context.getBasePileup().getBaseAndMappingFilteredPileup(minimumBaseQuality, minimumMappingQuality).depthOfCoverage(); - int rawCoverage = context.size(); - - IntervalStatisticLocus locusData = new IntervalStatisticLocus(mappingQualities, baseQualities, coverage, rawCoverage); - currentIntervalStatistics.addLocus(refLocus, locusData); + for (IntervalStatistics intervalStatistics : intervalMap.values()) + intervalStatistics.addLocus(context); // Add current locus to stats return 1L; } @@ -129,44 +130,159 @@ public class DiagnoseTargets extends LocusWalker { return 0L; } + /** + * Not sure what we are going to do here + * + * @param value result of the map. + * @param sum accumulator for the reduce. + * @return a long + */ @Override public Long reduce(Long value, Long sum) { return sum + value; } + /** + * Process all remaining intervals + * + * @param result number of loci processed by the walker + */ @Override public void onTraversalDone(Long result) { - super.onTraversalDone(result); - out.println("Interval\tCallStatus\tCOV\tAVG"); - for (GenomeLoc interval : intervalList) { - IntervalStatistics stats = intervalMap.get(interval); - out.println(String.format("%s\t%s\t%d\t%f", interval, stats.callableStatus(), stats.totalCoverage(), stats.averageCoverage())); + for (GenomeLoc interval : intervalMap.keySet()) + processIntervalStats(intervalMap.get(interval), Allele.create("A")); + } + + @Override + public RodBinding getSnpEffRodBinding() {return null;} + + @Override + public RodBinding getDbsnpRodBinding() {return null;} + + @Override + public List> getCompRodBindings() {return null;} + + @Override + public List> getResourceRodBindings() {return null;} + + @Override + public boolean alwaysAppendDbsnpId() {return false;} + + /** + * Removes all intervals that are behind the current reference locus from the intervalMap + * + * @param refLocus the current reference locus + * @param refBase the reference 
allele + */ + private void removePastIntervals(GenomeLoc refLocus, byte refBase) { + List toRemove = new LinkedList(); + for (GenomeLoc interval : intervalMap.keySet()) + if (interval.isBefore(refLocus)) { + processIntervalStats(intervalMap.get(interval), Allele.create(refBase, true)); + toRemove.add(interval); + } + + for (GenomeLoc interval : toRemove) + intervalMap.remove(interval); + + GenomeLoc interval = intervalListIterator.peek(); // clean up all intervals that we might have skipped because there was no data + while(interval != null && interval.isBefore(refLocus)) { + interval = intervalListIterator.next(); + processIntervalStats(createIntervalStatistic(interval), Allele.create(refBase, true)); + interval = intervalListIterator.peek(); } } - private GenomeLoc createIntervalBefore(GenomeLoc interval) { - int start = Math.max(interval.getStart() - expandInterval, 0); - int stop = Math.max(interval.getStart() - 1, 0); - return parser.createGenomeLoc(interval.getContig(), interval.getContigIndex(), start, stop); - } - - private GenomeLoc createIntervalAfter(GenomeLoc interval) { - int contigLimit = getToolkit().getSAMFileHeader().getSequenceDictionary().getSequence(interval.getContigIndex()).getSequenceLength(); - int start = Math.min(interval.getStop() + 1, contigLimit); - int stop = Math.min(interval.getStop() + expandInterval, contigLimit); - return parser.createGenomeLoc(interval.getContig(), interval.getContigIndex(), start, stop); - } - - private void addAndExpandIntervalToLists(GenomeLoc interval) { - if (expandInterval > 0) { - GenomeLoc before = createIntervalBefore(interval); - GenomeLoc after = createIntervalAfter(interval); - intervalList.add(before); - intervalList.add(after); - intervalMap.put(before, new IntervalStatistics(before, minimumCoverage, maximumCoverage, minimumMappingQuality, minimumBaseQuality)); - intervalMap.put(after, new IntervalStatistics(after, minimumCoverage, maximumCoverage, minimumMappingQuality, minimumBaseQuality)); + /** 
+ * Adds all intervals that overlap the current reference locus to the intervalMap + * + * @param refLocus the current reference locus + */ + private void addNewOverlappingIntervals(GenomeLoc refLocus) { + GenomeLoc interval = intervalListIterator.peek(); + while (interval != null && !interval.isPast(refLocus)) { + System.out.println("LOCUS : " + refLocus + " -- " + interval); + intervalMap.put(interval, createIntervalStatistic(interval)); + intervalListIterator.next(); // discard the interval (we've already added it to the map) + interval = intervalListIterator.peek(); } - intervalList.add(interval); - intervalMap.put(interval, new IntervalStatistics(interval, minimumCoverage, maximumCoverage, minimumMappingQuality, minimumBaseQuality)); + } + + /** + * Takes the interval, finds it in the stash, prints it to the VCF, and removes it + * + * @param stats The statistics of the interval + * @param refAllele the reference allele + */ + private void processIntervalStats(IntervalStatistics stats, Allele refAllele) { + GenomeLoc interval = stats.getInterval(); + + List alleles = new ArrayList(); + Map attributes = new HashMap(); + ArrayList genotypes = new ArrayList(); + + alleles.add(refAllele); + alleles.add(SYMBOLIC_ALLELE); + VariantContextBuilder vcb = new VariantContextBuilder("DiagnoseTargets", interval.getContig(), interval.getStart(), interval.getStart(), alleles); + + vcb = vcb.log10PError(VariantContext.NO_LOG10_PERROR); // QUAL field makes no sense in our VCF + vcb.filters(statusesToStrings(stats.callableStatuses())); + + attributes.put(VCFConstants.END_KEY, interval.getStop()); + attributes.put(VCFConstants.DEPTH_KEY, stats.averageCoverage()); + + vcb = vcb.attributes(attributes); + + for (String sample : samples) { + Map infos = new HashMap(); + infos.put(VCFConstants.DEPTH_KEY, stats.getSample(sample).averageCoverage()); + + Set filters = new HashSet(); + filters.addAll(statusesToStrings(stats.getSample(sample).getCallableStatuses())); + + + 
genotypes.add(new Genotype(sample, null, VariantContext.NO_LOG10_PERROR, filters, infos, false)); + } + vcb = vcb.genotypes(genotypes); + + vcfWriter.add(vcb.make()); + + } + + /** + * Gets the header lines for the VCF writer + * + * @return A set of VCF header lines + */ + private static Set getHeaderInfo() { + Set headerLines = new HashSet(); + + // INFO fields for overall data + headerLines.add(new VCFInfoHeaderLine(VCFConstants.END_KEY, 1, VCFHeaderLineType.Integer, "Stop position of the interval")); + headerLines.add(new VCFInfoHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Float, "Average depth across the interval. Sum of the depth in a lci divided by interval size.")); + headerLines.add(new VCFInfoHeaderLine("Diagnose Targets", 0, VCFHeaderLineType.Flag, "DiagnoseTargets mode")); + + // FORMAT fields for each genotype + headerLines.add(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Float, "Average depth across the interval. Sum of the depth in a lci divided by interval size.")); + + // FILTER fields + for (CallableStatus stat : CallableStatus.values()) + headerLines.add(new VCFHeaderLine(stat.name(), stat.description)); + + return headerLines; + } + + + private static Set statusesToStrings(Set statuses) { + Set output = new HashSet(statuses.size()); + + for (CallableStatus status : statuses) + output.add(status.name()); + + return output; + } + + private IntervalStatistics createIntervalStatistic(GenomeLoc interval) { + return new IntervalStatistics(samples, interval, minimumCoverage, maximumCoverage, minimumMappingQuality, minimumBaseQuality); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/IntervalStatisticLocus.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/IntervalStatisticLocus.java deleted file mode 100644 index 5620c3902..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/IntervalStatisticLocus.java +++ 
/dev/null @@ -1,34 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.diagnostics.targets; - -/** - * The definition of a locus for the DiagnoseTargets walker statistics calculation - * - * @author Mauricio Carneiro - * @since 2/3/12 - */ -class IntervalStatisticLocus { - private final byte[] mappingQuality; - private final byte[] baseQuality; - private final int coverage; - private final int rawCoverage; - - public IntervalStatisticLocus(byte[] mappingQuality, byte[] baseQuality, int coverage, int rawCoverage) { - this.mappingQuality = mappingQuality; - this.baseQuality = baseQuality; - this.coverage = coverage; - this.rawCoverage = rawCoverage; - } - - public IntervalStatisticLocus() { - this(new byte[1], new byte[1], 0, 0); - } - - public int getCoverage() { - return coverage; - } - - public int getRawCoverage() { - return rawCoverage; - } - -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/IntervalStatistics.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/IntervalStatistics.java index 8ee5f76fb..f3246407b 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/IntervalStatistics.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/IntervalStatistics.java @@ -1,122 +1,105 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + package org.broadinstitute.sting.gatk.walkers.diagnostics.targets; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; -import java.util.ArrayList; import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; -/** - * Short one line description of the walker. - * - * @author Mauricio Carneiro - * @since 2/1/12 - */ -class IntervalStatistics { +public class IntervalStatistics { + + private final Map samples; private final GenomeLoc interval; - private final ArrayList loci; - private final int minimumCoverageThreshold; - private final int maximumCoverageThreshold; - private final int minimumMappingQuality; - private final int minimumBaseQuality; + private int preComputedTotalCoverage = -1; // avoids re-calculating the total sum (-1 means we haven't pre-computed it yet) - private int preComputedTotalCoverage = -1; // avoids re-calculating the total sum (-1 means we haven't pre-computed it yet) - private IntervalStatistics(GenomeLoc interval, ArrayList loci, int minimumCoverageThreshold, int maximumCoverageThreshold, int minimumMappingQuality, int minimumBaseQuality) { + public IntervalStatistics(Set samples, GenomeLoc interval, int minimumCoverageThreshold, int maximumCoverageThreshold, int minimumMappingQuality, int minimumBaseQuality) { 
this.interval = interval; - this.loci = loci; - this.minimumCoverageThreshold = minimumCoverageThreshold; - this.maximumCoverageThreshold = maximumCoverageThreshold; - this.minimumMappingQuality = minimumMappingQuality; - this.minimumBaseQuality = minimumBaseQuality; + this.samples = new HashMap(samples.size()); + for (String sample : samples) + this.samples.put(sample, new SampleStatistics(interval, minimumCoverageThreshold, maximumCoverageThreshold, minimumMappingQuality, minimumBaseQuality)); } - public IntervalStatistics(GenomeLoc interval, int minimumCoverageThreshold, int maximumCoverageThreshold, int minimumMappingQuality, int minimumBaseQuality) { - this(interval, new ArrayList(interval.size()), minimumCoverageThreshold, maximumCoverageThreshold, minimumMappingQuality, minimumBaseQuality); + public SampleStatistics getSample(String sample) { + return samples.get(sample); + } - // Initialize every loci (this way we don't have to worry about non-existent loci in the object - for (int i = 0; i < interval.size(); i++) - this.loci.add(i, new IntervalStatisticLocus()); + public GenomeLoc getInterval() { + return interval; + } + + public void addLocus(AlignmentContext context) { + ReadBackedPileup pileup = context.getBasePileup(); + + Map samplePileups = pileup.getPileupsForSamples(samples.keySet()); + + for (Map.Entry entry : samplePileups.entrySet()) { + String sample = entry.getKey(); + ReadBackedPileup samplePileup = entry.getValue(); + SampleStatistics sampleStatistics = samples.get(sample); + + if (sampleStatistics == null) + throw new ReviewedStingException(String.format("Trying to add locus statistics to a sample (%s) that doesn't exist in the Interval.", sample)); + + sampleStatistics.addLocus(context.getLocation(), samplePileup); + } } - public long totalCoverage() { - if (preComputedTotalCoverage < 0) - calculateTotalCoverage(); - return preComputedTotalCoverage; - } public double averageCoverage() { if (preComputedTotalCoverage < 0) 
calculateTotalCoverage(); - return (double) preComputedTotalCoverage / loci.size(); - } - - /** - * Calculates the callable status of the entire interval - * - * @return the callable status of the entire interval - */ - public CallableStatus callableStatus() { - long max = -1; - CallableStatus maxCallableStatus = null; - HashMap statusCounts = new HashMap(CallableStatus.values().length); - - // initialize the statusCounts with all callable states - for (CallableStatus key : CallableStatus.values()) - statusCounts.put(key, 0); - - // calculate the callable status for each locus - for (int i = 0; i < loci.size(); i++) { - CallableStatus status = callableStatus(i); - int count = statusCounts.get(status) + 1; - statusCounts.put(status, count); - - if (count > max) { - max = count; - maxCallableStatus = status; - } - } - - return maxCallableStatus; - } - - public void addLocus(GenomeLoc locus, IntervalStatisticLocus locusData) { - if (!interval.containsP(locus)) - throw new ReviewedStingException(String.format("Locus %s is not part of the Interval", locus)); - - int locusIndex = locus.getStart() - interval.getStart(); - - loci.add(locusIndex, locusData); - } - - /** - * returns the callable status of this locus without taking the reference base into account. 
- * - * @param locusIndex location in the genome to inquire (only one locus) - * @return the callable status of a locus - */ - private CallableStatus callableStatus(int locusIndex) { - if (loci.get(locusIndex).getCoverage() > maximumCoverageThreshold) - return CallableStatus.EXCESSIVE_COVERAGE; - - if (loci.get(locusIndex).getCoverage() >= minimumCoverageThreshold) - return CallableStatus.CALLABLE; - - if (loci.get(locusIndex).getRawCoverage() >= minimumCoverageThreshold) - return CallableStatus.POOR_QUALITY; - - if (loci.get(locusIndex).getRawCoverage() > 0) - return CallableStatus.LOW_COVERAGE; - - return CallableStatus.NO_COVERAGE; + return (double) preComputedTotalCoverage / interval.size(); } private void calculateTotalCoverage() { preComputedTotalCoverage = 0; - for (IntervalStatisticLocus locus : loci) - preComputedTotalCoverage += locus.getCoverage(); + for (SampleStatistics sample : samples.values()) + preComputedTotalCoverage += sample.totalCoverage(); } + /** + * Return the Callable statuses for the interval as a whole + * todo -- add a voting system for sample flags and add interval specific statuses + * + * @return the callable status(es) for the whole interval + */ + public Set callableStatuses() { + Set output = new HashSet(); + + for (SampleStatistics sample : samples.values()) + output.addAll(sample.getCallableStatuses()); + + return output; + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/LocusStatistics.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/LocusStatistics.java new file mode 100644 index 000000000..237ca1b1c --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/LocusStatistics.java @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the 
Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.diagnostics.targets; + +import java.util.HashSet; +import java.util.Set; + +public class LocusStatistics { + final int coverage; + final int rawCoverage; + + public LocusStatistics() { + this.coverage = 0; + this.rawCoverage = 0; + } + + public LocusStatistics(int coverage, int rawCoverage) { + this.coverage = coverage; + this.rawCoverage = rawCoverage; + } + + public int getCoverage() { + return coverage; + } + + public int getRawCoverage() { + return rawCoverage; + } + + /** + * Generates all applicable statuses from the coverages in this locus + * + * @param minimumCoverageThreshold the minimum threshold for determining low coverage/poor quality + * @param maximumCoverageThreshold the maximum threshold for determining excessive coverage + * @return a set of all statuses that apply + */ + public Set callableStatuses(int minimumCoverageThreshold, int maximumCoverageThreshold) { + Set output = new HashSet(); + + // if too much coverage + if (getCoverage() > maximumCoverageThreshold) + 
output.add(CallableStatus.EXCESSIVE_COVERAGE); + + // if not enough coverage + if (getCoverage() < minimumCoverageThreshold) { + // was there a lot of low Qual coverage? + if (getRawCoverage() >= minimumCoverageThreshold) + output.add(CallableStatus.POOR_QUALITY); + // no? + else { + // is there any coverage? + if (getRawCoverage() > 0) + output.add(CallableStatus.LOW_COVERAGE); + else + output.add(CallableStatus.NO_COVERAGE); + } + } + + return output; + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/SampleStatistics.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/SampleStatistics.java new file mode 100644 index 000000000..9e4993853 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/SampleStatistics.java @@ -0,0 +1,175 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.diagnostics.targets; + +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; + +import java.util.*; + +/** + * Short one line description of the walker. + * + * @author Mauricio Carneiro + * @since 2/1/12 + */ +class SampleStatistics { + private final GenomeLoc interval; + private final ArrayList loci; + + private final int minimumCoverageThreshold; + private final int maximumCoverageThreshold; + private final int minimumMappingQuality; + private final int minimumBaseQuality; + + private int preComputedTotalCoverage = -1; // avoids re-calculating the total sum (-1 means we haven't pre-computed it yet) + + private SampleStatistics(GenomeLoc interval, ArrayList loci, int minimumCoverageThreshold, int maximumCoverageThreshold, int minimumMappingQuality, int minimumBaseQuality) { + this.interval = interval; + this.loci = loci; + this.minimumCoverageThreshold = minimumCoverageThreshold; + this.maximumCoverageThreshold = maximumCoverageThreshold; + this.minimumMappingQuality = minimumMappingQuality; + this.minimumBaseQuality = minimumBaseQuality; + } + + public SampleStatistics(GenomeLoc interval, int minimumCoverageThreshold, int maximumCoverageThreshold, int minimumMappingQuality, int minimumBaseQuality) { + this(interval, new ArrayList(interval.size()), minimumCoverageThreshold, maximumCoverageThreshold, minimumMappingQuality, minimumBaseQuality); + + // Initialize every loci (this way we don't have to worry about non-existent loci in the object + for (int i = 0; i < interval.size(); i++) + this.loci.add(i, new 
LocusStatistics()); + + } + + public long totalCoverage() { + if (preComputedTotalCoverage < 0) + calculateTotalCoverage(); + return preComputedTotalCoverage; + } + + public double averageCoverage() { + if (preComputedTotalCoverage < 0) + calculateTotalCoverage(); + return (double) preComputedTotalCoverage / loci.size(); + } + + /** + * Calculates the callable statuses of the entire interval + * + * @return the callable statuses of the entire interval + */ + public Set getCallableStatuses() { + + Map totals = new HashMap(CallableStatus.values().length); + + // initialize map + for (CallableStatus status : CallableStatus.values()) + totals.put(status, 0); + + // sum up all the callable statuses for each locus + for (int i = 0; i < interval.size(); i++) { + for (CallableStatus status : callableStatus(i)) { + int count = totals.get(status); + + totals.put(status, count + 1); + } + } + + + Set output = new HashSet(); + + // double to avoid type casting + double intervalSize = interval.size(); + + double coverageStatusThreshold = 0.20; + if ((totals.get(CallableStatus.NO_COVERAGE) / intervalSize) > coverageStatusThreshold) + output.add(CallableStatus.NO_COVERAGE); + + if ((totals.get(CallableStatus.LOW_COVERAGE) / intervalSize) > coverageStatusThreshold) + output.add(CallableStatus.LOW_COVERAGE); + + double excessiveCoverageThreshold = 0.20; + if ((totals.get(CallableStatus.EXCESSIVE_COVERAGE) / intervalSize) > excessiveCoverageThreshold) + output.add(CallableStatus.EXCESSIVE_COVERAGE); + + double qualityStatusThreshold = 0.50; + if ((totals.get(CallableStatus.POOR_QUALITY) / intervalSize) > qualityStatusThreshold) + output.add(CallableStatus.POOR_QUALITY); + + if (totals.get(CallableStatus.REF_N) > 0) + output.add(CallableStatus.REF_N); + + if (output.isEmpty()) { + output.add(CallableStatus.PASS); + } + return output; + } + + /** + * Adds a locus to the interval wide stats + * + * @param locus The locus given as a GenomeLoc + * @param pileup The pileup of that locus + 
*/ + public void addLocus(GenomeLoc locus, ReadBackedPileup pileup) { + if (!interval.containsP(locus)) + throw new ReviewedStingException(String.format("Locus %s is not part of the Interval", locus)); + + // a null pileup means there nothing ot add + if (pileup != null) { + + int locusIndex = locus.getStart() - interval.getStart(); + + int rawCoverage = pileup.depthOfCoverage(); + int coverage = pileup.getBaseAndMappingFilteredPileup(minimumBaseQuality, minimumMappingQuality).depthOfCoverage(); + + LocusStatistics locusData = new LocusStatistics(coverage, rawCoverage); + + loci.add(locusIndex, locusData); + } + } + + /** + * returns the callable status of this locus without taking the reference base into account. + * + * @param locusIndex location in the genome to inquire (only one locus) + * @return the callable status of a locus + */ + private Set callableStatus(int locusIndex) { + LocusStatistics locus = loci.get(locusIndex); + + return locus.callableStatuses(minimumCoverageThreshold, maximumCoverageThreshold); + } + + + private void calculateTotalCoverage() { + preComputedTotalCoverage = 0; + for (LocusStatistics locus : loci) + preComputedTotalCoverage += locus.getCoverage(); + } + +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffEngine.java index 2159bc839..3f4b4805f 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffEngine.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011, The Broad Institute + * Copyright (c) 2012, The Broad Institute * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation @@ -244,7 +244,8 @@ public class DiffEngine { table.set(diff.getPath(), "NumberOfOccurrences", diff.getCount()); table.set(diff.getPath(), "ExampleDifference", 
diff.valueDiffString()); } - table.write(params.out); + GATKReport output = new GATKReport(table); + output.print(params.out); } protected static int longestCommonPostfix(String[] diffPath1, String[] diffPath2) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/GATKReportDiffableReader.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/GATKReportDiffableReader.java index 41b17cc7b..2fa566c09 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/GATKReportDiffableReader.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/GATKReportDiffableReader.java @@ -36,9 +36,14 @@ import java.io.IOException; /** * Class implementing diffnode reader for GATKReports */ + +// TODO Version check to be added at the report level + public class GATKReportDiffableReader implements DiffableReader { @Override - public String getName() { return "GATKReport"; } + public String getName() { + return "GATKReport"; + } @Override public DiffElement readFromFile(File file, int maxElementsToRead) { @@ -47,12 +52,12 @@ public class GATKReportDiffableReader implements DiffableReader { // one line reads the whole thing into memory GATKReport report = new GATKReport(file); - for (GATKReportTable table : report.getTables() ) { + for (GATKReportTable table : report.getTables()) { root.add(tableToNode(table, root)); } return root.getBinding(); - } catch ( Exception e ) { + } catch (Exception e) { return null; } } @@ -62,9 +67,8 @@ public class GATKReportDiffableReader implements DiffableReader { tableRoot.add("Description", table.getTableDescription()); tableRoot.add("NumberOfRows", table.getNumRows()); - tableRoot.add("Version", table.getVersion()); - for ( GATKReportColumn column : table.getColumns().values() ) { + for (GATKReportColumn column : table.getColumns().values()) { DiffNode columnRoot = DiffNode.empty(column.getColumnName(), tableRoot); columnRoot.add("Width", 
column.getColumnFormat().getWidth()); @@ -72,7 +76,7 @@ public class GATKReportDiffableReader implements DiffableReader { columnRoot.add("Displayable", column.isDisplayable()); int n = 1; - for ( Object elt : column.values() ) { + for (Object elt : column.values()) { String name = column.getColumnName() + n++; columnRoot.add(name, elt.toString()); } @@ -91,7 +95,7 @@ public class GATKReportDiffableReader implements DiffableReader { new FileReader(file).read(buff, 0, HEADER.length()); String firstLine = new String(buff); return firstLine.startsWith(HEADER); - } catch ( IOException e ) { + } catch (IOException e) { return false; } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/VCFDiffableReader.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/VCFDiffableReader.java index 3c0da8e9d..c9a6cb8f2 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/VCFDiffableReader.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/VCFDiffableReader.java @@ -68,8 +68,8 @@ public class VCFDiffableReader implements DiffableReader { VCFHeader header = (VCFHeader)vcfCodec.readHeader(lineReader); for ( VCFHeaderLine headerLine : header.getMetaData() ) { String key = headerLine.getKey(); - if ( headerLine instanceof VCFNamedHeaderLine ) - key += "_" + ((VCFNamedHeaderLine) headerLine).getName(); + if ( headerLine instanceof VCFIDHeaderLine) + key += "_" + ((VCFIDHeaderLine) headerLine).getID(); if ( root.hasElement(key) ) logger.warn("Skipping duplicate header line: file=" + file + " line=" + headerLine.toString()); else diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java index 9f2403bbf..be4ceae53 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java +++ 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java @@ -26,10 +26,14 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.apache.log4j.Logger; +import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.variantcontext.Allele; +import org.broadinstitute.sting.utils.variantcontext.Genotype; +import org.broadinstitute.sting.utils.variantcontext.GenotypesContext; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.io.PrintStream; +import java.util.ArrayList; import java.util.List; @@ -41,7 +45,8 @@ public abstract class AlleleFrequencyCalculationModel implements Cloneable { public enum Model { /** The default model with the best performance in all cases */ - EXACT + EXACT, + POOL } protected int N; @@ -61,6 +66,42 @@ public abstract class AlleleFrequencyCalculationModel implements Cloneable { this.verboseWriter = verboseWriter; } + /** + * Wrapper class that compares two likelihoods associated with two alleles + */ + protected static final class LikelihoodSum implements Comparable { + public double sum = 0.0; + public Allele allele; + + public LikelihoodSum(Allele allele) { this.allele = allele; } + + public int compareTo(LikelihoodSum other) { + final double diff = sum - other.sum; + return ( diff < 0.0 ) ? 1 : (diff > 0.0 ) ? 
-1 : 0; + } + } + + /** + * Unpack GenotypesContext into arraylist of doubel values + * @param GLs Input genotype context + * @return ArrayList of doubles corresponding to GL vectors + */ + protected static ArrayList getGLs(GenotypesContext GLs) { + ArrayList genotypeLikelihoods = new ArrayList(GLs.size()); + + genotypeLikelihoods.add(new double[]{0.0,0.0,0.0}); // dummy + for ( Genotype sample : GLs.iterateInSampleNameOrder() ) { + if ( sample.hasLikelihoods() ) { + double[] gls = sample.getLikelihoods().getAsVector(); + + if ( MathUtils.sum(gls) < UnifiedGenotyperEngine.SUM_GL_THRESH_NOCALL ) + genotypeLikelihoods.add(gls); + } + } + + return genotypeLikelihoods; + } + /** * Must be overridden by concrete subclasses * @param vc variant context with alleles and genotype likelihoods @@ -69,6 +110,19 @@ public abstract class AlleleFrequencyCalculationModel implements Cloneable { * @return the alleles used for genotyping */ protected abstract List getLog10PNonRef(final VariantContext vc, - final double[][] log10AlleleFrequencyPriors, + final double[] log10AlleleFrequencyPriors, final AlleleFrequencyCalculationResult result); + + /** + * Must be overridden by concrete subclasses + * @param vc variant context with alleles and genotype likelihoods + * @param allelesToUse alleles to subset + * @param assignGenotypes + * @param ploidy + * @return GenotypesContext object + */ + protected abstract GenotypesContext subsetAlleles(final VariantContext vc, + final List allelesToUse, + final boolean assignGenotypes, + final int ploidy); } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationResult.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationResult.java index 9c4af8512..c93e780bf 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationResult.java +++ 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationResult.java @@ -25,6 +25,10 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; +import org.broadinstitute.sting.utils.MathUtils; + +import java.util.Arrays; + /** * Created by IntelliJ IDEA. * User: ebanks @@ -34,23 +38,50 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; */ public class AlleleFrequencyCalculationResult { - // IMPORTANT NOTE: - // These 2 arrays are intended to contain the likelihoods/posterior probabilities for each alternate allele over each possible frequency (from 0 to 2N). - // For any given alternate allele and frequency, the likelihoods are marginalized over values for all other alternate alleles. What this means is that - // the likelihoods at cell index zero (AF=0) in the array is actually that of the site's being polymorphic (because although this alternate allele may - // be at AF=0, it is marginalized over all other alternate alleles which are not necessarily at AF=0). - // In the bi-allelic case (where there are no other alternate alleles over which to marginalize), - // the value at cell index zero will be equal to AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED. - final double[][] log10AlleleFrequencyLikelihoods; - final double[][] log10AlleleFrequencyPosteriors; + // These variables are intended to contain the MLE and MAP (and their corresponding allele counts) of the site over all alternate alleles + private double log10MLE; + private double log10MAP; + private final int[] alleleCountsOfMLE; + private final int[] alleleCountsOfMAP; - // These 2 variables are intended to contain the likelihood/posterior probability for the site's being monomorphic (i.e. 
AF=0 for all alternate alleles) - double log10LikelihoodOfAFzero = 0.0; - double log10PosteriorOfAFzero = 0.0; + // The posteriors seen, not including that of AF=0 + private static final int POSTERIORS_CACHE_SIZE = 5000; + private final double[] log10PosteriorMatrixValues = new double[POSTERIORS_CACHE_SIZE]; + private int currentPosteriorsCacheIndex = 0; + private Double log10PosteriorMatrixSum = null; - public AlleleFrequencyCalculationResult(int maxAltAlleles, int numChr) { - log10AlleleFrequencyLikelihoods = new double[maxAltAlleles][numChr+1]; - log10AlleleFrequencyPosteriors = new double[maxAltAlleles][numChr+1]; + // These variables are intended to contain the likelihood/posterior probability for the site's being monomorphic (i.e. AF=0 for all alternate alleles) + private double log10LikelihoodOfAFzero; + private double log10PosteriorOfAFzero; + + + public AlleleFrequencyCalculationResult(final int maxAltAlleles) { + alleleCountsOfMLE = new int[maxAltAlleles]; + alleleCountsOfMAP = new int[maxAltAlleles]; + reset(); + } + + public double getLog10MLE() { + return log10MLE; + } + + public double getLog10MAP() { + return log10MAP; + } + + public double getLog10PosteriorsMatrixSumWithoutAFzero() { + if ( log10PosteriorMatrixSum == null ) { + log10PosteriorMatrixSum = MathUtils.log10sumLog10(log10PosteriorMatrixValues, 0, currentPosteriorsCacheIndex); + } + return log10PosteriorMatrixSum; + } + + public int[] getAlleleCountsOfMLE() { + return alleleCountsOfMLE; + } + + public int[] getAlleleCountsOfMAP() { + return alleleCountsOfMAP; } public double getLog10LikelihoodOfAFzero() { @@ -60,4 +91,60 @@ public class AlleleFrequencyCalculationResult { public double getLog10PosteriorOfAFzero() { return log10PosteriorOfAFzero; } + + public void reset() { + log10MLE = log10MAP = log10LikelihoodOfAFzero = log10PosteriorOfAFzero = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED; + for ( int i = 0; i < alleleCountsOfMLE.length; i++ ) { + alleleCountsOfMLE[i] = 0; + 
alleleCountsOfMAP[i] = 0; + } + currentPosteriorsCacheIndex = 0; + log10PosteriorMatrixSum = null; + } + + public void updateMLEifNeeded(final double log10LofK, final int[] alleleCountsForK) { + if ( log10LofK > log10MLE ) { + log10MLE = log10LofK; + for ( int i = 0; i < alleleCountsForK.length; i++ ) + alleleCountsOfMLE[i] = alleleCountsForK[i]; + } + } + + public void updateMAPifNeeded(final double log10LofK, final int[] alleleCountsForK) { + addToPosteriorsCache(log10LofK); + + if ( log10LofK > log10MAP ) { + log10MAP = log10LofK; + for ( int i = 0; i < alleleCountsForK.length; i++ ) + alleleCountsOfMAP[i] = alleleCountsForK[i]; + } + } + + private void addToPosteriorsCache(final double log10LofK) { + // add to the cache + log10PosteriorMatrixValues[currentPosteriorsCacheIndex++] = log10LofK; + + // if we've filled up the cache, then condense by summing up all of the values and placing the sum back into the first cell + if ( currentPosteriorsCacheIndex == POSTERIORS_CACHE_SIZE ) { + final double temporarySum = MathUtils.log10sumLog10(log10PosteriorMatrixValues, 0, currentPosteriorsCacheIndex); + log10PosteriorMatrixValues[0] = temporarySum; + currentPosteriorsCacheIndex = 1; + } + } + + public void setLog10LikelihoodOfAFzero(final double log10LikelihoodOfAFzero) { + this.log10LikelihoodOfAFzero = log10LikelihoodOfAFzero; + if ( log10LikelihoodOfAFzero > log10MLE ) { + log10MLE = log10LikelihoodOfAFzero; + Arrays.fill(alleleCountsOfMLE, 0); + } + } + + public void setLog10PosteriorOfAFzero(final double log10PosteriorOfAFzero) { + this.log10PosteriorOfAFzero = log10PosteriorOfAFzero; + if ( log10PosteriorOfAFzero > log10MAP ) { + log10MAP = log10PosteriorOfAFzero; + Arrays.fill(alleleCountsOfMAP, 0); + } + } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ConsensusAlleleCounter.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ConsensusAlleleCounter.java new file mode 100644 index 
000000000..e64a4f42d --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ConsensusAlleleCounter.java @@ -0,0 +1,294 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.gatk.walkers.genotyper; + +import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.clipping.ReadClipper; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; +import org.broadinstitute.sting.utils.variantcontext.*; + +import java.util.*; + +/** + * Code for determining which indels are segregating among the samples. + * + * This code is just a refactor of the original code from Guillermo in the UG. 
+ * + * @author Mark DePristo + * @since 3/26/12 + */ +public class ConsensusAlleleCounter { + final protected static Logger logger = Logger.getLogger(ConsensusAlleleCounter.class); + private final int minIndelCountForGenotyping; + private final boolean doMultiAllelicCalls; + private final double minFractionInOneSample; + private final GenomeLocParser locParser; + + public ConsensusAlleleCounter(final GenomeLocParser locParser, + final boolean doMultiAllelicCalls, + final int minIndelCountForGenotyping, + final double minFractionInOneSample) { + this.minIndelCountForGenotyping = minIndelCountForGenotyping; + this.doMultiAllelicCalls = doMultiAllelicCalls; + this.minFractionInOneSample = minFractionInOneSample; + this.locParser = locParser; + } + + /** + * Returns a list of Alleles at this locus that may be segregating + * + * @param ref + * @param contexts + * @param contextType + * @return + */ + public List computeConsensusAlleles(ReferenceContext ref, + Map contexts, + AlignmentContextUtils.ReadOrientation contextType) { + final Map consensusIndelStrings = countConsensusAlleles(ref, contexts, contextType); +// logger.info("Alleles at " + ref.getLocus()); +// for ( Map.Entry elt : consensusIndelStrings.entrySet() ) { +// logger.info(" " + elt.getValue() + " => " + elt.getKey()); +// } + return consensusCountsToAlleles(ref, consensusIndelStrings); + } + + // + // TODO -- WARNING DOESN'T WORK WITH REDUCED READS + // + private Map countConsensusAlleles(ReferenceContext ref, + Map contexts, + AlignmentContextUtils.ReadOrientation contextType) { + final GenomeLoc loc = ref.getLocus(); + HashMap consensusIndelStrings = new HashMap(); + + int insCount = 0, delCount = 0; + // quick check of total number of indels in pileup + for ( Map.Entry sample : contexts.entrySet() ) { + final AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType); + + if ( context.hasBasePileup() ) { + final ReadBackedPileup indelPileup = context.getBasePileup(); + 
insCount += indelPileup.getNumberOfInsertionsAfterThisElement(); + delCount += indelPileup.getNumberOfDeletionsAfterThisElement(); + } + } + + if ( insCount < minIndelCountForGenotyping && delCount < minIndelCountForGenotyping ) + return Collections.emptyMap(); + + for (Map.Entry sample : contexts.entrySet()) { + // todo -- warning, can be duplicating expensive partition here + AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType); + + if ( !context.hasBasePileup() ) + continue; + + final ReadBackedPileup indelPileup = context.getBasePileup(); + + final int nIndelReads = indelPileup.getNumberOfInsertionsAfterThisElement() + indelPileup.getNumberOfDeletionsAfterThisElement(); + final int nReadsOverall = indelPileup.getNumberOfElements(); + + if ( nIndelReads == 0 || (nIndelReads / (1.0 * nReadsOverall)) < minFractionInOneSample ) { +// if ( nIndelReads > 0 ) +// logger.info("Skipping sample " + sample.getKey() + " with nIndelReads " + nIndelReads + " nReads " + nReadsOverall); + continue; +// } else { +// logger.info("### Keeping sample " + sample.getKey() + " with nIndelReads " + nIndelReads + " nReads " + nReadsOverall); + } + + + for (PileupElement p : indelPileup) { + final GATKSAMRecord read = ReadClipper.hardClipAdaptorSequence(p.getRead()); + if (read == null) + continue; + if (ReadUtils.is454Read(read)) { + continue; + } + +/* if (DEBUG && p.isIndel()) { + System.out.format("Read: %s, cigar: %s, aln start: %d, aln end: %d, p.len:%d, Type:%s, EventBases:%s\n", + read.getReadName(),read.getCigar().toString(),read.getAlignmentStart(),read.getAlignmentEnd(), + p.getEventLength(),p.getType().toString(), p.getEventBases()); + } + */ + String indelString = p.getEventBases(); + + if ( p.isBeforeInsertion() ) { + // edge case: ignore a deletion immediately preceding an insertion as p.getEventBases() returns null [EB] + if ( indelString == null ) + continue; + + boolean foundKey = false; + // copy of hashmap into temp arrayList + 
ArrayList> cList = new ArrayList>(); + for (String s : consensusIndelStrings.keySet()) { + cList.add(new Pair(s,consensusIndelStrings.get(s))); + } + + if (read.getAlignmentEnd() == loc.getStart()) { + // first corner condition: a read has an insertion at the end, and we're right at the insertion. + // In this case, the read could have any of the inserted bases and we need to build a consensus + + for (int k=0; k < cList.size(); k++) { + String s = cList.get(k).getFirst(); + int cnt = cList.get(k).getSecond(); + // case 1: current insertion is prefix of indel in hash map + if (s.startsWith(indelString)) { + cList.set(k,new Pair(s,cnt+1)); + foundKey = true; + } + else if (indelString.startsWith(s)) { + // case 2: indel stored in hash table is prefix of current insertion + // In this case, new bases are new key. + foundKey = true; + cList.set(k,new Pair(indelString,cnt+1)); + } + } + if (!foundKey) + // none of the above: event bases not supported by previous table, so add new key + cList.add(new Pair(indelString,1)); + + } + else if (read.getAlignmentStart() == loc.getStart()+1) { + // opposite corner condition: read will start at current locus with an insertion + for (int k=0; k < cList.size(); k++) { + String s = cList.get(k).getFirst(); + int cnt = cList.get(k).getSecond(); + if (s.endsWith(indelString)) { + // case 1: current insertion (indelString) is suffix of indel in hash map (s) + cList.set(k,new Pair(s,cnt+1)); + foundKey = true; + } + else if (indelString.endsWith(s)) { + // case 2: indel stored in hash table is prefix of current insertion + // In this case, new bases are new key. + foundKey = true; + cList.set(k,new Pair(indelString,cnt+1)); + } + } + if (!foundKey) + // none of the above: event bases not supported by previous table, so add new key + cList.add(new Pair(indelString,1)); + + + } + else { + // normal case: insertion somewhere in the middle of a read: add count to arrayList + int cnt = consensusIndelStrings.containsKey(indelString)? 
consensusIndelStrings.get(indelString):0; + cList.add(new Pair(indelString,cnt+1)); + } + + // copy back arrayList into hashMap + consensusIndelStrings.clear(); + for (Pair pair : cList) { + consensusIndelStrings.put(pair.getFirst(),pair.getSecond()); + } + + } + else if ( p.isBeforeDeletedBase() ) { + indelString = String.format("D%d",p.getEventLength()); + int cnt = consensusIndelStrings.containsKey(indelString)? consensusIndelStrings.get(indelString):0; + consensusIndelStrings.put(indelString,cnt+1); + + } + } + } + + return consensusIndelStrings; + } + + private List consensusCountsToAlleles(final ReferenceContext ref, + final Map consensusIndelStrings) { + final GenomeLoc loc = ref.getLocus(); + final Collection vcs = new ArrayList(); + int maxAlleleCnt = 0; + Allele refAllele, altAllele; + + for (final Map.Entry elt : consensusIndelStrings.entrySet()) { + final String s = elt.getKey(); + final int curCnt = elt.getValue(); + int stop = 0; + + // if observed count if above minimum threshold, we will genotype this allele + if (curCnt < minIndelCountForGenotyping) + continue; + + if (s.startsWith("D")) { + // get deletion length + final int dLen = Integer.valueOf(s.substring(1)); + // get ref bases of accurate deletion + final int startIdxInReference = 1 + loc.getStart() - ref.getWindow().getStart(); + stop = loc.getStart() + dLen; + final byte[] refBases = Arrays.copyOfRange(ref.getBases(), startIdxInReference, startIdxInReference + dLen); + + if (Allele.acceptableAlleleBases(refBases)) { + refAllele = Allele.create(refBases, true); + altAllele = Allele.create(Allele.NULL_ALLELE_STRING, false); + } + else continue; // don't go on with this allele if refBases are non-standard + } else { + // insertion case + if (Allele.acceptableAlleleBases(s)) { + refAllele = Allele.create(Allele.NULL_ALLELE_STRING, true); + altAllele = Allele.create(s, false); + stop = loc.getStart(); + } + else continue; // go on to next allele if consensus insertion has any non-standard base. 
+ } + + + final VariantContextBuilder builder = new VariantContextBuilder().source(""); + builder.loc(loc.getContig(), loc.getStart(), stop); + builder.alleles(Arrays.asList(refAllele, altAllele)); + builder.referenceBaseForIndel(ref.getBase()); + builder.noGenotypes(); + if (doMultiAllelicCalls) { + vcs.add(builder.make()); + if (vcs.size() >= GenotypeLikelihoods.MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED) + break; + } else if (curCnt > maxAlleleCnt) { + maxAlleleCnt = curCnt; + vcs.clear(); + vcs.add(builder.make()); + } + } + + if (vcs.isEmpty()) + return Collections.emptyList(); // nothing else to do, no alleles passed minimum count criterion + + final VariantContext mergedVC = VariantContextUtils.simpleMerge(locParser, vcs, null, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, VariantContextUtils.GenotypeMergeType.UNSORTED, false, false, null, false, false); + return mergedVC.getAlleles(); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidGenotype.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidGenotype.java index 4aa580052..9ba565ad3 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidGenotype.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidGenotype.java @@ -27,6 +27,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.broadinstitute.sting.utils.BaseUtils; +@Deprecated public enum DiploidGenotype { AA ('A', 'A'), AC ('A', 'C'), diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidGenotypeWithCorrectAlleleOrdering.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidGenotypeWithCorrectAlleleOrdering.java new file mode 100755 index 000000000..83c499144 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidGenotypeWithCorrectAlleleOrdering.java @@ -0,0 +1,125 @@ +/* + * Copyright (c) 2010 The Broad 
Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.gatk.walkers.genotyper; + +import org.broadinstitute.sting.utils.BaseUtils; + +public enum DiploidGenotypeWithCorrectAlleleOrdering { + AA ('A', 'A'), + AC ('A', 'C'), + CC ('C', 'C'), + AG ('A', 'G'), + CG ('C', 'G'), + GG ('G', 'G'), + AT ('A', 'T'), + CT ('C', 'T'), + GT ('G', 'T'), + TT ('T', 'T'); + + public byte base1, base2; + + @Deprecated + private DiploidGenotypeWithCorrectAlleleOrdering(char base1, char base2) { + this((byte)base1, (byte)base2); + } + + private DiploidGenotypeWithCorrectAlleleOrdering(byte base1, byte base2) { + this.base1 = base1; + this.base2 = base2; + } + + public boolean isHomRef(byte r) { + return isHom() && r == base1; + } + + public boolean isHomVar(byte r) { + return isHom() && r != base1; + } + + public boolean isHetRef(byte r) { + if ( base1 == r ) + return r != base2; + else + return base2 == r; + } + + public boolean isHom() { + return ! isHet(); + } + + public boolean isHet() { + return base1 != base2; + } + + /** + * create a diploid genotype, given a character to make into a hom genotype + * @param hom the character to turn into a hom genotype, i.e. 
if it is A, then returned will be AA + * @return the diploid genotype + */ + public static DiploidGenotypeWithCorrectAlleleOrdering createHomGenotype(byte hom) { + int index = BaseUtils.simpleBaseToBaseIndex(hom); + if ( index == -1 ) + throw new IllegalArgumentException(hom + " is not a valid base character"); + return conversionMatrix[index][index]; + } + + /** + * create a diploid genotype, given 2 chars which may not necessarily be ordered correctly + * @param base1 base1 + * @param base2 base2 + * @return the diploid genotype + */ + public static DiploidGenotypeWithCorrectAlleleOrdering createDiploidGenotype(byte base1, byte base2) { + int index1 = BaseUtils.simpleBaseToBaseIndex(base1); + if ( index1 == -1 ) + throw new IllegalArgumentException(base1 + " is not a valid base character"); + int index2 = BaseUtils.simpleBaseToBaseIndex(base2); + if ( index2 == -1 ) + throw new IllegalArgumentException(base2 + " is not a valid base character"); + return conversionMatrix[index1][index2]; + } + + /** + * create a diploid genotype, given 2 base indexes which may not necessarily be ordered correctly + * @param baseIndex1 base1 + * @param baseIndex2 base2 + * @return the diploid genotype + */ + public static DiploidGenotypeWithCorrectAlleleOrdering createDiploidGenotype(int baseIndex1, int baseIndex2) { + if ( baseIndex1 == -1 ) + throw new IllegalArgumentException(baseIndex1 + " does not represent a valid base character"); + if ( baseIndex2 == -1 ) + throw new IllegalArgumentException(baseIndex2 + " does not represent a valid base character"); + return conversionMatrix[baseIndex1][baseIndex2]; + } + + private static final DiploidGenotypeWithCorrectAlleleOrdering[][] conversionMatrix = { + { DiploidGenotypeWithCorrectAlleleOrdering.AA, DiploidGenotypeWithCorrectAlleleOrdering.AC, DiploidGenotypeWithCorrectAlleleOrdering.AG, DiploidGenotypeWithCorrectAlleleOrdering.AT }, + { DiploidGenotypeWithCorrectAlleleOrdering.AC, DiploidGenotypeWithCorrectAlleleOrdering.CC, 
DiploidGenotypeWithCorrectAlleleOrdering.CG, DiploidGenotypeWithCorrectAlleleOrdering.CT }, + { DiploidGenotypeWithCorrectAlleleOrdering.AG, DiploidGenotypeWithCorrectAlleleOrdering.CG, DiploidGenotypeWithCorrectAlleleOrdering.GG, DiploidGenotypeWithCorrectAlleleOrdering.GT }, + { DiploidGenotypeWithCorrectAlleleOrdering.AT, DiploidGenotypeWithCorrectAlleleOrdering.CT, DiploidGenotypeWithCorrectAlleleOrdering.GT, DiploidGenotypeWithCorrectAlleleOrdering.TT } + }; +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidIndelGenotypePriors.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidIndelGenotypePriors.java deleted file mode 100755 index d8c911092..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidIndelGenotypePriors.java +++ /dev/null @@ -1,122 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.genotyper; - -import org.broadinstitute.sting.gatk.walkers.indels.HaplotypeIndelErrorModel; -import org.broadinstitute.sting.utils.MathUtils; - -/** - * Created by IntelliJ IDEA. - * User: delangel - * Date: Sep 30, 2010 - * Time: 1:47:55 PM - * To change this template use File | Settings | File Templates. 
- */ -public class DiploidIndelGenotypePriors implements GenotypePriors { - // -------------------------------------------------------------------------------------------------------------- - // - // Constants and static information - // - // -------------------------------------------------------------------------------------------------------------- - public static final double INDEL_HETEROZYGOSITY = 1e-4; - - private final static double[] flatPriors = new double[DiploidGenotype.values().length]; - - // -------------------------------------------------------------------------------------------------------------- - // - // Diploid priors - // - // -------------------------------------------------------------------------------------------------------------- - private double[] priors = null; - - /** - * Create a new DiploidGenotypePriors object with flat priors for each diploid genotype - */ - public DiploidIndelGenotypePriors() { - priors = flatPriors.clone(); - } - - public DiploidIndelGenotypePriors(double indelHeterozygosity, int eventLength, int haplotypeSize) { - double varPrior = getHaplotypePriors(indelHeterozygosity, eventLength, haplotypeSize); - priors[2] = Math.log10(varPrior*varPrior); - priors[1] = Math.log10(2*varPrior*(1-varPrior)); - priors[0] = Math.log10((1-varPrior)*(1-varPrior)); - - } - - - /** - * Returns an array of priors for each genotype, indexed by DiploidGenotype.ordinal values(). - * - * @return log10 prior as a double array - */ - public double[] getPriors() { - return priors; - } - - /** - * Returns the prior associated with DiploidGenotype g - * @param g - * @return log10 prior as a double - */ - public double getPrior(DiploidGenotype g) { - return getPriors()[g.ordinal()]; - } - - public double getHeterozygosity() { return INDEL_HETEROZYGOSITY; } - - public boolean validate(boolean throwException) { - try { - - for ( DiploidGenotype g : DiploidGenotype.values() ) { - int i = g.ordinal(); - if ( ! 
MathUtils.wellFormedDouble(priors[i]) || ! MathUtils.isNegativeOrZero(priors[i]) ) { - String bad = String.format("Prior %f is badly formed %b", priors[i], MathUtils.isNegativeOrZero(priors[i])); - throw new IllegalStateException(String.format("At %s: %s", g.toString(), bad)); - } - } - } catch ( IllegalStateException e ) { - if ( throwException ) - throw new RuntimeException(e); - else - return false; - } - - return true; - } - - public double getHaplotypePriors(double indelHeterozygosity, int eventLength, int haplotypeSize) { - // compute prior likelihoods on haplotypes. - // In general, we'll assume: even spread of indels throughout genome (not true, but simplifying assumption), - // and memoryless spread (i.e. probability that an indel lies in an interval A is independent of probability of - // another indel lying in interval B iff A and B don't overlap), then we can approximate inter-indel distances - // by an exponential distribution of mean 1/theta (theta = heterozygozity), and the number of indels on an interval - // of size L is Poisson-distributed with parameter lambda = theta*L. - - // Since typically, for small haplotype sizes and human heterozygozity, lambda will be <<1, we'll further approximate it - // by assuming that only one indel can happen in a particular interval, with Pr(indel present) = lambda*exp(-lambda), and - // pr(no indel) = 1-lambda*exp(-lambda) ~= exp(-lambda) for small lambda. - - // We also assume that a deletion is equally likely as an insertion (empirical observation, see e.g. Mills et al, Genome Research 2006) - // and we assume the following frequency spectrum for indel sizes Pr(event Length = L)= K*abs(L)^(-1.89)*10^(-0.015*abs(L)), - // taking positive L = insertions, negative L = deletions. K turns out to be about 1.5716 for probabilities to sum to one. - // so -10*log10(Pr event Length = L) =-10*log10(K)+ 18.9*log10(abs(L)) + 0.15*abs(L). 
- // Hence, Pr(observe event size = L in interval) ~ Pr(observe event L | event present) Pr (event present in interval) - // and -10*log10(above) = -10*log10(K)+ 18.9*log10(abs(L)) + 0.15*abs(L) - 10*log10(theta*L), and we ignore terms that would be - // added to ref hypothesis. - // Equation above is prior model. - - double lambda = (double)haplotypeSize * indelHeterozygosity; - return HaplotypeIndelErrorModel.probToQual(lambda)-HaplotypeIndelErrorModel.probToQual(eventLength)*1.89 + 0.15*eventLength - + HaplotypeIndelErrorModel.probToQual(1.5716)+ HaplotypeIndelErrorModel.probToQual(0.5); - - - - } - - - static { - for ( DiploidGenotype g : DiploidGenotype.values() ) { - flatPriors[g.ordinal()] = Math.log10(1.0 / DiploidGenotype.values().length); - } - } -} - diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java index 7143606ae..76849a4dd 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java @@ -70,6 +70,7 @@ import static java.lang.Math.pow; * From then on, you can call any of the add() routines to update the likelihoods and posteriors in the above * model. 
*/ +@Deprecated public class DiploidSNPGenotypeLikelihoods implements Cloneable { public final static double DEFAULT_PCR_ERROR_RATE = 1e-4; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering.java new file mode 100755 index 000000000..5d6cf9f7d --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering.java @@ -0,0 +1,489 @@ +/* + * Copyright (c) 2010. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.gatk.walkers.genotyper; + +import net.sf.samtools.SAMUtils; +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.fragments.FragmentCollection; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; + +import java.util.List; + +import static java.lang.Math.log10; +import static java.lang.Math.pow; + +/** + * Stable, error checking version of the Bayesian genotyper. Useful for calculating the likelihoods, priors, + * and posteriors given a pile of bases and quality scores + * + * Suppose we have bases b1, b2, ..., bN with qualities scores q1, q2, ..., qN. This object + * calculates: + * + * P(G | D) = P(G) * P(D | G) + * + * where + * + * P(D | G) = sum_i log10 P(bi | G) + * + * and + * + * P(bi | G) = 1 - P(error | q1) if bi is in G + * = P(error | q1) / 3 if bi is not in G + * + * for homozygous genotypes and for heterozygous genotypes: + * + * P(bi | G) = 1 - P(error | q1) / 2 + P(error | q1) / 6 if bi is in G + * = P(error | q1) / 3 if bi is not in G + * + * for each of the 10 unique diploid genotypes AA, AC, AG, .., TT + * + * Everything is stored as arrays indexed by DiploidGenotype.ordinal() values in log10 space. + * + * The priors contain the relative probabilities of each genotype, and must be provided at object creation. + * From then on, you can call any of the add() routines to update the likelihoods and posteriors in the above + * model. 
+ */ +public class DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering implements Cloneable { + + public final static double DEFAULT_PCR_ERROR_RATE = 1e-4; + + protected final static int FIXED_PLOIDY = 2; + protected final static int MAX_PLOIDY = FIXED_PLOIDY + 1; + protected final static double ploidyAdjustment = log10(FIXED_PLOIDY); + protected final static double log10_3 = log10(3.0); + + protected boolean VERBOSE = false; + + // + // The fundamental data arrays associated with a Genotype Likelihoods object + // + protected double[] log10Likelihoods = null; + + // TODO: don't calculate this each time through + protected double log10_PCR_error_3; + protected double log10_1_minus_PCR_error; + + /** + * Create a new GenotypeLikelhoods object with given PCR error rate for each diploid genotype + * + * @param PCR_error_rate the PCR error rate + */ + public DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering(double PCR_error_rate) { + log10_PCR_error_3 = log10(PCR_error_rate) - log10_3; + log10_1_minus_PCR_error = log10(1.0 - PCR_error_rate); + setToZero(); + } + + /** + * Cloning of the object + * @return clone + * @throws CloneNotSupportedException + */ + protected Object clone() throws CloneNotSupportedException { + DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering c = (DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering)super.clone(); + c.log10Likelihoods = log10Likelihoods.clone(); + return c; + } + + protected void setToZero() { + log10Likelihoods = genotypeZeros.clone(); // likelihoods are all zeros + } + + /** + * Returns an array of log10 likelihoods for each genotype, indexed by DiploidGenotype.ordinal values() + * @return likelihoods array + */ + public double[] getLikelihoods() { + return log10Likelihoods; + } + + // ------------------------------------------------------------------------------------- + // + // add() routines. 
These are the workhorse routines for calculating the overall genotype + // likelihoods given observed bases and reads. Includes high-level operators all the + // way down to single base and qual functions. + // + // ------------------------------------------------------------------------------------- + + /** + * Updates likelihoods and posteriors to reflect the additional observations contained within the + * read-based pileup up by calling add(observedBase, qualityScore) for each base / qual in the + * pileup + * + * @param pileup read pileup + * @param ignoreBadBases should we ignore bad bases? + * @param capBaseQualsAtMappingQual should we cap a base's quality by its read's mapping quality? + * @param minBaseQual the minimum base quality at which to consider a base valid + * @return the number of good bases found in the pileup + */ + public int add(ReadBackedPileup pileup, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual, int minBaseQual) { + int n = 0; + + // for each fragment, add to the likelihoods + FragmentCollection fpile = pileup.toFragments(); + + for ( PileupElement p : fpile.getSingletonReads() ) + n += add(p, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual); + + for ( List overlappingPair : fpile.getOverlappingPairs() ) + n += add(overlappingPair, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual); + + return n; + } + + public int add(PileupElement elt, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual, int minBaseQual) { + byte obsBase = elt.getBase(); + byte qual = qualToUse(elt, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual); + if ( qual == 0 ) + return 0; + + if ( elt.getRead().isReducedRead() ) { + // reduced read representation + if ( BaseUtils.isRegularBase( obsBase )) { + int representativeCount = elt.getRepresentativeCount(); + add(obsBase, qual, (byte)0, (byte)0, representativeCount); // fast calculation of n identical likelihoods + return representativeCount; // we added nObs bases here + } + + // odd 
bases or deletions => don't use them + return 0; + } + + return add(obsBase, qual, (byte)0, (byte)0, 1); + } + + public int add(List overlappingPair, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual, int minBaseQual) { + final PileupElement p1 = overlappingPair.get(0); + final PileupElement p2 = overlappingPair.get(1); + + final byte observedBase1 = p1.getBase(); + final byte qualityScore1 = qualToUse(p1, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual); + final byte observedBase2 = p2.getBase(); + final byte qualityScore2 = qualToUse(p2, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual); + + if ( qualityScore1 == 0 ) { + if ( qualityScore2 == 0 ) // abort early if we didn't see any good bases + return 0; + else { + return add(observedBase2, qualityScore2, (byte)0, (byte)0); + } + } else { + return add(observedBase1, qualityScore1, observedBase2, qualityScore2); + } + } + + /** + * + * @param obsBase1 first observed base + * @param qual1 base qual of first observed base + * @param obsBase2 second observed base + * @param qual2 base qual of second observed base; can be 0, indicating no second base was observed for this fragment + * @param nObs the number of times this quad of values was seen. Generally 1, but reduced reads can have nObs > 1 for synthetic reads + * @return 0 if the base is bad, 1 otherwise + */ + private int add(byte obsBase1, byte qual1, byte obsBase2, byte qual2, int nObs) { + // TODO-- Right now we assume that there are at most 2 reads per fragment. This assumption is fine + // TODO-- given the current state of next-gen sequencing, but may need to be fixed in the future. + // TODO-- However, when that happens, we'll need to be a lot smarter about the caching we do here. + + // Just look up the cached result if it's available, or compute and store it + DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering gl; + if ( ! 
inCache(obsBase1, qual1, obsBase2, qual2, FIXED_PLOIDY) ) { + gl = calculateCachedGenotypeLikelihoods(obsBase1, qual1, obsBase2, qual2, FIXED_PLOIDY); + } else { + gl = getCachedGenotypeLikelihoods(obsBase1, qual1, obsBase2, qual2, FIXED_PLOIDY); + } + + // for bad bases, there are no likelihoods + if ( gl == null ) + return 0; + + double[] likelihoods = gl.getLikelihoods(); + + for ( DiploidGenotypeWithCorrectAlleleOrdering g : DiploidGenotypeWithCorrectAlleleOrdering.values() ) { + double likelihood = likelihoods[g.ordinal()]; + log10Likelihoods[g.ordinal()] += likelihood * nObs; + } + + return 1; + } + + private int add(byte obsBase1, byte qual1, byte obsBase2, byte qual2) { + return add(obsBase1, qual1, obsBase2, qual2, 1); + } + + // ------------------------------------------------------------------------------------- + // + // Dealing with the cache routines + // + // ------------------------------------------------------------------------------------- + + static DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering[][][][][] CACHE = new DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering[BaseUtils.BASES.length][QualityUtils.MAX_QUAL_SCORE+1][BaseUtils.BASES.length+1][QualityUtils.MAX_QUAL_SCORE+1][MAX_PLOIDY]; + + protected boolean inCache(byte observedBase1, byte qualityScore1, byte observedBase2, byte qualityScore2, int ploidy) { + return getCache(CACHE, observedBase1, qualityScore1, observedBase2, qualityScore2, ploidy) != null; + } + + protected DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering getCachedGenotypeLikelihoods(byte observedBase1, byte qualityScore1, byte observedBase2, byte qualityScore2, int ploidy) { + DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering gl = getCache(CACHE, observedBase1, qualityScore1, observedBase2, qualityScore2, ploidy); + if ( gl == null ) + throw new RuntimeException(String.format("BUG: trying to fetch an unset cached genotype likelihood at base1=%c, qual1=%d, base2=%c, qual2=%d, ploidy=%d", + 
observedBase1, qualityScore1, observedBase2, qualityScore2, ploidy)); + return gl; + } + + protected DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering calculateCachedGenotypeLikelihoods(byte observedBase1, byte qualityScore1, byte observedBase2, byte qualityScore2, int ploidy) { + DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering gl = calculateGenotypeLikelihoods(observedBase1, qualityScore1, observedBase2, qualityScore2); + setCache(CACHE, observedBase1, qualityScore1, observedBase2, qualityScore2, ploidy, gl); + return gl; + } + + protected void setCache( DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering[][][][][] cache, + byte observedBase1, byte qualityScore1, byte observedBase2, byte qualityScore2, int ploidy, + DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering val ) { + int i = BaseUtils.simpleBaseToBaseIndex(observedBase1); + int j = qualityScore1; + int k = qualityScore2 != 0 ? BaseUtils.simpleBaseToBaseIndex(observedBase2) : BaseUtils.BASES.length; + int l = qualityScore2; + int m = ploidy; + + cache[i][j][k][l][m] = val; + } + + protected DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering getCache(DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering[][][][][] cache, + byte observedBase1, byte qualityScore1, byte observedBase2, byte qualityScore2, int ploidy) { + int i = BaseUtils.simpleBaseToBaseIndex(observedBase1); + int j = qualityScore1; + int k = qualityScore2 != 0 ? 
BaseUtils.simpleBaseToBaseIndex(observedBase2) : BaseUtils.BASES.length; + int l = qualityScore2; + int m = ploidy; + return cache[i][j][k][l][m]; + } + + protected DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering calculateGenotypeLikelihoods(byte observedBase1, byte qualityScore1, byte observedBase2, byte qualityScore2) { + double[] log10FourBaseLikelihoods = computeLog10Likelihoods(observedBase1, qualityScore1, observedBase2, qualityScore2); + + try { + + DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering gl = (DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering)this.clone(); + gl.setToZero(); + + // we need to adjust for ploidy. We take the raw p(obs | chrom) / ploidy, which is -log10(ploidy) in log space + for ( DiploidGenotypeWithCorrectAlleleOrdering g : DiploidGenotypeWithCorrectAlleleOrdering.values() ) { + + // todo assumes ploidy is 2 -- should be generalized. Obviously the below code can be turned into a loop + double p_base = 0.0; + p_base += pow(10, log10FourBaseLikelihoods[BaseUtils.simpleBaseToBaseIndex(g.base1)] - ploidyAdjustment); + p_base += pow(10, log10FourBaseLikelihoods[BaseUtils.simpleBaseToBaseIndex(g.base2)] - ploidyAdjustment); + + final double likelihood = log10(p_base); + gl.log10Likelihoods[g.ordinal()] += likelihood; + } + + if ( VERBOSE ) { + for ( DiploidGenotypeWithCorrectAlleleOrdering g : DiploidGenotypeWithCorrectAlleleOrdering.values() ) { System.out.printf("%s\t", g); } + System.out.println(); + for ( DiploidGenotypeWithCorrectAlleleOrdering g : DiploidGenotypeWithCorrectAlleleOrdering.values() ) { System.out.printf("%.2f\t", gl.log10Likelihoods[g.ordinal()]); } + System.out.println(); + } + + return gl; + + } catch ( CloneNotSupportedException e ) { + throw new RuntimeException(e); + } + } + + /** + * Updates likelihoods and posteriors to reflect an additional observation of observedBase with + * qualityScore. 
+ * + * @param observedBase1 the base observed on the 1st read of the fragment + * @param qualityScore1 the qual of the base on the 1st read of the fragment, or zero if NA + * @param observedBase2 the base observed on the 2nd read of the fragment + * @param qualityScore2 the qual of the base on the 2nd read of the fragment, or zero if NA + * @return likelihoods for this observation or null if the base was not considered good enough to add to the likelihoods (Q0 or 'N', for example) + */ + protected double[] computeLog10Likelihoods(byte observedBase1, byte qualityScore1, byte observedBase2, byte qualityScore2) { + double[] log10FourBaseLikelihoods = baseZeros.clone(); + + for ( byte trueBase : BaseUtils.BASES ) { + double likelihood = 0.0; + + for ( byte fragmentBase : BaseUtils.BASES ) { + double log10FragmentLikelihood = (trueBase == fragmentBase ? log10_1_minus_PCR_error : log10_PCR_error_3); + if ( qualityScore1 != 0 ) { + log10FragmentLikelihood += log10PofObservingBaseGivenChromosome(observedBase1, fragmentBase, qualityScore1); + } + if ( qualityScore2 != 0 ) { + log10FragmentLikelihood += log10PofObservingBaseGivenChromosome(observedBase2, fragmentBase, qualityScore2); + } + + //if ( VERBOSE ) { + // System.out.printf(" L(%c | b=%s, Q=%d) = %f / %f%n", + // observedBase, trueBase, qualityScore, pow(10,likelihood) * 100, likelihood); + //} + + likelihood += pow(10, log10FragmentLikelihood); + } + + log10FourBaseLikelihoods[BaseUtils.simpleBaseToBaseIndex(trueBase)] = log10(likelihood); + } + + return log10FourBaseLikelihoods; + } + + /** + * + * @param observedBase observed base + * @param chromBase target base + * @param qual base quality + * @return log10 likelihood + */ + protected double log10PofObservingBaseGivenChromosome(byte observedBase, byte chromBase, byte qual) { + + double logP; + + if ( observedBase == chromBase ) { + // the base is consistent with the chromosome -- it's 1 - e + //logP = oneMinusData[qual]; + double e = pow(10, (qual / -10.0)); + 
logP = log10(1.0 - e); + } else { + // the base is inconsistent with the chromosome -- it's e * P(chromBase | observedBase is an error) + logP = qual / -10.0 + (-log10_3); + } + + //System.out.printf("%c %c %d => %f%n", observedBase, chromBase, qual, logP); + return logP; + } + + /** + * Helper function that returns the phred-scaled base quality score we should use for calculating + * likelihoods for a pileup element. May return 0 to indicate that the observation is bad, and may + * cap the quality score by the mapping quality of the read itself. + * + * @param p Pileup element + * @param ignoreBadBases Should we ignore bad bases? + * @param capBaseQualsAtMappingQual Should we cap the base qualities at the mapping quality of the read? + * @param minBaseQual Minimum allowed base quality + * @return the actual base quality to use + */ + private static byte qualToUse(PileupElement p, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual, int minBaseQual) { + if ( ignoreBadBases && !BaseUtils.isRegularBase( p.getBase() ) ) + return 0; + + byte qual = p.getQual(); + + if ( qual > SAMUtils.MAX_PHRED_SCORE ) + throw new UserException.MalformedBAM(p.getRead(), String.format("the maximum allowed quality score is %d, but a quality of %d was observed in read %s. 
Perhaps your BAM incorrectly encodes the quality scores in Sanger format; see http://en.wikipedia.org/wiki/FASTQ_format for more details", SAMUtils.MAX_PHRED_SCORE, qual, p.getRead().getReadName())); + if ( capBaseQualsAtMappingQual ) + qual = (byte)Math.min((int)qual, p.getMappingQual()); + if ( (int)qual < minBaseQual ) + qual = (byte)0; + + return qual; + } + + // ----------------------------------------------------------------------------------------------------------------- + // + // + // helper routines + // + // + // ----------------------------------------------------------------------------------------------------------------- + + /** + * Return a string representation of this object in a moderately usable form + * + * @return string representation + */ + public String toString() { + double sum = 0; + StringBuilder s = new StringBuilder(); + for (DiploidGenotypeWithCorrectAlleleOrdering g : DiploidGenotypeWithCorrectAlleleOrdering.values()) { + s.append(String.format("%s %.10f ", g, log10Likelihoods[g.ordinal()])); + sum += Math.pow(10,log10Likelihoods[g.ordinal()]); + } + s.append(String.format(" %f", sum)); + return s.toString(); + } + + // ----------------------------------------------------------------------------------------------------------------- + // + // + // Validation routines + // + // + // ----------------------------------------------------------------------------------------------------------------- + + public boolean validate() { + return validate(true); + } + + public boolean validate(boolean throwException) { + try { + for ( DiploidGenotypeWithCorrectAlleleOrdering g : DiploidGenotypeWithCorrectAlleleOrdering.values() ) { + String bad = null; + + int i = g.ordinal(); + if ( ! MathUtils.wellFormedDouble(log10Likelihoods[i]) || ! 
MathUtils.isNegativeOrZero(log10Likelihoods[i]) ) { + bad = String.format("Likelihood %f is badly formed", log10Likelihoods[i]); + } + + if ( bad != null ) { + throw new IllegalStateException(String.format("At %s: %s", g.toString(), bad)); + } + } + } catch ( IllegalStateException e ) { + if ( throwException ) + throw new RuntimeException(e); + else + return false; + } + + return true; + } + + // + // Constant static data + // + private final static double[] genotypeZeros = new double[DiploidGenotypeWithCorrectAlleleOrdering.values().length]; + private final static double[] baseZeros = new double[BaseUtils.BASES.length]; + + static { + for ( DiploidGenotypeWithCorrectAlleleOrdering g : DiploidGenotypeWithCorrectAlleleOrdering.values() ) { + genotypeZeros[g.ordinal()] = 0.0; + } + for ( byte base : BaseUtils.BASES ) { + baseZeros[BaseUtils.simpleBaseToBaseIndex(base)] = 0.0; + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypePriors.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypePriors.java index 71854591f..86079b6e6 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypePriors.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypePriors.java @@ -29,6 +29,7 @@ import org.broadinstitute.sting.utils.MathUtils; import java.util.Arrays; +@Deprecated public class DiploidSNPGenotypePriors implements GenotypePriors { // -------------------------------------------------------------------------------------------------------------- // diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java index 6c7dc0dcd..608a29e38 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java +++ 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java @@ -43,7 +43,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { } public List getLog10PNonRef(final VariantContext vc, - final double[][] log10AlleleFrequencyPriors, + final double[] log10AlleleFrequencyPriors, final AlleleFrequencyCalculationResult result) { GenotypesContext GLs = vc.getGenotypes(); @@ -56,26 +56,14 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { alleles = new ArrayList(MAX_ALTERNATE_ALLELES_TO_GENOTYPE + 1); alleles.add(vc.getReference()); alleles.addAll(chooseMostLikelyAlternateAlleles(vc, MAX_ALTERNATE_ALLELES_TO_GENOTYPE)); - GLs = UnifiedGenotyperEngine.subsetAlleles(vc, alleles, false); + GLs = VariantContextUtils.subsetDiploidAlleles(vc, alleles, false); } - //linearExact(GLs, log10AlleleFrequencyPriors[0], log10AlleleFrequencyLikelihoods, log10AlleleFrequencyPosteriors); linearExactMultiAllelic(GLs, alleles.size() - 1, log10AlleleFrequencyPriors, result); return alleles; } - private static final class LikelihoodSum implements Comparable { - public double sum = 0.0; - public Allele allele; - - public LikelihoodSum(Allele allele) { this.allele = allele; } - - public int compareTo(LikelihoodSum other) { - final double diff = sum - other.sum; - return ( diff < 0.0 ) ? 1 : (diff > 0.0 ) ? 
-1 : 0; - } - } private static final int PL_INDEX_OF_HOM_REF = 0; private static final List chooseMostLikelyAlternateAlleles(VariantContext vc, int numAllelesToChoose) { @@ -113,22 +101,6 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { return orderedBestAlleles; } - private static final ArrayList getGLs(GenotypesContext GLs) { - ArrayList genotypeLikelihoods = new ArrayList(GLs.size()); - - genotypeLikelihoods.add(new double[]{0.0,0.0,0.0}); // dummy - for ( Genotype sample : GLs.iterateInSampleNameOrder() ) { - if ( sample.hasLikelihoods() ) { - double[] gls = sample.getLikelihoods().getAsVector(); - - if ( MathUtils.sum(gls) < UnifiedGenotyperEngine.SUM_GL_THRESH_NOCALL ) - genotypeLikelihoods.add(gls); - } - } - - return genotypeLikelihoods; - } - // ------------------------------------------------------------------------------------- // // Multi-allelic implementation. @@ -153,7 +125,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { @Override public boolean equals(Object obj) { - return (obj instanceof ExactACcounts) ? Arrays.equals(counts, ((ExactACcounts)obj).counts) : false; + return (obj instanceof ExactACcounts) && Arrays.equals(counts, ((ExactACcounts)obj).counts); } @Override @@ -203,24 +175,13 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { } public boolean equals(Object obj) { - return (obj instanceof ExactACset) ? 
ACcounts.equals(((ExactACset)obj).ACcounts) : false; + return (obj instanceof ExactACset) && ACcounts.equals(((ExactACset)obj).ACcounts); } } - // TODO -- remove me public static void linearExactMultiAllelic(final GenotypesContext GLs, final int numAlternateAlleles, - final double[][] log10AlleleFrequencyPriors, - final AlleleFrequencyCalculationResult result, - final boolean foo) { - linearExactMultiAllelic(GLs, numAlternateAlleles, log10AlleleFrequencyPriors, result); - } - - - - public static void linearExactMultiAllelic(final GenotypesContext GLs, - final int numAlternateAlleles, - final double[][] log10AlleleFrequencyPriors, + final double[] log10AlleleFrequencyPriors, final AlleleFrequencyCalculationResult result) { final ArrayList genotypeLikelihoods = getGLs(GLs); @@ -272,7 +233,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { final int numChr, final LinkedList ACqueue, final HashMap indexesToACset, - final double[][] log10AlleleFrequencyPriors, + final double[] log10AlleleFrequencyPriors, final AlleleFrequencyCalculationResult result) { //if ( DEBUG ) @@ -360,7 +321,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { private static void computeLofK(final ExactACset set, final ArrayList genotypeLikelihoods, - final double[][] log10AlleleFrequencyPriors, + final double[] log10AlleleFrequencyPriors, final AlleleFrequencyCalculationResult result) { set.log10Likelihoods[0] = 0.0; // the zero case @@ -370,47 +331,39 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { if ( totalK == 0 ) { for ( int j = 1; j < set.log10Likelihoods.length; j++ ) set.log10Likelihoods[j] = set.log10Likelihoods[j-1] + genotypeLikelihoods.get(j)[HOM_REF_INDEX]; + + final double log10Lof0 = set.log10Likelihoods[set.log10Likelihoods.length-1]; + result.setLog10LikelihoodOfAFzero(log10Lof0); + result.setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]); + return; } - // k > 0 
for at least one k - else { - // the non-AA possible conformations were dealt with by pushes from dependent sets; - // now deal with the AA case (which depends on previous cells in this column) and then update the L(j,k) value - for ( int j = 1; j < set.log10Likelihoods.length; j++ ) { - if ( totalK < 2*j-1 ) { - final double[] gl = genotypeLikelihoods.get(j); - final double conformationValue = MathUtils.log10Cache[2*j-totalK] + MathUtils.log10Cache[2*j-totalK-1] + set.log10Likelihoods[j-1] + gl[HOM_REF_INDEX]; - set.log10Likelihoods[j] = MathUtils.approximateLog10SumLog10(set.log10Likelihoods[j], conformationValue); - } + // if we got here, then k > 0 for at least one k. + // the non-AA possible conformations were already dealt with by pushes from dependent sets; + // now deal with the AA case (which depends on previous cells in this column) and then update the L(j,k) value + for ( int j = 1; j < set.log10Likelihoods.length; j++ ) { - final double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1]; - set.log10Likelihoods[j] = set.log10Likelihoods[j] - logDenominator; + if ( totalK < 2*j-1 ) { + final double[] gl = genotypeLikelihoods.get(j); + final double conformationValue = MathUtils.log10Cache[2*j-totalK] + MathUtils.log10Cache[2*j-totalK-1] + set.log10Likelihoods[j-1] + gl[HOM_REF_INDEX]; + set.log10Likelihoods[j] = MathUtils.approximateLog10SumLog10(set.log10Likelihoods[j], conformationValue); } + + final double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1]; + set.log10Likelihoods[j] = set.log10Likelihoods[j] - logDenominator; } - final double log10LofK = set.log10Likelihoods[set.log10Likelihoods.length-1]; + double log10LofK = set.log10Likelihoods[set.log10Likelihoods.length-1]; - // determine the power of theta to use - int nonRefAlleles = 0; - for ( int i = 0; i < set.ACcounts.getCounts().length; i++ ) { - if ( set.ACcounts.getCounts()[i] > 0 ) - nonRefAlleles++; - } - - // for k=0, we don't want to put that 
value into the likelihoods/posteriors matrix, but instead want to set the value in the results object - if ( nonRefAlleles == 0 ) { - result.log10LikelihoodOfAFzero = log10LofK; - result.log10PosteriorOfAFzero = log10LofK + log10AlleleFrequencyPriors[0][0]; - } else { - // update the likelihoods/posteriors vectors which are collapsed views of each of the various ACs - for ( int i = 0; i < set.ACcounts.getCounts().length; i++ ) { - int AC = set.ACcounts.getCounts()[i]; - result.log10AlleleFrequencyLikelihoods[i][AC] = MathUtils.approximateLog10SumLog10(result.log10AlleleFrequencyLikelihoods[i][AC], log10LofK); - - final double prior = log10AlleleFrequencyPriors[nonRefAlleles-1][AC]; - result.log10AlleleFrequencyPosteriors[i][AC] = MathUtils.approximateLog10SumLog10(result.log10AlleleFrequencyPosteriors[i][AC], log10LofK + prior); - } + // update the MLE if necessary + result.updateMLEifNeeded(log10LofK, set.ACcounts.counts); + + // apply the priors over each alternate allele + for ( final int ACcount : set.ACcounts.getCounts() ) { + if ( ACcount > 0 ) + log10LofK += log10AlleleFrequencyPriors[ACcount]; } + result.updateMAPifNeeded(log10LofK, set.ACcounts.counts); } private static void pushData(final ExactACset targetSet, @@ -466,6 +419,12 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { return coeff; } + public GenotypesContext subsetAlleles(final VariantContext vc, + final List allelesToUse, + final boolean assignGenotypes, + final int ploidy) { + return VariantContextUtils.subsetDiploidAlleles(vc, allelesToUse, assignGenotypes); + } // ------------------------------------------------------------------------------------- // diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java index fb2428258..f8924bed3 100755 --- 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java @@ -47,9 +47,17 @@ import java.util.Map; */ public abstract class GenotypeLikelihoodsCalculationModel implements Cloneable { +/* public enum Model { + SNP, + INDEL, + BOTH + } + */ public enum Model { SNP, INDEL, + POOLSNP, + POOLINDEL, BOTH } @@ -60,7 +68,7 @@ public abstract class GenotypeLikelihoodsCalculationModel implements Cloneable { GENOTYPE_GIVEN_ALLELES } - protected UnifiedArgumentCollection UAC; + protected final UnifiedArgumentCollection UAC; protected Logger logger; /** @@ -70,7 +78,7 @@ public abstract class GenotypeLikelihoodsCalculationModel implements Cloneable { */ protected GenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) { if ( logger == null || UAC == null ) throw new ReviewedStingException("Bad arguments"); - this.UAC = UAC.clone(); + this.UAC = UAC; this.logger = logger; } @@ -81,7 +89,6 @@ public abstract class GenotypeLikelihoodsCalculationModel implements Cloneable { * @param ref reference context * @param contexts stratified alignment contexts * @param contextType stratified context type - * @param priors priors to use for GLs * @param alternateAllelesToUse the alternate allele to use, null if not set * @param useBAQedPileup should we use the BAQed pileup or the raw one? 
* @param locParser Genome Loc Parser @@ -91,7 +98,6 @@ public abstract class GenotypeLikelihoodsCalculationModel implements Cloneable { final ReferenceContext ref, final Map contexts, final AlignmentContextUtils.ReadOrientation contextType, - final GenotypePriors priors, final List alternateAllelesToUse, final boolean useBAQedPileup, final GenomeLocParser locParser); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java index e41998cb3..31decbb79 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java @@ -35,16 +35,9 @@ import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.Haplotype; -import org.broadinstitute.sting.utils.clipping.ReadClipper; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; -import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.utils.exceptions.StingException; -import org.broadinstitute.sting.utils.pileup.ExtendedEventPileupElement; import org.broadinstitute.sting.utils.pileup.PileupElement; -import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.sting.utils.sam.ReadUtils; import org.broadinstitute.sting.utils.variantcontext.*; import java.util.*; @@ -52,11 +45,9 @@ import java.util.*; public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel { private final int HAPLOTYPE_SIZE; - private final int 
minIndelCountForGenotyping; private final boolean getAlleleListFromVCF; private boolean DEBUG = false; - private final boolean doMultiAllelicCalls = true; private boolean ignoreSNPAllelesWhenGenotypingIndels = false; private PairHMMIndelErrorModel pairModel; @@ -72,7 +63,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood // gdebug removeme // todo -cleanup private GenomeLoc lastSiteVisited; - private ArrayList alleleList; + private List alleleList = new ArrayList(); static { indelLikelihoodMap.set(new HashMap>()); @@ -83,204 +74,19 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood super(UAC, logger); pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY, UAC.INDEL_GAP_CONTINUATION_PENALTY, UAC.OUTPUT_DEBUG_INDEL_INFO, !UAC.DONT_DO_BANDED_INDEL_COMPUTATION); - alleleList = new ArrayList(); getAlleleListFromVCF = UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES; - minIndelCountForGenotyping = UAC.MIN_INDEL_COUNT_FOR_GENOTYPING; HAPLOTYPE_SIZE = UAC.INDEL_HAPLOTYPE_SIZE; DEBUG = UAC.OUTPUT_DEBUG_INDEL_INFO; - haplotypeMap = new LinkedHashMap(); ignoreSNPAllelesWhenGenotypingIndels = UAC.IGNORE_SNP_ALLELES; } - - private ArrayList computeConsensusAlleles(ReferenceContext ref, - Map contexts, - AlignmentContextUtils.ReadOrientation contextType, GenomeLocParser locParser) { - Allele refAllele = null, altAllele = null; - GenomeLoc loc = ref.getLocus(); - ArrayList aList = new ArrayList(); - - HashMap consensusIndelStrings = new HashMap(); - - int insCount = 0, delCount = 0; - // quick check of total number of indels in pileup - for (Map.Entry sample : contexts.entrySet()) { - AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType); - - final ReadBackedExtendedEventPileup indelPileup = context.getExtendedEventPileup(); - insCount += indelPileup.getNumberOfInsertions(); - delCount += indelPileup.getNumberOfDeletions(); - } - - if (insCount < 
minIndelCountForGenotyping && delCount < minIndelCountForGenotyping) - return aList; - - for (Map.Entry sample : contexts.entrySet()) { - // todo -- warning, can be duplicating expensive partition here - AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType); - - final ReadBackedExtendedEventPileup indelPileup = context.getExtendedEventPileup(); - - - for (ExtendedEventPileupElement p : indelPileup.toExtendedIterable()) { - //SAMRecord read = p.getRead(); - GATKSAMRecord read = ReadClipper.hardClipAdaptorSequence(p.getRead()); - if (read == null) - continue; - if (ReadUtils.is454Read(read)) { - continue; - } - -/* if (DEBUG && p.isIndel()) { - System.out.format("Read: %s, cigar: %s, aln start: %d, aln end: %d, p.len:%d, Type:%s, EventBases:%s\n", - read.getReadName(),read.getCigar().toString(),read.getAlignmentStart(),read.getAlignmentEnd(), - p.getEventLength(),p.getType().toString(), p.getEventBases()); - } - */ - - String indelString = p.getEventBases(); - if (p.isInsertion()) { - boolean foundKey = false; - // copy of hashmap into temp arrayList - ArrayList> cList = new ArrayList>(); - for (String s : consensusIndelStrings.keySet()) { - cList.add(new Pair(s,consensusIndelStrings.get(s))); - } - - if (read.getAlignmentEnd() == loc.getStart()) { - // first corner condition: a read has an insertion at the end, and we're right at the insertion. - // In this case, the read could have any of the inserted bases and we need to build a consensus - - for (int k=0; k < cList.size(); k++) { - String s = cList.get(k).getFirst(); - int cnt = cList.get(k).getSecond(); - // case 1: current insertion is prefix of indel in hash map - if (s.startsWith(indelString)) { - cList.set(k,new Pair(s,cnt+1)); - foundKey = true; - } - else if (indelString.startsWith(s)) { - // case 2: indel stored in hash table is prefix of current insertion - // In this case, new bases are new key. 
- foundKey = true; - cList.set(k,new Pair(indelString,cnt+1)); - } - } - if (!foundKey) - // none of the above: event bases not supported by previous table, so add new key - cList.add(new Pair(indelString,1)); - - } - else if (read.getAlignmentStart() == loc.getStart()+1) { - // opposite corner condition: read will start at current locus with an insertion - for (int k=0; k < cList.size(); k++) { - String s = cList.get(k).getFirst(); - int cnt = cList.get(k).getSecond(); - if (s.endsWith(indelString)) { - // case 1: current insertion (indelString) is suffix of indel in hash map (s) - cList.set(k,new Pair(s,cnt+1)); - foundKey = true; - } - else if (indelString.endsWith(s)) { - // case 2: indel stored in hash table is prefix of current insertion - // In this case, new bases are new key. - foundKey = true; - cList.set(k,new Pair(indelString,cnt+1)); - } - } - if (!foundKey) - // none of the above: event bases not supported by previous table, so add new key - cList.add(new Pair(indelString,1)); - - - } - else { - // normal case: insertion somewhere in the middle of a read: add count to arrayList - int cnt = consensusIndelStrings.containsKey(indelString)? consensusIndelStrings.get(indelString):0; - cList.add(new Pair(indelString,cnt+1)); - } - - // copy back arrayList into hashMap - consensusIndelStrings.clear(); - for (Pair pair : cList) { - consensusIndelStrings.put(pair.getFirst(),pair.getSecond()); - } - - } - else if (p.isDeletion()) { - indelString = String.format("D%d",p.getEventLength()); - int cnt = consensusIndelStrings.containsKey(indelString)? 
consensusIndelStrings.get(indelString):0; - consensusIndelStrings.put(indelString,cnt+1); - - } - } - - } - - Collection vcs = new ArrayList(); - int maxAlleleCnt = 0; - String bestAltAllele = ""; - - for (String s : consensusIndelStrings.keySet()) { - int curCnt = consensusIndelStrings.get(s), stop = 0; - // if observed count if above minimum threshold, we will genotype this allele - if (curCnt < minIndelCountForGenotyping) - continue; - - if (s.startsWith("D")) { - // get deletion length - int dLen = Integer.valueOf(s.substring(1)); - // get ref bases of accurate deletion - int startIdxInReference = 1 + loc.getStart() - ref.getWindow().getStart(); - stop = loc.getStart() + dLen; - byte[] refBases = Arrays.copyOfRange(ref.getBases(), startIdxInReference, startIdxInReference + dLen); - - if (Allele.acceptableAlleleBases(refBases)) { - refAllele = Allele.create(refBases, true); - altAllele = Allele.create(Allele.NULL_ALLELE_STRING, false); - } - else continue; // don't go on with this allele if refBases are non-standard - } else { - // insertion case - if (Allele.acceptableAlleleBases(s)) { - refAllele = Allele.create(Allele.NULL_ALLELE_STRING, true); - altAllele = Allele.create(s, false); - stop = loc.getStart(); - } - else continue; // go on to next allele if consensus insertion has any non-standard base. 
- } - - - ArrayList vcAlleles = new ArrayList(); - vcAlleles.add(refAllele); - vcAlleles.add(altAllele); - - final VariantContextBuilder builder = new VariantContextBuilder().source(""); - builder.loc(loc.getContig(), loc.getStart(), stop); - builder.alleles(vcAlleles); - builder.referenceBaseForIndel(ref.getBase()); - builder.noGenotypes(); - if (doMultiAllelicCalls) { - vcs.add(builder.make()); - if (vcs.size() >= GenotypeLikelihoods.MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED) - break; - } else if (curCnt > maxAlleleCnt) { - maxAlleleCnt = curCnt; - vcs.clear(); - vcs.add(builder.make()); - } - } - - if (vcs.isEmpty()) - return aList; // nothing else to do, no alleles passed minimum count criterion - - VariantContext mergedVC = VariantContextUtils.simpleMerge(locParser, vcs, null, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, VariantContextUtils.GenotypeMergeType.UNSORTED, false, false, null, false, false); - - aList = new ArrayList(mergedVC.getAlleles()); - - return aList; - + protected List computeConsensusAlleles(ReferenceContext ref, + Map contexts, + AlignmentContextUtils.ReadOrientation contextType, + GenomeLocParser locParser) { + ConsensusAlleleCounter counter = new ConsensusAlleleCounter(locParser, true, UAC.MIN_INDEL_COUNT_FOR_GENOTYPING, UAC.MIN_INDEL_FRACTION_PER_SAMPLE); + return counter.computeConsensusAlleles(ref, contexts, contextType); } private final static EnumSet allowableTypes = EnumSet.of(VariantContext.Type.INDEL, VariantContext.Type.MIXED); @@ -289,7 +95,6 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood final ReferenceContext ref, final Map contexts, final AlignmentContextUtils.ReadOrientation contextType, - final GenotypePriors priors, final List alternateAllelesToUse, final boolean useBAQedPileup, final GenomeLocParser locParser) { @@ -348,8 +153,6 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood // check if there is enough reference window to create 
haplotypes (can be an issue at end of contigs) if (ref.getWindow().getStop() < loc.getStop() + HAPLOTYPE_SIZE) return null; - if (!(priors instanceof DiploidIndelGenotypePriors)) - throw new StingException("Only diploid-based Indel priors are supported in the INDEL GL model"); if (alleleList.isEmpty()) return null; @@ -370,7 +173,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood } final int eventLength = altAllele.getBaseString().length() - refAllele.getBaseString().length(); - final int hsize = (int) ref.getWindow().size() - Math.abs(eventLength) - 1; + final int hsize = ref.getWindow().size() - Math.abs(eventLength) - 1; final int numPrefBases = ref.getLocus().getStart() - ref.getWindow().getStart() + 1; if (hsize <= 0) { @@ -395,26 +198,23 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood for (Map.Entry sample : contexts.entrySet()) { AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType); - ReadBackedPileup pileup = null; - if (context.hasExtendedEventPileup()) - pileup = context.getExtendedEventPileup(); - else if (context.hasBasePileup()) - pileup = context.getBasePileup(); + if (context.hasBasePileup()) { + final ReadBackedPileup pileup = context.getBasePileup(); + if (pileup != null) { + final double[] genotypeLikelihoods = pairModel.computeReadHaplotypeLikelihoods(pileup, haplotypeMap, ref, eventLength, getIndelLikelihoodMap()); + GenotypeLikelihoods likelihoods = GenotypeLikelihoods.fromLog10Likelihoods(genotypeLikelihoods); - if (pileup != null) { - final double[] genotypeLikelihoods = pairModel.computeReadHaplotypeLikelihoods(pileup, haplotypeMap, ref, eventLength, getIndelLikelihoodMap()); - GenotypeLikelihoods likelihoods = GenotypeLikelihoods.fromLog10Likelihoods(genotypeLikelihoods); + HashMap attributes = new HashMap(); + attributes.put(VCFConstants.DEPTH_KEY, getFilteredDepth(pileup)); + 
attributes.put(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, likelihoods); + genotypes.add(new Genotype(sample.getKey(), noCall, Genotype.NO_LOG10_PERROR, null, attributes, false)); - HashMap attributes = new HashMap(); - attributes.put(VCFConstants.DEPTH_KEY, getFilteredDepth(pileup)); - attributes.put(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, likelihoods); - genotypes.add(new Genotype(sample.getKey(), noCall, Genotype.NO_LOG10_PERROR, null, attributes, false)); - - if (DEBUG) { - System.out.format("Sample:%s Alleles:%s GL:", sample.getKey(), alleleList.toString()); - for (int k = 0; k < genotypeLikelihoods.length; k++) - System.out.format("%1.4f ", genotypeLikelihoods[k]); - System.out.println(); + if (DEBUG) { + System.out.format("Sample:%s Alleles:%s GL:", sample.getKey(), alleleList.toString()); + for (int k = 0; k < genotypeLikelihoods.length; k++) + System.out.format("%1.4f ", genotypeLikelihoods[k]); + System.out.println(); + } } } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java index 23d4e4905..3088cf9d2 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java @@ -30,10 +30,13 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.MathUtils; import 
org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; -import org.broadinstitute.sting.utils.exceptions.StingException; +import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; @@ -59,21 +62,18 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC final ReferenceContext ref, final Map contexts, final AlignmentContextUtils.ReadOrientation contextType, - final GenotypePriors priors, final List alternateAllelesToUse, final boolean useBAQedPileup, final GenomeLocParser locParser) { - if ( !(priors instanceof DiploidSNPGenotypePriors) ) - throw new StingException("Only diploid-based SNP priors are supported in the SNP GL model"); - final byte refBase = ref.getBase(); final int indexOfRefBase = BaseUtils.simpleBaseToBaseIndex(refBase); + final Allele refAllele = Allele.create(refBase, true); // start making the VariantContext final GenomeLoc loc = ref.getLocus(); final List alleles = new ArrayList(); - alleles.add(Allele.create(refBase, true)); + alleles.add(refAllele); final VariantContextBuilder builder = new VariantContextBuilder("UG_call", loc.getContig(), loc.getStart(), loc.getStop(), alleles); // calculate the GLs @@ -84,7 +84,7 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC pileup = createBAQedPileup( pileup ); // create the GenotypeLikelihoods object - final DiploidSNPGenotypeLikelihoods GL = new DiploidSNPGenotypeLikelihoods((DiploidSNPGenotypePriors)priors, UAC.PCR_error); + final DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering GL = new DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering(UAC.PCR_error); final int nGoodBases = GL.add(pileup, true, true, UAC.MIN_BASE_QUALTY_SCORE); if ( nGoodBases > 0 ) GLs.add(new 
SampleGenotypeData(sample.getKey(), GL, getFilteredDepth(pileup))); @@ -99,7 +99,11 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC // ignore places where we don't have a SNP if ( vc == null || !vc.isSNP() ) return null; - + + // make sure a user isn't passing the REF base in as an ALT + if ( vc.hasAlternateAllele(refAllele, true) ) + throw new UserException.BadInput("Alternate allele '" + (char)refBase + "' passed in is the same as the reference at location " + vc.getChr() + ":" + vc.getStart()); + alleles.addAll(vc.getAlternateAlleles()); } else { @@ -135,7 +139,7 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC for ( int j = i; j <= numAltAlleles; j++ ) { // As per the VCF spec: "the ordering of genotypes for the likelihoods is given by: F(j/k) = (k*(k+1)/2)+j. // In other words, for biallelic sites the ordering is: AA,AB,BB; for triallelic sites the ordering is: AA,AB,BB,AC,BC,CC, etc." - PLordering[(j * (j+1) / 2) + i] = DiploidGenotype.createDiploidGenotype(alleleOrdering[i], alleleOrdering[j]).ordinal(); + PLordering[(j * (j+1) / 2) + i] = DiploidGenotypeWithCorrectAlleleOrdering.createDiploidGenotype(alleleOrdering[i], alleleOrdering[j]).ordinal(); } } @@ -167,7 +171,7 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC protected List determineAlternateAlleles(final byte ref, final List sampleDataList) { final int baseIndexOfRef = BaseUtils.simpleBaseToBaseIndex(ref); - final int PLindexOfRef = DiploidGenotype.createDiploidGenotype(ref, ref).ordinal(); + final int PLindexOfRef = DiploidGenotypeWithCorrectAlleleOrdering.createDiploidGenotype(ref, ref).ordinal(); for ( int i = 0; i < 4; i++ ) likelihoodSums[i] = 0.0; @@ -176,7 +180,7 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC final double[] likelihoods = sampleData.GL.getLikelihoods(); final int PLindexOfBestGL = MathUtils.maxElementIndex(likelihoods); if ( 
PLindexOfBestGL != PLindexOfRef ) { - GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePairUsingDeprecatedOrdering(PLindexOfBestGL); + GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindexOfBestGL); if ( alleles.alleleIndex1 != baseIndexOfRef ) likelihoodSums[alleles.alleleIndex1] += likelihoods[PLindexOfBestGL] - likelihoods[PLindexOfRef]; // don't double-count it @@ -205,7 +209,7 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC public class BAQedPileupElement extends PileupElement { public BAQedPileupElement( final PileupElement PE ) { - super(PE.getRead(), PE.getOffset(), PE.isDeletion(), PE.isBeforeDeletion(), PE.isBeforeInsertion(), PE.isNextToSoftClip()); + super(PE.getRead(), PE.getOffset(), PE.isDeletion(), PE.isBeforeDeletedBase(), PE.isAfterDeletedBase(), PE.isBeforeInsertion(), PE.isAfterInsertion(), PE.isNextToSoftClip()); } @Override @@ -215,10 +219,10 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC private static class SampleGenotypeData { public final String name; - public final DiploidSNPGenotypeLikelihoods GL; + public final DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering GL; public final int depth; - public SampleGenotypeData(final String name, final DiploidSNPGenotypeLikelihoods GL, final int depth) { + public SampleGenotypeData(final String name, final DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering GL, final int depth) { this.name = name; this.GL = GL; this.depth = depth; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGBoundAF.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGBoundAF.java deleted file mode 100755 index 99d55bc69..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGBoundAF.java +++ /dev/null @@ -1,209 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.genotyper; - -import 
org.apache.commons.lang.NotImplementedException; -import org.broadinstitute.sting.commandline.Input; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.commandline.RodBinding; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.RodWalker; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.SampleUtils; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.variantcontext.*; - -import java.security.cert.CertificateNotYetValidException; -import java.util.*; - -import org.broadinstitute.sting.utils.codecs.vcf.*; - -/** - * Created by IntelliJ IDEA. - * User: chartl - * Date: 8/30/11 - * Time: 10:08 AM - * To change this template use File | Settings | File Templates. 
- */ -public class UGBoundAF extends RodWalker { - - @Output(shortName="vcf",fullName="VCF",doc="file to write to",required=true) - VCFWriter writer; - - @Input(shortName="V",fullName="Variants",doc="variant tracks to use in calculation",required=true) - List> variants; - - private static double EPS_LOWER_LIMIT = Math.pow(10,-6.0); - - private HashMap> epsilonPosteriorCache = new HashMap>(8192); - private HashMap logAC0Cache = new HashMap(8192); - private int QUANTIZATION_FACTOR = 1000; - - - public void initialize() { - Set allHeaderLines = new HashSet(1024); - for ( RodBinding v : variants ) { - String trackName = v.getName(); - Map vcfHeaders = VCFUtils.getVCFHeadersFromRods(getToolkit(), Arrays.asList(trackName)); - Set headerLines = new HashSet(vcfHeaders.get(trackName).getMetaData()); - } - allHeaderLines.add(new VCFInfoHeaderLine("AFB",2,VCFHeaderLineType.Float,"The 95% bounds on the allele "+ - "frequency. First value=95% probability AF>x. Second value=95% probability AF allVariants = tracker.getValues(variants); - if ( allVariants.size() == 0 ) { - return null; - } - - List alternateAlleles = getAllAlternateAlleles(allVariants); - VariantContextBuilder builder = new VariantContextBuilder(allVariants.get(0).subContextFromSamples(new TreeSet())); - if ( alternateAlleles.size() > 1 ) { - logger.warn("Multiple Segregating Variants at position "+ref.getLocus().toString()); - alternateAlleles.add(allVariants.get(0).getReference()); - builder.alleles(alternateAlleles); - builder.filters(String.format("MULTIPLE_SEGREGATING[%s]", Utils.join(",",alternateAlleles))); - } else { - // get all the genotype likelihoods - GenotypesContext context = GenotypesContext.create(); - int numNoCall = 0; - for ( VariantContext v : allVariants ) { - numNoCall += v.getNoCallCount(); - context.addAll(v.getGenotypes()); - } - builder.attribute("AFB",boundAlleleFrequency(getACPosteriors(context))); - } - - return builder.make(); - } - - private List getAllAlternateAlleles(List 
variants) { - List alleles = new ArrayList(3); // some overhead - for ( VariantContext v : variants ) { - alleles.addAll(v.getAlternateAlleles()); - } - return alleles; - } - - @Override - public Integer reduce(VariantContext value, Integer sum) { - if ( value == null ) - return sum; - writer.add(value); - return ++sum; - } - - private int N_ITERATIONS = 1; - private double[] getACPosteriors(GenotypesContext gc) { - // note this uses uniform priors (!) - - double[][] zeroPriors = new double[1][1+2*gc.size()]; - AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(2,2*gc.size()); - // todo -- allow multiple alleles here - for ( int i = 0; i < N_ITERATIONS; i ++ ) { - ExactAFCalculationModel.linearExactMultiAllelic(gc, 2, zeroPriors, result, false); - } - - return result.log10AlleleFrequencyPosteriors[0]; - } - - private String boundAlleleFrequency(double[] ACLikelihoods) { - // note that no-calls are unnecessary: the ML likelihoods take nocalls into account as 0,0,0 GLs - // thus, for sites with K 100,40,0 likelihoods and M no-calls, the likelihoods will be - // agnostic between 2*K alleles through 2*(K+M) alleles - exactly what we want to marginalize over - - // want to pick a lower limit x and upper limit y such that - // int_{f = x to y} sum_{c = 0 to 2*AN} P(AF=f | c, AN) df = 0.95 - // int_{f=x to y} calculateAFPosterior(f) df = 0.95 - // and that (y-x) is minimized - - // this is done by quantizing [0,1] into small bins and, since the distribution is - // unimodal, greedily adding them until the probability is >= 0.95 - - throw new ReviewedStingException("This walker is unsupported, and is not fully implemented", new NotImplementedException("bound allele frequency not implemented")); - } - - private double calculateAFPosterior(double[] likelihoods, double af) { - double[] probLiks = new double[likelihoods.length]; - for ( int c = 0; c < likelihoods.length; c++) { - probLiks[c] = calculateAFPosterior(c,likelihoods.length,af); - } - - 
return MathUtils.log10sumLog10(probLiks); - } - - private double calculateAFPosterior(int ac, int n, double af) { - // evaluate the allele frequency posterior distribution at AF given AC observations of N chromosomes - switch ( ac ) { - case 0: - return logAC0Coef(n) + n*Math.log10(1 - af) - Math.log10(af); - case 1: - return Math.log10(n) + (n-1)*Math.log10(1-af) - n*Math.log10(1-EPS_LOWER_LIMIT); - case 2: - return Math.log10(n) + Math.log10(n-1) + Math.log10(af) + (n-2)*Math.log10(1-af) - Math.log10(1-(n-1)*EPS_LOWER_LIMIT) - (n-1)*Math.log10(EPS_LOWER_LIMIT); - default: - return (ac-1)*Math.log10(af)+ac*Math.log10( (double) n-ac)-(n-ac)*af*Math.log10(Math.E) - MathUtils.log10Gamma(ac); - } - } - - private double logAC0Coef(int an) { - if ( ! logAC0Cache.containsKey(an) ) { - double coef = -Math.log10(EPS_LOWER_LIMIT); - for ( int k = 1; k <= an; k++ ) { - // note this should typically just be - // term = ( 1 - Math.pow(EPS_LOWER_LIMIT,k) ) * MathUtils.binomialCoefficient(an,k) / k - // but the 1-E term will just be 1, so we do the following to mitigate this problem - double binom = MathUtils.binomialCoefficient(an,k); - double eps_correction = EPS_LOWER_LIMIT*Math.pow(binom,1/k); - double term = binom/k - Math.pow(eps_correction,k); - if ( k % 2 == 0 ) { - coef += term; - } else { - coef -= term; - } - } - - logAC0Cache.put(an,coef); - } - - return logAC0Cache.get(an); - } - - private double adaptiveSimpson(double[] likelihoods, double start, double stop, double err, int cap) { - double mid = (start + stop)/2; - double size = stop-start; - double fa = calculateAFPosterior(likelihoods,start); - double fb = calculateAFPosterior(likelihoods,mid); - double fc = calculateAFPosterior(likelihoods,stop); - double s = (size/6)*(fa + 4*fc + fb); - double h = simpAux(likelihoods,start,stop,err,s,fa,fb,fc,cap); - return h; - } - - private double simpAux(double[] likelihoods, double a,double b,double eps,double s,double fa,double fb,double fc,double cap){ - if ( s == 0 ) - 
return -300.0; - double c = ( a + b )/2; - double h = b-a; - double d = (a + c)/2; - double e = (c + b)/2; - double fd = calculateAFPosterior(likelihoods, d); - double fe = calculateAFPosterior(likelihoods, e); - double s_l = (h/12)*(fa + 4*fd + fc); - double s_r = (h/12)*(fc + 4*fe + fb); - double s_2 = s_l + s_r; - if ( cap <= 0 || Math.abs(s_2 - s) <= 15*eps ){ - return Math.log10(s_2 + (s_2 - s)/15.0); - } - - return MathUtils.approximateLog10SumLog10(simpAux(likelihoods,a,c,eps/2,s_l,fa,fc,fd,cap-1),simpAux(likelihoods, c, b, eps / 2, s_r, fc, fb, fe, cap - 1)); - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java index 1af8b8e8e..b33c036a8 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java @@ -38,14 +38,14 @@ public class UnifiedArgumentCollection { * Controls the model used to calculate the probability that a site is variant plus the various sample genotypes in the data at a given locus. */ @Argument(fullName = "p_nonref_model", shortName = "pnrm", doc = "Non-reference probability calculation model to employ -- EXACT is the default option, while GRID_SEARCH is also available.", required = false) - public AlleleFrequencyCalculationModel.Model AFmodel = AlleleFrequencyCalculationModel.Model.EXACT; + protected AlleleFrequencyCalculationModel.Model AFmodel = AlleleFrequencyCalculationModel.Model.EXACT; /** * The expected heterozygosity value used to compute prior likelihoods for any locus. 
The default priors are: * het = 1e-3, P(hom-ref genotype) = 1 - 3 * het / 2, P(het genotype) = het, P(hom-var genotype) = het / 2 */ @Argument(fullName = "heterozygosity", shortName = "hets", doc = "Heterozygosity value used to compute prior likelihoods for any locus", required = false) - public Double heterozygosity = DiploidSNPGenotypePriors.HUMAN_HETEROZYGOSITY; + public Double heterozygosity = UnifiedGenotyperEngine.HUMAN_SNP_HETEROZYGOSITY; /** * The PCR error rate is independent of the sequencing error rate, which is necessary because we cannot necessarily @@ -53,15 +53,12 @@ public class UnifiedArgumentCollection { * effectively acts as a cap on the base qualities. */ @Argument(fullName = "pcr_error_rate", shortName = "pcr_error", doc = "The PCR error rate to be used for computing fragment-based likelihoods", required = false) - public Double PCR_error = DiploidSNPGenotypeLikelihoods.DEFAULT_PCR_ERROR_RATE; + public Double PCR_error = DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering.DEFAULT_PCR_ERROR_RATE; - /** - * Specifies how to determine the alternate allele to use for genotyping - */ - @Argument(fullName = "genotyping_mode", shortName = "gt_mode", doc = "Should we output confident genotypes (i.e. including ref calls) or just the variants?", required = false) + @Argument(fullName = "genotyping_mode", shortName = "gt_mode", doc = "Specifies how to determine the alternate alleles to use for genotyping", required = false) public GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY; - @Argument(fullName = "output_mode", shortName = "out_mode", doc = "Should we output confident genotypes (i.e. 
including ref calls) or just the variants?", required = false) + @Argument(fullName = "output_mode", shortName = "out_mode", doc = "Specifies which type of calls we should output", required = false) public UnifiedGenotyperEngine.OUTPUT_MODE OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; /** @@ -82,15 +79,22 @@ public class UnifiedArgumentCollection { public double STANDARD_CONFIDENCE_FOR_EMITTING = 30.0; /** - * This argument is not enabled by default because it increases the runtime by an appreciable amount. + * Note that calculating the SLOD increases the runtime by an appreciable amount. */ @Argument(fullName = "noSLOD", shortName = "nosl", doc = "If provided, we will not calculate the SLOD", required = false) public boolean NO_SLOD = false; + /** + * Depending on the value of the --max_alternate_alleles argument, we may genotype only a fraction of the alleles being sent on for genotyping. + * Using this argument instructs the genotyper to annotate (in the INFO field) the number of alternate alleles that were originally discovered at the site. 
+ */ + @Argument(fullName = "annotateNDA", shortName = "nda", doc = "If provided, we will annotate records with the number of alternate alleles that were discovered (but not necessarily genotyped) at a given site", required = false) + public boolean ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED = false; + /** * When the UnifiedGenotyper is put into GENOTYPE_GIVEN_ALLELES mode it will genotype the samples using only the alleles provide in this rod binding */ - @Input(fullName="alleles", shortName = "alleles", doc="The set of alleles at which to genotype when in GENOTYPE_MODE = GENOTYPE_GIVEN_ALLELES", required=false) + @Input(fullName="alleles", shortName = "alleles", doc="The set of alleles at which to genotype when --genotyping_mode is GENOTYPE_GIVEN_ALLELES", required=false) public RodBinding alleles; /** @@ -121,6 +125,17 @@ public class UnifiedArgumentCollection { @Argument(fullName = "min_indel_count_for_genotyping", shortName = "minIndelCnt", doc = "Minimum number of consensus indels required to trigger genotyping run", required = false) public int MIN_INDEL_COUNT_FOR_GENOTYPING = 5; + /** + * Complementary argument to minIndelCnt. Only samples with at least this fraction of indel-containing reads will contribute + * to counting and overcoming the threshold minIndelCnt. This parameter ensures that in deep data you don't end + * up summing lots of super rare errors up to overcome the 5 read default threshold. Should work equally well for + * low-coverage and high-coverage samples, as low coverage samples with any indel containing reads should easily + * overcome this threshold. 
+ */ + @Argument(fullName = "min_indel_fraction_per_sample", shortName = "minIndelFrac", doc = "Minimum fraction of all reads at a locus that must contain an indel (of any allele) for that sample to contribute to the indel count for alleles", required = false) + public double MIN_INDEL_FRACTION_PER_SAMPLE = 0.25; + + /** * This argument informs the prior probability of having an indel at a site. */ @@ -129,11 +144,11 @@ public class UnifiedArgumentCollection { @Hidden @Argument(fullName = "indelGapContinuationPenalty", shortName = "indelGCP", doc = "Indel gap continuation penalty", required = false) - public double INDEL_GAP_CONTINUATION_PENALTY = 10.0; + public byte INDEL_GAP_CONTINUATION_PENALTY = 10; @Hidden @Argument(fullName = "indelGapOpenPenalty", shortName = "indelGOP", doc = "Indel gap open penalty", required = false) - public double INDEL_GAP_OPEN_PENALTY = 45.0; + public byte INDEL_GAP_OPEN_PENALTY = 45; @Hidden @Argument(fullName = "indelHaplotypeSize", shortName = "indelHSize", doc = "Indel haplotype size", required = false) @@ -163,11 +178,13 @@ public class UnifiedArgumentCollection { uac.GenotypingMode = GenotypingMode; uac.OutputMode = OutputMode; uac.NO_SLOD = NO_SLOD; + uac.ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED = ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED; uac.STANDARD_CONFIDENCE_FOR_CALLING = STANDARD_CONFIDENCE_FOR_CALLING; uac.STANDARD_CONFIDENCE_FOR_EMITTING = STANDARD_CONFIDENCE_FOR_EMITTING; uac.MIN_BASE_QUALTY_SCORE = MIN_BASE_QUALTY_SCORE; uac.MAX_DELETION_FRACTION = MAX_DELETION_FRACTION; uac.MIN_INDEL_COUNT_FOR_GENOTYPING = MIN_INDEL_COUNT_FOR_GENOTYPING; + uac.MIN_INDEL_FRACTION_PER_SAMPLE = MIN_INDEL_FRACTION_PER_SAMPLE; uac.INDEL_HETEROZYGOSITY = INDEL_HETEROZYGOSITY; uac.INDEL_GAP_OPEN_PENALTY = INDEL_GAP_OPEN_PENALTY; uac.INDEL_GAP_CONTINUATION_PENALTY = INDEL_GAP_CONTINUATION_PENALTY; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java index 6f8851017..3cec931d0 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java @@ -42,6 +42,7 @@ import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.variantcontext.GenotypeLikelihoods; import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; import java.io.PrintStream; import java.util.*; @@ -115,8 +116,10 @@ import java.util.*; @ReadFilters( {BadMateFilter.class, MappingQualityUnavailableFilter.class} ) @Reference(window=@Window(start=-200,stop=200)) @By(DataSource.REFERENCE) +// TODO -- When LocusIteratorByState gets cleaned up, we should enable multiple @By sources: +// TODO -- @By( {DataSource.READS, DataSource.REFERENCE_ORDERED_DATA} ) @Downsample(by=DownsampleType.BY_SAMPLE, toCoverage=250) -public class UnifiedGenotyper extends LocusWalker implements TreeReducible, AnnotatorCompatibleWalker { +public class UnifiedGenotyper extends LocusWalker, UnifiedGenotyper.UGStatistics> implements TreeReducible, AnnotatorCompatibleWalker { @ArgumentCollection private UnifiedArgumentCollection UAC = new UnifiedArgumentCollection(); @@ -128,8 +131,19 @@ public class UnifiedGenotyper extends LocusWalker getDbsnpRodBinding() { return dbsnp.dbsnp; } + + /** + * If a call overlaps with a record from the provided comp track, the INFO field will be annotated + * as such in the output with the track name (e.g. -comp:FOO will have 'FOO' in the INFO field). + * Records that are filtered in the comp track will be ignored. + * Note that 'dbSNP' has been special-cased (see the --dbsnp argument). 
+ */ + @Input(fullName="comp", shortName = "comp", doc="comparison VCF file", required=false) + public List> comps = Collections.emptyList(); + public List> getCompRodBindings() { return comps; } + + // The following are not used by the Unified Genotyper public RodBinding getSnpEffRodBinding() { return null; } - public List> getCompRodBindings() { return Collections.emptyList(); } public List> getResourceRodBindings() { return Collections.emptyList(); } public boolean alwaysAppendDbsnpId() { return false; } @@ -139,9 +153,11 @@ public class UnifiedGenotyper extends LocusWalker headerInfo = getHeaderInfo(); @@ -248,6 +258,8 @@ public class UnifiedGenotyper extends LocusWalker map(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext) { return UG_engine.calculateLikelihoodsAndGenotypes(tracker, refContext, rawContext); } @@ -300,44 +312,44 @@ public class UnifiedGenotyper extends LocusWalker calls, UGStatistics sum) { // we get a point for reaching reduce sum.nBasesVisited++; - // can't call the locus because of no coverage - if ( value == null ) - return sum; + boolean wasCallable = false; + boolean wasConfidentlyCalled = false; - // A call was attempted -- the base was potentially callable - sum.nBasesCallable++; + for ( VariantCallContext call : calls ) { + if ( call == null ) + continue; - // the base was confidently callable - sum.nBasesCalledConfidently += value.confidentlyCalled ? 1 : 0; + // A call was attempted -- the base was callable + wasCallable = true; - // can't make a call here - if ( !value.shouldEmit ) - return sum; + // was the base confidently callable? 
+ wasConfidentlyCalled = call.confidentlyCalled; - try { - // we are actually making a call - sum.nCallsMade++; - writer.add(value); - } catch (IllegalArgumentException e) { - throw new IllegalArgumentException(e.getMessage() + "; this is often caused by using the --assume_single_sample_reads argument with the wrong sample name"); + if ( call.shouldEmit ) { + try { + // we are actually making a call + sum.nCallsMade++; + writer.add(call); + } catch (IllegalArgumentException e) { + throw new IllegalArgumentException(e.getMessage()); + } + } } + if ( wasCallable ) + sum.nBasesCallable++; + + if ( wasConfidentlyCalled ) + sum.nBasesCalledConfidently++; + return sum; } public void onTraversalDone(UGStatistics sum) { - logger.info(String.format("Visited bases %d", sum.nBasesVisited)); - logger.info(String.format("Callable bases %d", sum.nBasesCallable)); - logger.info(String.format("Confidently called bases %d", sum.nBasesCalledConfidently)); - logger.info(String.format("%% callable bases of all loci %3.3f", sum.percentCallableOfAll())); - logger.info(String.format("%% confidently called bases of all loci %3.3f", sum.percentCalledOfAll())); - logger.info(String.format("%% confidently called bases of callable loci %3.3f", sum.percentCalledOfCallable())); - logger.info(String.format("Actual calls made %d", sum.nCallsMade)); - if ( metricsWriter != null ) { metricsWriter.println(String.format("Visited bases %d", sum.nBasesVisited)); metricsWriter.println(String.format("Callable bases %d", sum.nBasesCallable)); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index 7edcf61a2..caa3a6b6b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -36,19 +36,26 @@ import 
org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.baq.BAQ; +import org.broadinstitute.sting.utils.classloader.PluginManager; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.pileup.PileupElement; -import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.variantcontext.*; import java.io.PrintStream; +import java.lang.reflect.Constructor; import java.util.*; public class UnifiedGenotyperEngine { public static final String LOW_QUAL_FILTER_NAME = "LowQual"; + public static final String NUMBER_OF_DISCOVERED_ALLELES_KEY = "NDA"; + + public static final double HUMAN_SNP_HETEROZYGOSITY = 1e-3; + public static final double HUMAN_INDEL_HETEROZYGOSITY = 1e-4; + public enum OUTPUT_MODE { /** produces calls only at variant sites */ EMIT_VARIANTS_ONLY, @@ -60,9 +67,6 @@ public class UnifiedGenotyperEngine { EMIT_ALL_SITES } - protected static final List NO_CALL_ALLELES = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL); - protected static final double SUM_GL_THRESH_NOCALL = -0.001; // if sum(gl) is bigger than this threshold, we treat GL's as non-informative and will force a no-call. 
- // the unified argument collection private final UnifiedArgumentCollection UAC; public UnifiedArgumentCollection getUAC() { return UAC; } @@ -71,7 +75,7 @@ public class UnifiedGenotyperEngine { private final VariantAnnotatorEngine annotationEngine; // the model used for calculating genotypes - private ThreadLocal> glcm = new ThreadLocal>(); + private ThreadLocal> glcm = new ThreadLocal>(); // the model used for calculating p(non-ref) private ThreadLocal afcm = new ThreadLocal(); @@ -81,12 +85,8 @@ public class UnifiedGenotyperEngine { private ThreadLocal posteriorsArray = new ThreadLocal(); // because the allele frequency priors are constant for a given i, we cache the results to avoid having to recompute everything - private final double[][] log10AlleleFrequencyPriorsSNPs; - private final double[][] log10AlleleFrequencyPriorsIndels; - - // the priors object - private final GenotypePriors genotypePriorsSNPs; - private final GenotypePriors genotypePriorsIndels; + private final double[] log10AlleleFrequencyPriorsSNPs; + private final double[] log10AlleleFrequencyPriorsIndels; // samples in input private final Set samples; @@ -95,7 +95,8 @@ public class UnifiedGenotyperEngine { private final Logger logger; private final PrintStream verboseWriter; - // number of chromosomes (2 * samples) in input + // number of chromosomes (ploidy * samples) in input + private final int ploidy; private final int N; // the standard filter to use for calls below the confidence threshold but above the emit threshold @@ -104,6 +105,7 @@ public class UnifiedGenotyperEngine { private final GenomeLocParser genomeLocParser; private final boolean BAQEnabledOnCMDLine; + protected static final double SUM_GL_THRESH_NOCALL = VariantContextUtils.SUM_GL_THRESH_NOCALL; // --------------------------------------------------------------------------------------------------------- // @@ -112,29 +114,28 @@ public class UnifiedGenotyperEngine { // 
--------------------------------------------------------------------------------------------------------- @Requires({"toolkit != null", "UAC != null"}) public UnifiedGenotyperEngine(GenomeAnalysisEngine toolkit, UnifiedArgumentCollection UAC) { - this(toolkit, UAC, Logger.getLogger(UnifiedGenotyperEngine.class), null, null, SampleUtils.getSAMFileSamples(toolkit.getSAMFileHeader())); + this(toolkit, UAC, Logger.getLogger(UnifiedGenotyperEngine.class), null, null, SampleUtils.getSAMFileSamples(toolkit.getSAMFileHeader()), VariantContextUtils.DEFAULT_PLOIDY*(SampleUtils.getSAMFileSamples(toolkit.getSAMFileHeader()).size())); } - @Requires({"toolkit != null", "UAC != null", "logger != null", "samples != null && samples.size() > 0"}) - public UnifiedGenotyperEngine(GenomeAnalysisEngine toolkit, UnifiedArgumentCollection UAC, Logger logger, PrintStream verboseWriter, VariantAnnotatorEngine engine, Set samples) { + @Requires({"toolkit != null", "UAC != null", "logger != null", "samples != null && samples.size() > 0","ploidy>0"}) + public UnifiedGenotyperEngine(GenomeAnalysisEngine toolkit, UnifiedArgumentCollection UAC, Logger logger, PrintStream verboseWriter, VariantAnnotatorEngine engine, Set samples, int ploidy) { this.BAQEnabledOnCMDLine = toolkit.getArguments().BAQMode != BAQ.CalculationMode.OFF; genomeLocParser = toolkit.getGenomeLocParser(); this.samples = new TreeSet(samples); // note that, because we cap the base quality by the mapping quality, minMQ cannot be less than minBQ - this.UAC = UAC.clone(); + this.UAC = UAC; this.logger = logger; this.verboseWriter = verboseWriter; this.annotationEngine = engine; - N = 2 * this.samples.size(); - log10AlleleFrequencyPriorsSNPs = new double[UAC.MAX_ALTERNATE_ALLELES][N+1]; - log10AlleleFrequencyPriorsIndels = new double[UAC.MAX_ALTERNATE_ALLELES][N+1]; + this.ploidy = ploidy; + this.N = samples.size() * ploidy; + log10AlleleFrequencyPriorsSNPs = new double[N+1]; + log10AlleleFrequencyPriorsIndels = new double[N+1]; 
computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsSNPs, UAC.heterozygosity); computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsIndels, UAC.INDEL_HETEROZYGOSITY); - genotypePriorsSNPs = createGenotypePriors(GenotypeLikelihoodsCalculationModel.Model.SNP); - genotypePriorsIndels = createGenotypePriors(GenotypeLikelihoodsCalculationModel.Model.INDEL); - + filter.add(LOW_QUAL_FILTER_NAME); } @@ -146,22 +147,28 @@ public class UnifiedGenotyperEngine { * @param rawContext contextual information around the locus * @return the VariantCallContext object */ - public VariantCallContext calculateLikelihoodsAndGenotypes(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext) { - final GenotypeLikelihoodsCalculationModel.Model model = getCurrentGLModel(tracker, refContext, rawContext ); - if( model == null ) { - return (UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_SITES && UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ? generateEmptyContext(tracker, refContext, null, rawContext) : null); + public List calculateLikelihoodsAndGenotypes(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext) { + final List results = new ArrayList(2); + + final List models = getGLModelsToUse(tracker, refContext, rawContext); + if ( models.isEmpty() ) { + results.add(UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_SITES && UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ? generateEmptyContext(tracker, refContext, null, rawContext) : null); + } + else { + for ( final GenotypeLikelihoodsCalculationModel.Model model : models ) { + final Map stratifiedContexts = getFilteredAndStratifiedContexts(UAC, refContext, rawContext, model); + if ( stratifiedContexts == null ) { + results.add(UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_SITES && UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ? 
generateEmptyContext(tracker, refContext, stratifiedContexts, rawContext) : null); + } + else { + final VariantContext vc = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.COMPLETE, null, true, model); + if ( vc != null ) + results.add(calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc, model)); + } + } } - Map stratifiedContexts = getFilteredAndStratifiedContexts(UAC, refContext, rawContext, model); - if ( stratifiedContexts == null ) { - return (UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_SITES && UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ? generateEmptyContext(tracker, refContext, stratifiedContexts, rawContext) : null); - } - - VariantContext vc = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.COMPLETE, null, true, model); - if ( vc == null ) - return null; - - return calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc, model); + return results; } /** @@ -173,15 +180,20 @@ public class UnifiedGenotyperEngine { * @return the VariantContext object */ public VariantContext calculateLikelihoods(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext) { - final GenotypeLikelihoodsCalculationModel.Model model = getCurrentGLModel( tracker, refContext, rawContext ); - if( model == null ) + final List models = getGLModelsToUse(tracker, refContext, rawContext); + if ( models.isEmpty() ) { return null; + } - Map stratifiedContexts = getFilteredAndStratifiedContexts(UAC, refContext, rawContext, model); - if ( stratifiedContexts == null ) - return null; + for ( final GenotypeLikelihoodsCalculationModel.Model model : models ) { + final Map stratifiedContexts = getFilteredAndStratifiedContexts(UAC, refContext, rawContext, model); + // return the first valid one we encounter + if ( stratifiedContexts != null ) + return calculateLikelihoods(tracker, 
refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.COMPLETE, null, true, model); - return calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.COMPLETE, null, true, model); + } + + return null; } /** @@ -194,17 +206,18 @@ public class UnifiedGenotyperEngine { * @return the VariantCallContext object */ public VariantCallContext calculateGenotypes(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext, VariantContext vc) { - final GenotypeLikelihoodsCalculationModel.Model model = getCurrentGLModel(tracker, refContext, rawContext ); - if( model == null ) { + final List models = getGLModelsToUse(tracker, refContext, rawContext); + if ( models.isEmpty() ) { return null; } - Map stratifiedContexts = getFilteredAndStratifiedContexts(UAC, refContext, rawContext, model); + + // return the first one + final GenotypeLikelihoodsCalculationModel.Model model = models.get(0); + final Map stratifiedContexts = getFilteredAndStratifiedContexts(UAC, refContext, rawContext, model); return calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc, model); } - - // --------------------------------------------------------------------------------------------------------- // // Private implementation helpers @@ -219,7 +232,7 @@ public class UnifiedGenotyperEngine { glcm.set(getGenotypeLikelihoodsCalculationObject(logger, UAC)); } - return glcm.get().get(model).getLikelihoods(tracker, refContext, stratifiedContexts, type, getGenotypePriors(model), alternateAllelesToUse, useBAQedPileup && BAQEnabledOnCMDLine, genomeLocParser); + return glcm.get().get(model.name()).getLikelihoods(tracker, refContext, stratifiedContexts, type, alternateAllelesToUse, useBAQedPileup && BAQEnabledOnCMDLine, genomeLocParser); } private VariantCallContext generateEmptyContext(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, AlignmentContext rawContext) { @@ -239,13 +252,9 @@ 
public class UnifiedGenotyperEngine { vc = new VariantContextBuilder("UG_call", ref.getLocus().getContig(), ref.getLocus().getStart(), ref.getLocus().getStart(), alleles).make(); } - if ( annotationEngine != null ) { + if ( annotationEngine != null && rawContext.hasBasePileup() ) { // Note: we want to use the *unfiltered* and *unBAQed* context for the annotations - ReadBackedPileup pileup = null; - if (rawContext.hasExtendedEventPileup()) - pileup = rawContext.getExtendedEventPileup(); - else if (rawContext.hasBasePileup()) - pileup = rawContext.getBasePileup(); + final ReadBackedPileup pileup = rawContext.getBasePileup(); stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup); vc = annotationEngine.annotateContext(tracker, ref, stratifiedContexts, vc); @@ -265,8 +274,8 @@ public class UnifiedGenotyperEngine { // initialize the data for this thread if that hasn't been done yet if ( afcm.get() == null ) { afcm.set(getAlleleFrequencyCalculationObject(N, logger, verboseWriter, UAC)); - alleleFrequencyCalculationResult.set(new AlleleFrequencyCalculationResult(UAC.MAX_ALTERNATE_ALLELES, N)); - posteriorsArray.set(new double[N + 2]); + alleleFrequencyCalculationResult.set(new AlleleFrequencyCalculationResult(UAC.MAX_ALTERNATE_ALLELES)); + posteriorsArray.set(new double[2]); } AlleleFrequencyCalculationResult AFresult = alleleFrequencyCalculationResult.get(); @@ -275,13 +284,11 @@ public class UnifiedGenotyperEngine { if ( limitedContext ) return null; return (UAC.OutputMode != OUTPUT_MODE.EMIT_ALL_SITES ? 
- estimateReferenceConfidence(vc, stratifiedContexts, getGenotypePriors(model).getHeterozygosity(), false, 1.0) : + estimateReferenceConfidence(vc, stratifiedContexts, getTheta(model), false, 1.0) : generateEmptyContext(tracker, refContext, stratifiedContexts, rawContext)); } - // 'zero' out the AFs (so that we don't have to worry if not all samples have reads at this position) - clearAFarray(AFresult.log10AlleleFrequencyLikelihoods); - clearAFarray(AFresult.log10AlleleFrequencyPosteriors); + AFresult.reset(); List allelesUsedInGenotyping = afcm.get().getLog10PNonRef(vc, getAlleleFrequencyPriors(model), AFresult); // is the most likely frequency conformation AC=0 for all alternate alleles? @@ -296,12 +303,11 @@ public class UnifiedGenotyperEngine { // the genotyping model may have stripped it out if ( indexOfAllele == -1 ) continue; - - int indexOfBestAC = MathUtils.maxElementIndex(AFresult.log10AlleleFrequencyPosteriors[indexOfAllele-1]); - // if the most likely AC is not 0, then this is a good alternate allele to use; - // make sure to test against log10PosteriorOfAFzero since that no longer is an entry in the array - if ( indexOfBestAC != 0 && AFresult.log10AlleleFrequencyPosteriors[indexOfAllele-1][indexOfBestAC] > AFresult.log10PosteriorOfAFzero ) { + int indexOfBestAC = AFresult.getAlleleCountsOfMAP()[indexOfAllele-1]; + + // if the most likely AC is not 0, then this is a good alternate allele to use + if ( indexOfBestAC != 0 ) { myAlleles.add(alternateAllele); bestGuessIsRef = false; } @@ -312,7 +318,6 @@ public class UnifiedGenotyperEngine { } // calculate p(f>0): - // because the likelihoods are marginalized for each alternate allele, we only need to compare log10PosteriorOfAFzero against any one of them final double[] normalizedPosteriors = generateNormalizedPosteriors(AFresult, posteriorsArray.get()); final double PofF = 1.0 - normalizedPosteriors[0]; @@ -320,18 +325,11 @@ public class UnifiedGenotyperEngine { if ( !bestGuessIsRef || UAC.GenotypingMode 
== GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { phredScaledConfidence = QualityUtils.phredScaleErrorRate(normalizedPosteriors[0]); if ( Double.isInfinite(phredScaledConfidence) ) - phredScaledConfidence = -10.0 * AFresult.log10PosteriorOfAFzero; + phredScaledConfidence = -10.0 * AFresult.getLog10PosteriorOfAFzero(); } else { phredScaledConfidence = QualityUtils.phredScaleErrorRate(PofF); if ( Double.isInfinite(phredScaledConfidence) ) { - double sum = AFresult.log10AlleleFrequencyPosteriors[0][0]; - if ( sum == AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED ) - sum = 0.0; - for (int i = 1; i <= N; i++) { - if ( AFresult.log10AlleleFrequencyPosteriors[0][i] == AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED ) - break; - sum += AFresult.log10AlleleFrequencyPosteriors[0][i]; - } + final double sum = AFresult.getLog10PosteriorsMatrixSumWithoutAFzero(); phredScaledConfidence = (MathUtils.compareDoubles(sum, 0.0) == 0 ? 0 : -10.0 * sum); } } @@ -340,7 +338,7 @@ public class UnifiedGenotyperEngine { if ( UAC.OutputMode != OUTPUT_MODE.EMIT_ALL_SITES && !passesEmitThreshold(phredScaledConfidence, bestGuessIsRef) ) { // technically, at this point our confidence in a reference call isn't accurately estimated // because it didn't take into account samples with no data, so let's get a better estimate - return limitedContext ? null : estimateReferenceConfidence(vc, stratifiedContexts, getGenotypePriors(model).getHeterozygosity(), true, 1.0 - PofF); + return limitedContext ? 
null : estimateReferenceConfidence(vc, stratifiedContexts, getTheta(model), true, 1.0 - PofF); } // start constructing the resulting VC @@ -356,11 +354,11 @@ public class UnifiedGenotyperEngine { } // create the genotypes - final GenotypesContext genotypes = subsetAlleles(vc, myAlleles, true); + final GenotypesContext genotypes = afcm.get().subsetAlleles(vc, myAlleles, true,ploidy); // print out stats if we have a writer if ( verboseWriter != null && !limitedContext ) - printVerboseData(refContext.getLocus().toString(), vc, PofF, phredScaledConfidence, normalizedPosteriors, model); + printVerboseData(refContext.getLocus().toString(), vc, PofF, phredScaledConfidence, model); // *** note that calculating strand bias involves overwriting data structures, so we do that last final HashMap attributes = new HashMap(); @@ -369,34 +367,35 @@ public class UnifiedGenotyperEngine { if ( !limitedContext && rawContext.hasPileupBeenDownsampled() ) attributes.put(VCFConstants.DOWNSAMPLED_KEY, true); + if ( UAC.ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED ) + attributes.put(NUMBER_OF_DISCOVERED_ALLELES_KEY, vc.getAlternateAlleles().size()); + if ( !UAC.NO_SLOD && !limitedContext && !bestGuessIsRef ) { //final boolean DEBUG_SLOD = false; // the overall lod //double overallLog10PofNull = AFresult.log10AlleleFrequencyPosteriors[0]; - double overallLog10PofF = MathUtils.log10sumLog10(AFresult.log10AlleleFrequencyPosteriors[0], 0); + double overallLog10PofF = AFresult.getLog10PosteriorsMatrixSumWithoutAFzero(); //if ( DEBUG_SLOD ) System.out.println("overallLog10PofF=" + overallLog10PofF); List alternateAllelesToUse = builder.make().getAlternateAlleles(); // the forward lod VariantContext vcForward = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.FORWARD, alternateAllelesToUse, false, model); - clearAFarray(AFresult.log10AlleleFrequencyLikelihoods); - clearAFarray(AFresult.log10AlleleFrequencyPosteriors); + AFresult.reset(); 
afcm.get().getLog10PNonRef(vcForward, getAlleleFrequencyPriors(model), AFresult); //double[] normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true); - double forwardLog10PofNull = AFresult.log10PosteriorOfAFzero; - double forwardLog10PofF = MathUtils.log10sumLog10(AFresult.log10AlleleFrequencyPosteriors[0], 0); + double forwardLog10PofNull = AFresult.getLog10PosteriorOfAFzero(); + double forwardLog10PofF = AFresult.getLog10PosteriorsMatrixSumWithoutAFzero(); //if ( DEBUG_SLOD ) System.out.println("forwardLog10PofNull=" + forwardLog10PofNull + ", forwardLog10PofF=" + forwardLog10PofF); // the reverse lod VariantContext vcReverse = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.REVERSE, alternateAllelesToUse, false, model); - clearAFarray(AFresult.log10AlleleFrequencyLikelihoods); - clearAFarray(AFresult.log10AlleleFrequencyPosteriors); + AFresult.reset(); afcm.get().getLog10PNonRef(vcReverse, getAlleleFrequencyPriors(model), AFresult); //normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true); - double reverseLog10PofNull = AFresult.log10PosteriorOfAFzero; - double reverseLog10PofF = MathUtils.log10sumLog10(AFresult.log10AlleleFrequencyPosteriors[0], 0); + double reverseLog10PofNull = AFresult.getLog10PosteriorOfAFzero(); + double reverseLog10PofF = AFresult.getLog10PosteriorsMatrixSumWithoutAFzero(); //if ( DEBUG_SLOD ) System.out.println("reverseLog10PofNull=" + reverseLog10PofNull + ", reverseLog10PofF=" + reverseLog10PofF); double forwardLod = forwardLog10PofF + reverseLog10PofNull - overallLog10PofF; @@ -418,13 +417,14 @@ public class UnifiedGenotyperEngine { builder.attributes(attributes); VariantContext vcCall = builder.make(); - if ( annotationEngine != null && !limitedContext ) { + // if we are subsetting alleles (either because there were too many or because some were not polymorphic) + // then we may need to 
trim the alleles (because the original VariantContext may have had to pad at the end). + if ( myAlleles.size() != vc.getAlleles().size() ) + vcCall = VariantContextUtils.reverseTrimAlleles(vcCall); + + if ( annotationEngine != null && !limitedContext && rawContext.hasBasePileup() ) { // Note: we want to use the *unfiltered* and *unBAQed* context for the annotations - ReadBackedPileup pileup = null; - if (rawContext.hasExtendedEventPileup()) - pileup = rawContext.getExtendedEventPileup(); - else if (rawContext.hasBasePileup()) - pileup = rawContext.getBasePileup(); + final ReadBackedPileup pileup = rawContext.getBasePileup(); stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup); vcCall = annotationEngine.annotateContext(tracker, refContext, stratifiedContexts, vcCall); @@ -433,60 +433,41 @@ public class UnifiedGenotyperEngine { return new VariantCallContext(vcCall, confidentlyCalled(phredScaledConfidence, PofF)); } - private double[] generateNormalizedPosteriors(AlleleFrequencyCalculationResult AFresult, double[] normalizedPosteriors) { - normalizedPosteriors[0] = AFresult.log10PosteriorOfAFzero; - System.arraycopy(AFresult.log10AlleleFrequencyPosteriors[0], 0, normalizedPosteriors, 1, normalizedPosteriors.length-1); + public static double[] generateNormalizedPosteriors(final AlleleFrequencyCalculationResult AFresult, final double[] normalizedPosteriors) { + normalizedPosteriors[0] = AFresult.getLog10PosteriorOfAFzero(); + normalizedPosteriors[1] = AFresult.getLog10PosteriorsMatrixSumWithoutAFzero(); return MathUtils.normalizeFromLog10(normalizedPosteriors); } private Map getFilteredAndStratifiedContexts(UnifiedArgumentCollection UAC, ReferenceContext refContext, AlignmentContext rawContext, final GenotypeLikelihoodsCalculationModel.Model model) { - Map stratifiedContexts = null; - - if ( !BaseUtils.isRegularBase( refContext.getBase() ) ) + if ( !BaseUtils.isRegularBase(refContext.getBase()) || !rawContext.hasBasePileup() ) return null; - if ( 
model == GenotypeLikelihoodsCalculationModel.Model.INDEL ) { + Map stratifiedContexts = null; - if (UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES) { - // regular pileup in this case - ReadBackedPileup pileup = rawContext.getBasePileup() .getMappingFilteredPileup(UAC.MIN_BASE_QUALTY_SCORE); + if ( model.name().contains("INDEL") ) { - // don't call when there is no coverage - if ( pileup.getNumberOfElements() == 0 && UAC.OutputMode != OUTPUT_MODE.EMIT_ALL_SITES ) - return null; + final ReadBackedPileup pileup = rawContext.getBasePileup().getMappingFilteredPileup(UAC.MIN_BASE_QUALTY_SCORE); + // don't call when there is no coverage + if ( pileup.getNumberOfElements() == 0 && UAC.OutputMode != OUTPUT_MODE.EMIT_ALL_SITES ) + return null; - // stratify the AlignmentContext and cut by sample - stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup); + // stratify the AlignmentContext and cut by sample + stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup); - } else { - - // todo - tmp will get rid of extended events so this wont be needed - if (!rawContext.hasExtendedEventPileup()) - return null; - ReadBackedExtendedEventPileup rawPileup = rawContext.getExtendedEventPileup(); - - // filter the context based on min mapping quality - ReadBackedExtendedEventPileup pileup = rawPileup.getMappingFilteredPileup(UAC.MIN_BASE_QUALTY_SCORE); - - // don't call when there is no coverage - if ( pileup.getNumberOfElements() == 0 && UAC.OutputMode != OUTPUT_MODE.EMIT_ALL_SITES ) - return null; - - // stratify the AlignmentContext and cut by sample - stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup); - } - } else if ( model == GenotypeLikelihoodsCalculationModel.Model.SNP ) { + } else if ( model.name().contains("SNP") ) { // stratify the AlignmentContext and cut by sample stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(rawContext.getBasePileup()); - 
if( !(UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_SITES && UAC.GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES) ) { + if ( !(UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_SITES && UAC.GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES) ) { int numDeletions = 0; - for( final PileupElement p : rawContext.getBasePileup() ) { - if( p.isDeletion() ) { numDeletions++; } + for ( final PileupElement p : rawContext.getBasePileup() ) { + if ( p.isDeletion() ) + numDeletions++; } - if( ((double) numDeletions) / ((double) rawContext.getBasePileup().getNumberOfElements()) > UAC.MAX_DELETION_FRACTION ) { + if ( ((double) numDeletions) / ((double) rawContext.getBasePileup().getNumberOfElements()) > UAC.MAX_DELETION_FRACTION ) { return null; } } @@ -495,14 +476,6 @@ public class UnifiedGenotyperEngine { return stratifiedContexts; } - protected static void clearAFarray(double[][] AFs) { - for ( int i = 0; i < AFs.length; i++ ) { - for ( int j = 0; j < AFs[i].length; j++ ) { - AFs[i][j] = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED; - } - } - } - private final static double[] binomialProbabilityDepthCache = new double[10000]; static { for ( int i = 1; i < binomialProbabilityDepthCache.length; i++ ) { @@ -533,12 +506,10 @@ public class UnifiedGenotyperEngine { int depth = 0; - if (isCovered) { + if ( isCovered ) { AlignmentContext context = contexts.get(sample); - if (context.hasBasePileup()) + if ( context.hasBasePileup() ) depth = context.getBasePileup().depthOfCoverage(); - else if (context.hasExtendedEventPileup()) - depth = context.getExtendedEventPileup().depthOfCoverage(); } P_of_ref *= 1.0 - (theta / 2.0) * getRefBinomialProb(depth); @@ -547,7 +518,7 @@ public class UnifiedGenotyperEngine { return new VariantCallContext(vc, QualityUtils.phredScaleErrorRate(1.0 - P_of_ref) >= UAC.STANDARD_CONFIDENCE_FOR_CALLING, false); } - protected void printVerboseData(String pos, VariantContext vc, double 
PofF, double phredScaledConfidence, double[] normalizedPosteriors, final GenotypeLikelihoodsCalculationModel.Model model) { + protected void printVerboseData(String pos, VariantContext vc, double PofF, double phredScaledConfidence, final GenotypeLikelihoodsCalculationModel.Model model) { Allele refAllele = null, altAllele = null; for ( Allele allele : vc.getAlleles() ) { if ( allele.isReference() ) @@ -570,11 +541,8 @@ public class UnifiedGenotyperEngine { AFline.append(i + "/" + N + "\t"); AFline.append(String.format("%.2f\t", ((float)i)/N)); AFline.append(String.format("%.8f\t", getAlleleFrequencyPriors(model)[i])); - if ( alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors[0][i] == AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED) - AFline.append("0.00000000\t"); - else - AFline.append(String.format("%.8f\t", alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors[i])); - AFline.append(String.format("%.8f\t", normalizedPosteriors[i])); + AFline.append(String.format("%.8f\t", alleleFrequencyCalculationResult.get().getLog10MLE())); + AFline.append(String.format("%.8f\t", alleleFrequencyCalculationResult.get().getLog10MAP())); verboseWriter.println(AFline.toString()); } @@ -596,119 +564,123 @@ public class UnifiedGenotyperEngine { (UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES && QualityUtils.phredScaleErrorRate(PofF) >= UAC.STANDARD_CONFIDENCE_FOR_CALLING); } - // decide whether we are currently processing SNPs, indels, or neither - private GenotypeLikelihoodsCalculationModel.Model getCurrentGLModel(final RefMetaDataTracker tracker, final ReferenceContext refContext, - final AlignmentContext rawContext ) { - if (rawContext.hasExtendedEventPileup() ) { - // todo - remove this code - if ((UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.BOTH || UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.INDEL) && - (UAC.GenotypingMode != 
GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) ) - return GenotypeLikelihoodsCalculationModel.Model.INDEL; - } - else { - // no extended event pileup + // decide whether we are currently processing SNPs, indels, neither, or both + private List getGLModelsToUse(final RefMetaDataTracker tracker, + final ReferenceContext refContext, + final AlignmentContext rawContext) { + + final List models = new ArrayList(2); + + if ( rawContext.hasBasePileup() ) { // if we're genotyping given alleles and we have a requested SNP at this position, do SNP - if (UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES) { - VariantContext vcInput = UnifiedGenotyperEngine.getVCFromAllelesRod(tracker, refContext, rawContext.getLocation(), false, logger, UAC.alleles); - if (vcInput == null) - return null; + if ( UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { + final VariantContext vcInput = UnifiedGenotyperEngine.getVCFromAllelesRod(tracker, refContext, rawContext.getLocation(), false, logger, UAC.alleles); + if ( vcInput == null ) + return models; - // todo - no support to genotype MNP's yet - if (vcInput.isMNP()) - return null; - - if (vcInput.isSNP()) { - if (( UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.BOTH || UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.SNP)) - return GenotypeLikelihoodsCalculationModel.Model.SNP; - else - // ignore SNP's if user chose INDEL mode - return null; + if ( vcInput.isSNP() ) { + // ignore SNPs if the user chose INDEL mode only + if ( UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.BOTH ) + models.add(GenotypeLikelihoodsCalculationModel.Model.SNP); + else if ( UAC.GLmodel.name().toUpperCase().contains("SNP") ) + models.add(UAC.GLmodel); } - else if ((vcInput.isIndel() || vcInput.isMixed()) && (UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.BOTH || UAC.GLmodel == 
GenotypeLikelihoodsCalculationModel.Model.INDEL)) - return GenotypeLikelihoodsCalculationModel.Model.INDEL; + else if ( vcInput.isIndel() || vcInput.isMixed() ) { + // ignore INDELs if the user chose SNP mode only + if ( UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.BOTH ) + models.add(GenotypeLikelihoodsCalculationModel.Model.INDEL); + else if (UAC.GLmodel.name().toUpperCase().contains("INDEL")) + models.add(UAC.GLmodel); + } + // No support for other types yet } else { - // todo - this assumes SNP's take priority when BOTH is selected, should do a smarter way once extended events are removed - if( UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.BOTH || UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.SNP) - return GenotypeLikelihoodsCalculationModel.Model.SNP; - else if (UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.INDEL) - return GenotypeLikelihoodsCalculationModel.Model.INDEL; + if ( UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.BOTH ) { + models.add(GenotypeLikelihoodsCalculationModel.Model.SNP); + models.add(GenotypeLikelihoodsCalculationModel.Model.INDEL); + } + else { + models.add(UAC.GLmodel); + } } } - return null; + + return models; } - protected static void computeAlleleFrequencyPriors(final int N, final double[][] priors, final double theta) { + protected static void computeAlleleFrequencyPriors(final int N, final double[] priors, final double theta) { - // the dimension here is the number of alternate alleles; with e.g. 
2 alternate alleles the prior will be theta^2 / i - for (int alleles = 1; alleles <= priors.length; alleles++) { - double sum = 0.0; + double sum = 0.0; - // for each i - for (int i = 1; i <= N; i++) { - double value = Math.pow(theta, alleles) / (double)i; - priors[alleles-1][i] = Math.log10(value); - sum += value; + // for each i + for (int i = 1; i <= N; i++) { + final double value = theta / (double)i; + priors[i] = Math.log10(value); + sum += value; + } + + // null frequency for AF=0 is (1 - sum(all other frequencies)) + priors[0] = Math.log10(1.0 - sum); + } + + protected double[] getAlleleFrequencyPriors( final GenotypeLikelihoodsCalculationModel.Model model ) { + if (model.name().toUpperCase().contains("SNP")) + return log10AlleleFrequencyPriorsSNPs; + else if (model.name().toUpperCase().contains("INDEL")) + return log10AlleleFrequencyPriorsIndels; + else + throw new IllegalArgumentException("Unexpected GenotypeCalculationModel " + model); + + } + + protected double getTheta( final GenotypeLikelihoodsCalculationModel.Model model ) { + if( model.name().contains("SNP") ) + return HUMAN_SNP_HETEROZYGOSITY; + if( model.name().contains("INDEL") ) + return HUMAN_INDEL_HETEROZYGOSITY; + else throw new IllegalArgumentException("Unexpected GenotypeCalculationModel " + model); + } + + private static Map getGenotypeLikelihoodsCalculationObject(Logger logger, UnifiedArgumentCollection UAC) { + + final Map glcm = new HashMap(); + final List> glmClasses = new PluginManager(GenotypeLikelihoodsCalculationModel.class).getPlugins(); + + for (int i = 0; i < glmClasses.size(); i++) { + final Class glmClass = glmClasses.get(i); + final String key = glmClass.getSimpleName().replaceAll("GenotypeLikelihoodsCalculationModel","").toUpperCase(); + try { + final Object args[] = new Object[]{UAC,logger}; + final Constructor c = glmClass.getDeclaredConstructor(UnifiedArgumentCollection.class, Logger.class); + glcm.put(key, (GenotypeLikelihoodsCalculationModel)c.newInstance(args)); } + 
catch (Exception e) { + throw new UserException("The likelihoods model provided for the -glm argument (" + UAC.GLmodel + ") is not a valid option: " + e.getMessage()); + } + } - // null frequency for AF=0 is (1 - sum(all other frequencies)) - priors[alleles-1][0] = Math.log10(1.0 - sum); - } - } - - protected double[][] getAlleleFrequencyPriors( final GenotypeLikelihoodsCalculationModel.Model model ) { - switch( model ) { - case SNP: - return log10AlleleFrequencyPriorsSNPs; - case INDEL: - return log10AlleleFrequencyPriorsIndels; - default: throw new IllegalArgumentException("Unexpected GenotypeCalculationModel " + model); - } - } - - private static GenotypePriors createGenotypePriors( final GenotypeLikelihoodsCalculationModel.Model model ) { - GenotypePriors priors; - switch ( model ) { - case SNP: - // use flat priors for GLs - priors = new DiploidSNPGenotypePriors(); - break; - case INDEL: - // create flat priors for Indels, actual priors will depend on event length to be genotyped - priors = new DiploidIndelGenotypePriors(); - break; - default: throw new IllegalArgumentException("Unexpected GenotypeCalculationModel " + model); - } - return priors; - } - - protected GenotypePriors getGenotypePriors( final GenotypeLikelihoodsCalculationModel.Model model ) { - switch( model ) { - case SNP: - return genotypePriorsSNPs; - case INDEL: - return genotypePriorsIndels; - default: throw new IllegalArgumentException("Unexpected GenotypeCalculationModel " + model); - } - } - - private static Map getGenotypeLikelihoodsCalculationObject(Logger logger, UnifiedArgumentCollection UAC) { - Map glcm = new HashMap(); - glcm.put(GenotypeLikelihoodsCalculationModel.Model.SNP, new SNPGenotypeLikelihoodsCalculationModel(UAC, logger)); - glcm.put(GenotypeLikelihoodsCalculationModel.Model.INDEL, new IndelGenotypeLikelihoodsCalculationModel(UAC, logger)); return glcm; } private static AlleleFrequencyCalculationModel getAlleleFrequencyCalculationObject(int N, Logger logger, PrintStream 
verboseWriter, UnifiedArgumentCollection UAC) { - AlleleFrequencyCalculationModel afcm; - switch ( UAC.AFmodel ) { - case EXACT: - afcm = new ExactAFCalculationModel(UAC, N, logger, verboseWriter); - break; - default: throw new IllegalArgumentException("Unexpected AlleleFrequencyCalculationModel " + UAC.AFmodel); - } + List> afClasses = new PluginManager(AlleleFrequencyCalculationModel.class).getPlugins(); - return afcm; + for (int i = 0; i < afClasses.size(); i++) { + Class afClass = afClasses.get(i); + String key = afClass.getSimpleName().replace("AFCalculationModel","").toUpperCase(); + if (UAC.AFmodel.name().equalsIgnoreCase(key)) { + try { + Object args[] = new Object[]{UAC,N,logger,verboseWriter}; + Constructor c = afClass.getDeclaredConstructor(UnifiedArgumentCollection.class, int.class, Logger.class, PrintStream.class); + + return (AlleleFrequencyCalculationModel)c.newInstance(args); + } + catch (Exception e) { + throw new IllegalArgumentException("Unexpected AlleleFrequencyCalculationModel " + UAC.AFmodel); + } + } + } + throw new IllegalArgumentException("Unexpected AlleleFrequencyCalculationModel " + UAC.AFmodel); } public static VariantContext getVCFromAllelesRod(RefMetaDataTracker tracker, ReferenceContext ref, GenomeLoc loc, boolean requireSNP, Logger logger, final RodBinding allelesBinding) { @@ -717,7 +689,7 @@ public class UnifiedGenotyperEngine { VariantContext vc = null; // search for usable record - for( final VariantContext vc_input : tracker.getValues(allelesBinding, loc) ) { + for ( final VariantContext vc_input : tracker.getValues(allelesBinding, loc) ) { if ( vc_input != null && ! vc_input.isFiltered() && (! 
requireSNP || vc_input.isSNP() )) { if ( vc == null ) { vc = vc_input; @@ -729,116 +701,4 @@ public class UnifiedGenotyperEngine { return vc; } - - /** - * @param vc variant context with genotype likelihoods - * @return genotypes - */ - public static GenotypesContext assignGenotypes(final VariantContext vc) { - return subsetAlleles(vc, vc.getAlleles(), true); - } - - /** - * @param vc variant context with genotype likelihoods - * @param allelesToUse which alleles from the vc are okay to use; *** must be in the same relative order as those in the original VC *** - * @param assignGenotypes true if we should change the genotypes based on the (subsetted) PLs - * @return genotypes - */ - public static GenotypesContext subsetAlleles(final VariantContext vc, - final List allelesToUse, - final boolean assignGenotypes) { - - // the genotypes with PLs - final GenotypesContext oldGTs = vc.getGenotypes(); - - // samples - final List sampleIndices = oldGTs.getSampleNamesOrderedByName(); - - // the new genotypes to create - final GenotypesContext newGTs = GenotypesContext.create(); - - // we need to determine which of the alternate alleles (and hence the likelihoods) to use and carry forward - final int numOriginalAltAlleles = vc.getAlternateAlleles().size(); - final int numNewAltAlleles = allelesToUse.size() - 1; - - // which PLs should be carried forward? 
- ArrayList likelihoodIndexesToUse = null; - - // an optimization: if we are supposed to use all (or none in the case of a ref call) of the alleles, - // then we can keep the PLs as is; otherwise, we determine which ones to keep - if ( numNewAltAlleles != numOriginalAltAlleles && numNewAltAlleles > 0 ) { - likelihoodIndexesToUse = new ArrayList(30); - - final boolean[] altAlleleIndexToUse = new boolean[numOriginalAltAlleles]; - for ( int i = 0; i < numOriginalAltAlleles; i++ ) { - if ( allelesToUse.contains(vc.getAlternateAllele(i)) ) - altAlleleIndexToUse[i] = true; - } - - final int numLikelihoods = GenotypeLikelihoods.calculateNumLikelihoods(numOriginalAltAlleles); - for ( int PLindex = 0; PLindex < numLikelihoods; PLindex++ ) { - final GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindex); - // consider this entry only if both of the alleles are good - if ( (alleles.alleleIndex1 == 0 || altAlleleIndexToUse[alleles.alleleIndex1 - 1]) && (alleles.alleleIndex2 == 0 || altAlleleIndexToUse[alleles.alleleIndex2 - 1]) ) - likelihoodIndexesToUse.add(PLindex); - } - } - - // create the new genotypes - for ( int k = 0; k < oldGTs.size(); k++ ) { - final Genotype g = oldGTs.get(sampleIndices.get(k)); - if ( !g.hasLikelihoods() ) { - newGTs.add(new Genotype(g.getSampleName(), NO_CALL_ALLELES, Genotype.NO_LOG10_PERROR, null, null, false)); - continue; - } - - // create the new likelihoods array from the alleles we are allowed to use - final double[] originalLikelihoods = g.getLikelihoods().getAsVector(); - double[] newLikelihoods; - if ( likelihoodIndexesToUse == null ) { - newLikelihoods = originalLikelihoods; - } else { - newLikelihoods = new double[likelihoodIndexesToUse.size()]; - int newIndex = 0; - for ( int oldIndex : likelihoodIndexesToUse ) - newLikelihoods[newIndex++] = originalLikelihoods[oldIndex]; - - // might need to re-normalize - newLikelihoods = MathUtils.normalizeFromLog10(newLikelihoods, false, true); - } - 
- // if there is no mass on the (new) likelihoods, then just no-call the sample - if ( MathUtils.sum(newLikelihoods) > SUM_GL_THRESH_NOCALL ) { - newGTs.add(new Genotype(g.getSampleName(), NO_CALL_ALLELES, Genotype.NO_LOG10_PERROR, null, null, false)); - } - else { - Map attrs = new HashMap(g.getAttributes()); - if ( numNewAltAlleles == 0 ) - attrs.remove(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY); - else - attrs.put(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, GenotypeLikelihoods.fromLog10Likelihoods(newLikelihoods)); - - // if we weren't asked to assign a genotype, then just no-call the sample - if ( !assignGenotypes || MathUtils.sum(newLikelihoods) > SUM_GL_THRESH_NOCALL ) - newGTs.add(new Genotype(g.getSampleName(), NO_CALL_ALLELES, Genotype.NO_LOG10_PERROR, null, attrs, false)); - else - newGTs.add(assignGenotype(g, newLikelihoods, allelesToUse, numNewAltAlleles, attrs)); - } - } - - return newGTs; - } - - protected static Genotype assignGenotype(final Genotype originalGT, final double[] newLikelihoods, final List allelesToUse, final int numNewAltAlleles, final Map attrs) { - // find the genotype with maximum likelihoods - int PLindex = numNewAltAlleles == 0 ? 0 : MathUtils.maxElementIndex(newLikelihoods); - GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindex); - - ArrayList myAlleles = new ArrayList(); - myAlleles.add(allelesToUse.get(alleles.alleleIndex1)); - myAlleles.add(allelesToUse.get(alleles.alleleIndex2)); - - final double qual = numNewAltAlleles == 0 ? 
Genotype.NO_LOG10_PERROR : GenotypeLikelihoods.getQualFromLikelihoods(PLindex, newLikelihoods); - return new Genotype(originalGT.getSampleName(), myAlleles, qual, null, attrs, false); - } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java index 64993b43a..bcb9ea591 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java @@ -31,7 +31,9 @@ import net.sf.samtools.CigarOperator; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.utils.Haplotype; import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.PairHMM; import org.broadinstitute.sting.utils.clipping.ReadClipper; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -41,6 +43,7 @@ import org.broadinstitute.sting.utils.variantcontext.Allele; import java.util.Arrays; import java.util.HashMap; import java.util.LinkedHashMap; +import java.util.Map; public class PairHMMIndelErrorModel { @@ -60,12 +63,12 @@ public class PairHMMIndelErrorModel { private static final int START_HRUN_GAP_IDX = 4; private static final int MAX_HRUN_GAP_IDX = 20; - private static final double MIN_GAP_OPEN_PENALTY = 30.0; - private static final double MIN_GAP_CONT_PENALTY = 10.0; - private static final double GAP_PENALTY_HRUN_STEP = 1.0; // each increase in hrun decreases gap penalty by this. 
+ private static final byte MIN_GAP_OPEN_PENALTY = 30; + private static final byte MIN_GAP_CONT_PENALTY = 10; + private static final byte GAP_PENALTY_HRUN_STEP = 1; // each increase in hrun decreases gap penalty by this. - private final double[] GAP_OPEN_PROB_TABLE; - private final double[] GAP_CONT_PROB_TABLE; + private final byte[] GAP_OPEN_PROB_TABLE; + private final byte[] GAP_CONT_PROB_TABLE; ///////////////////////////// // Private Member Variables @@ -86,42 +89,42 @@ public class PairHMMIndelErrorModel { } } - public PairHMMIndelErrorModel(double indelGOP, double indelGCP, boolean deb, boolean bandedLikelihoods) { + public PairHMMIndelErrorModel(byte indelGOP, byte indelGCP, boolean deb, boolean bandedLikelihoods) { this.DEBUG = deb; this.bandedLikelihoods = bandedLikelihoods; // fill gap penalty table, affine naive model: - this.GAP_CONT_PROB_TABLE = new double[MAX_HRUN_GAP_IDX]; - this.GAP_OPEN_PROB_TABLE = new double[MAX_HRUN_GAP_IDX]; + this.GAP_CONT_PROB_TABLE = new byte[MAX_HRUN_GAP_IDX]; + this.GAP_OPEN_PROB_TABLE = new byte[MAX_HRUN_GAP_IDX]; - double gop = -indelGOP/10.0; - double gcp = -indelGCP/10.0; for (int i = 0; i < START_HRUN_GAP_IDX; i++) { - GAP_OPEN_PROB_TABLE[i] = gop; - GAP_CONT_PROB_TABLE[i] = gcp; + GAP_OPEN_PROB_TABLE[i] = indelGOP; + GAP_CONT_PROB_TABLE[i] = indelGCP; } double step = GAP_PENALTY_HRUN_STEP/10.0; - double maxGOP = -MIN_GAP_OPEN_PENALTY/10.0; // phred to log prob - double maxGCP = -MIN_GAP_CONT_PENALTY/10.0; // phred to log prob + // initialize gop and gcp to their default values + byte gop = indelGOP; + byte gcp = indelGCP; + // all of the following is computed in QUal-space for (int i=START_HRUN_GAP_IDX; i < MAX_HRUN_GAP_IDX; i++) { - gop += step; - if (gop > maxGOP) - gop = maxGOP; + gop -= GAP_PENALTY_HRUN_STEP; + if (gop < MIN_GAP_OPEN_PENALTY) + gop = MIN_GAP_OPEN_PENALTY; - gcp += step; - if(gcp > maxGCP) - gcp = maxGCP; + gcp -= step; + if(gcp < MIN_GAP_CONT_PENALTY) + gcp = MIN_GAP_CONT_PENALTY; 
GAP_OPEN_PROB_TABLE[i] = gop; GAP_CONT_PROB_TABLE[i] = gcp; } } - static private void getContextHomopolymerLength(final byte[] refBytes, int[] hrunArray) { + static private void getContextHomopolymerLength(final byte[] refBytes, final int[] hrunArray) { // compute forward hrun length, example: // AGGTGACCCCCCTGAGAG // 001000012345000000 @@ -154,203 +157,9 @@ public class PairHMMIndelErrorModel { } - private void updateCell(final int indI, final int indJ, final int X_METRIC_LENGTH, final int Y_METRIC_LENGTH, byte[] readBases, byte[] readQuals, byte[] haplotypeBases, - double[] currentGOP, double[] currentGCP, double[][] matchMetricArray, double[][] XMetricArray, double[][] YMetricArray) { - if (indI > 0 && indJ > 0) { - final int im1 = indI -1; - final int jm1 = indJ - 1; - // update current point - final byte x = readBases[im1]; - final byte y = haplotypeBases[jm1]; - final byte qual = readQuals[im1] < 1 ? 1 : (readQuals[im1] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : readQuals[im1]); - - final double pBaseRead = (x == y)? baseMatchArray[(int)qual]:baseMismatchArray[(int)qual]; - - matchMetricArray[indI][indJ] = pBaseRead + MathUtils.approximateLog10SumLog10(new double[]{matchMetricArray[im1][jm1], XMetricArray[im1][jm1], YMetricArray[im1][jm1]}); - - final double c1 = indJ == Y_METRIC_LENGTH-1 ? END_GAP_COST : currentGOP[jm1]; - final double d1 = indJ == Y_METRIC_LENGTH-1 ? END_GAP_COST : currentGCP[jm1]; - - XMetricArray[indI][indJ] = MathUtils.approximateLog10SumLog10(matchMetricArray[im1][indJ] + c1, XMetricArray[im1][indJ] + d1); - - // update Y array - final double c2 = indI == X_METRIC_LENGTH-1 ? END_GAP_COST : currentGOP[jm1]; - final double d2 = indI == X_METRIC_LENGTH-1 ? 
END_GAP_COST : currentGCP[jm1]; - YMetricArray[indI][indJ] = MathUtils.approximateLog10SumLog10(matchMetricArray[indI][jm1] + c2, YMetricArray[indI][jm1] + d2); - } - } - - private double computeReadLikelihoodGivenHaplotypeAffineGaps(byte[] haplotypeBases, byte[] readBases, byte[] readQuals, - double[] currentGOP, double[] currentGCP, int indToStart, - double[][] matchMetricArray, double[][] XMetricArray, double[][] YMetricArray) { - - final int X_METRIC_LENGTH = readBases.length+1; - final int Y_METRIC_LENGTH = haplotypeBases.length+1; - - if (indToStart == 0) { - // default initialization for all arrays - - for (int i=0; i < X_METRIC_LENGTH; i++) { - Arrays.fill(matchMetricArray[i],Double.NEGATIVE_INFINITY); - Arrays.fill(YMetricArray[i],Double.NEGATIVE_INFINITY); - Arrays.fill(XMetricArray[i],Double.NEGATIVE_INFINITY); - } - - for (int i=1; i < X_METRIC_LENGTH; i++) { - //initialize first column - XMetricArray[i][0] = END_GAP_COST*(i); - } - - for (int j=1; j < Y_METRIC_LENGTH; j++) { - // initialize first row - YMetricArray[0][j] = END_GAP_COST*(j); - } - matchMetricArray[0][0]= END_GAP_COST;//Double.NEGATIVE_INFINITY; - XMetricArray[0][0]= YMetricArray[0][0] = 0; - } - - - if (bandedLikelihoods) { - final double DIAG_TOL = 20; // means that max - min element in diags have to be > this number for banding to take effect. 
- - final int numDiags = X_METRIC_LENGTH + Y_METRIC_LENGTH -1; - final int elemsInDiag = Math.min(X_METRIC_LENGTH, Y_METRIC_LENGTH); - - int idxWithMaxElement = 0; - - for (int diag=indToStart; diag < numDiags; diag++) { - // compute default I and J start positions at edge of diagonals - int indI = 0; - int indJ = diag; - if (diag >= Y_METRIC_LENGTH ) { - indI = diag-(Y_METRIC_LENGTH-1); - indJ = Y_METRIC_LENGTH-1; - } - - // first pass: from max element to edge - int idxLow = idxWithMaxElement; - - // reset diag max value before starting - double maxElementInDiag = Double.NEGATIVE_INFINITY; - // set indI, indJ to correct values - indI += idxLow; - indJ -= idxLow; - if (indI >= X_METRIC_LENGTH || indJ < 0) { - idxLow--; - indI--; - indJ++; - } - - - for (int el = idxLow; el < elemsInDiag; el++) { - updateCell(indI, indJ, X_METRIC_LENGTH, Y_METRIC_LENGTH, readBases, readQuals, haplotypeBases, - currentGOP, currentGCP, matchMetricArray, XMetricArray, YMetricArray); - // update max in diagonal - final double bestMetric = MathUtils.max(matchMetricArray[indI][indJ], XMetricArray[indI][indJ], YMetricArray[indI][indJ]); - - // check if we've fallen off diagonal value by threshold - if (bestMetric > maxElementInDiag) { - maxElementInDiag = bestMetric; - idxWithMaxElement = el; - } - else if (bestMetric < maxElementInDiag - DIAG_TOL && idxWithMaxElement > 0) - break; // done w/current diagonal - - indI++; - if (indI >=X_METRIC_LENGTH ) - break; - indJ--; - if (indJ <= 0) - break; - } - if (idxLow > 0) { - // now do second part in opposite direction - indI = 0; - indJ = diag; - if (diag >= Y_METRIC_LENGTH ) { - indI = diag-(Y_METRIC_LENGTH-1); - indJ = Y_METRIC_LENGTH-1; - } - - indI += idxLow-1; - indJ -= idxLow-1; - for (int el = idxLow-1; el >= 0; el--) { - - updateCell(indI, indJ, X_METRIC_LENGTH, Y_METRIC_LENGTH, readBases, readQuals, haplotypeBases, - currentGOP, currentGCP, matchMetricArray, XMetricArray, YMetricArray); - // update max in diagonal - final double 
bestMetric = MathUtils.max(matchMetricArray[indI][indJ], XMetricArray[indI][indJ], YMetricArray[indI][indJ]); - - // check if we've fallen off diagonal value by threshold - if (bestMetric > maxElementInDiag) { - maxElementInDiag = bestMetric; - idxWithMaxElement = el; - } - else if (bestMetric < maxElementInDiag - DIAG_TOL) - break; // done w/current diagonal - - indJ++; - if (indJ >= Y_METRIC_LENGTH ) - break; - indI--; - if (indI <= 0) - break; - } - } - // if (DEBUG) - // System.out.format("Max:%4.1f el:%d\n",maxElementInDiag, idxWithMaxElement); - } - } - else { - // simplified rectangular version of update loop - for (int indI=1; indI < X_METRIC_LENGTH; indI++) { - for (int indJ=indToStart+1; indJ < Y_METRIC_LENGTH; indJ++) { - updateCell(indI, indJ, X_METRIC_LENGTH, Y_METRIC_LENGTH, readBases, readQuals, haplotypeBases, - currentGOP, currentGCP, matchMetricArray, XMetricArray, YMetricArray); - - } - } - } - - - - final int bestI = X_METRIC_LENGTH - 1, bestJ = Y_METRIC_LENGTH - 1; - final double bestMetric = MathUtils.approximateLog10SumLog10(new double[]{ matchMetricArray[bestI][bestJ], XMetricArray[bestI][bestJ], YMetricArray[bestI][bestJ] }); - - /* - if (DEBUG) { - PrintStream outx, outy, outm, outs; - double[][] sumMetrics = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; - try { - outx = new PrintStream("datax.txt"); - outy = new PrintStream("datay.txt"); - outm = new PrintStream("datam.txt"); - outs = new PrintStream("datas.txt"); - double metrics[] = new double[3]; - for (int indI=0; indI < X_METRIC_LENGTH; indI++) { - for (int indJ=0; indJ < Y_METRIC_LENGTH; indJ++) { - metrics[0] = matchMetricArray[indI][indJ]; - metrics[1] = XMetricArray[indI][indJ]; - metrics[2] = YMetricArray[indI][indJ]; - //sumMetrics[indI][indJ] = MathUtils.softMax(metrics); - outx.format("%4.1f ", metrics[1]); - outy.format("%4.1f ", metrics[2]); - outm.format("%4.1f ", metrics[0]); - outs.format("%4.1f ", MathUtils.softMax(metrics)); - } - outx.println(); 
outm.println();outy.println(); outs.println(); - } - outm.close(); outx.close(); outy.close(); - } catch (java.io.IOException e) { throw new UserException("bla");} - } - */ - - return bestMetric; - - } - - private void fillGapProbabilities(int[] hrunProfile, - double[] contextLogGapOpenProbabilities, double[] contextLogGapContinuationProbabilities) { + private void fillGapProbabilities(final int[] hrunProfile, + final byte[] contextLogGapOpenProbabilities, + final byte[] contextLogGapContinuationProbabilities) { // fill based on lookup table for (int i = 0; i < hrunProfile.length; i++) { if (hrunProfile[i] >= MAX_HRUN_GAP_IDX) { @@ -372,27 +181,8 @@ public class PairHMMIndelErrorModel { final int readCounts[] = new int[pileup.getNumberOfElements()]; int readIdx=0; - LinkedHashMap gapOpenProbabilityMap = new LinkedHashMap(); - LinkedHashMap gapContProbabilityMap = new LinkedHashMap(); - - // will context dependent probabilities based on homopolymer run. Probabilities are filled based on total complete haplotypes. 
- // todo -- refactor into separate function - for (Allele a: haplotypeMap.keySet()) { - Haplotype haplotype = haplotypeMap.get(a); - byte[] haplotypeBases = haplotype.getBases(); - double[] contextLogGapOpenProbabilities = new double[haplotypeBases.length]; - double[] contextLogGapContinuationProbabilities = new double[haplotypeBases.length]; - - // get homopolymer length profile for current haplotype - int[] hrunProfile = new int[haplotypeBases.length]; - getContextHomopolymerLength(haplotypeBases,hrunProfile); - fillGapProbabilities(hrunProfile, contextLogGapOpenProbabilities, contextLogGapContinuationProbabilities); - - gapOpenProbabilityMap.put(a,contextLogGapOpenProbabilities); - gapContProbabilityMap.put(a,contextLogGapContinuationProbabilities); - - } + PairHMM pairHMM = new PairHMM(bandedLikelihoods); for (PileupElement p: pileup) { // > 1 when the read is a consensus read representing multiple independent observations readCounts[readIdx] = p.getRepresentativeCount(); @@ -406,14 +196,32 @@ public class PairHMMIndelErrorModel { } } else { + if (DEBUG) { + System.out.format("Read Name:%s, aln start:%d aln stop:%d orig cigar:%s\n",p.getRead().getReadName(), p.getRead().getAlignmentStart(), p.getRead().getAlignmentEnd(), p.getRead().getCigarString()); + } // System.out.format("%d %s\n",p.getRead().getAlignmentStart(), p.getRead().getClass().getName()); GATKSAMRecord read = ReadClipper.hardClipAdaptorSequence(p.getRead()); + if (read.isEmpty()) continue; - if(ReadUtils.is454Read(read)) { + if (read.getUnclippedEnd() > ref.getWindow().getStop()) + read = ReadClipper.hardClipByReferenceCoordinatesRightTail(read, ref.getWindow().getStop()); + + if (read.isEmpty()) continue; - } + + if (read.getUnclippedStart() < ref.getWindow().getStart()) + read = ReadClipper.hardClipByReferenceCoordinatesLeftTail (read, ref.getWindow().getStart()); + + if (read.isEmpty()) + continue; + // hard-clip low quality ends - this may introduce extra H elements in CIGAR string + read = 
ReadClipper.hardClipLowQualEnds(read,(byte)BASE_QUAL_THRESHOLD ); + + if (read.isEmpty()) + continue; + // get bases of candidate haplotypes that overlap with reads final int trailingBases = 3; @@ -469,54 +277,56 @@ public class PairHMMIndelErrorModel { unclippedReadBases = read.getReadBases(); unclippedReadQuals = read.getBaseQualities(); - // Do a stricter base clipping than provided by CIGAR string, since this one may be too conservative, - // and may leave a string of Q2 bases still hanging off the reads. - for (int i=numStartSoftClippedBases; i < unclippedReadBases.length; i++) { - if (unclippedReadQuals[i] < BASE_QUAL_THRESHOLD) - numStartClippedBases++; - else - break; + final int extraOffset = Math.abs(eventLength); - } - for (int i=unclippedReadBases.length-numEndSoftClippedBases-1; i >= 0; i-- ){ - if (unclippedReadQuals[i] < BASE_QUAL_THRESHOLD) - numEndClippedBases++; - else - break; - } + /** + * Compute genomic locations that candidate haplotypes will span. + * Read start and stop locations (variables readStart and readEnd) are the original unclipped positions from SAMRecord, + * adjusted by hard clips from Cigar string and by qual-based soft-clipping performed above. + * We will propose haplotypes that overlap the read with some padding. + * True read start = readStart + numStartClippedBases - ReadUtils.getFirstInsertionOffset(read) + * Last term is because if a read starts with an insertion then these bases are not accounted for in readStart. 
+ * trailingBases is a padding constant(=3) and we additionally add abs(eventLength) to both sides of read to be able to + * differentiate context between two haplotypes + */ + long startLocationInRefForHaplotypes = Math.max(readStart + numStartClippedBases - trailingBases - ReadUtils.getFirstInsertionOffset(read)-extraOffset, 0); + long stopLocationInRefForHaplotypes = readEnd -numEndClippedBases + trailingBases + ReadUtils.getLastInsertionOffset(read)+extraOffset; - int extraOffset = Math.abs(eventLength); + if (DEBUG) + System.out.format("orig Start:%d orig stop: %d\n", startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes); - long start = Math.max(readStart + numStartClippedBases - trailingBases - ReadUtils.getFirstInsertionOffset(read)-extraOffset, 0); - long stop = readEnd -numEndClippedBases + trailingBases + ReadUtils.getLastInsertionOffset(read)+extraOffset; - - // Variables start and stop are coordinates (inclusive) where we want to get the haplotype from. int readLength = read.getReadLength()-numStartSoftClippedBases-numEndSoftClippedBases; // check if start of read will be before start of reference context - if (start < ref.getWindow().getStart())// read starts before haplotype: read will have to be cut - start = ref.getWindow().getStart(); - + if (startLocationInRefForHaplotypes < ref.getWindow().getStart()) { + // read starts before haplotype: read will have to be cut + //numStartClippedBases += ref.getWindow().getStart() - startLocationInRefForHaplotypes; + startLocationInRefForHaplotypes = ref.getWindow().getStart(); + } // check also if end of read will go beyond reference context - if (stop > ref.getWindow().getStop()) - stop = ref.getWindow().getStop(); + if (stopLocationInRefForHaplotypes > ref.getWindow().getStop()) { + //numEndClippedBases += stopLocationInRefForHaplotypes - ref.getWindow().getStop(); + stopLocationInRefForHaplotypes = ref.getWindow().getStop(); + } // if there's an insertion in the read, the read stop position will 
be less than start + read legnth, // but we want to compute likelihoods in the whole region that a read might overlap - if (stop <= start + readLength) { - stop = start + readLength-1; + if (stopLocationInRefForHaplotypes <= startLocationInRefForHaplotypes + readLength) { + stopLocationInRefForHaplotypes = startLocationInRefForHaplotypes + readLength-1; } // ok, we now figured out total number of clipped bases on both ends. // Figure out where we want to place the haplotype to score read against - /* - if (DEBUG) - System.out.format("numStartClippedBases: %d numEndClippedBases: %d WinStart:%d WinStop:%d start: %d stop: %d readLength: %d\n", - numStartClippedBases, numEndClippedBases, ref.getWindow().getStart(), ref.getWindow().getStop(), start, stop, read.getReadLength()); - */ + + if (DEBUG) + System.out.format("numStartClippedBases: %d numEndClippedBases: %d WinStart:%d WinStop:%d start: %d stop: %d readLength: %d\n", + numStartClippedBases, numEndClippedBases, ref.getWindow().getStart(), ref.getWindow().getStop(), startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes, read.getReadLength()); LinkedHashMap readEl = new LinkedHashMap(); + /** + * Check if we'll end up with an empty read once all clipping is done + */ if (numStartClippedBases + numEndClippedBases >= unclippedReadBases.length) { int j=0; for (Allele a: haplotypeMap.keySet()) { @@ -537,69 +347,67 @@ public class PairHMMIndelErrorModel { // initialize path metric and traceback memories for likelihood computation double[][] matchMetricArray = null, XMetricArray = null, YMetricArray = null; byte[] previousHaplotypeSeen = null; - double[] previousGOP = null; - double[] previousGCP = null; - int startIdx; + final byte[] contextLogGapOpenProbabilities = new byte[readBases.length]; + final byte[] contextLogGapContinuationProbabilities = new byte[readBases.length]; + + // get homopolymer length profile for current haplotype + int[] hrunProfile = new int[readBases.length]; + 
getContextHomopolymerLength(readBases,hrunProfile); + fillGapProbabilities(hrunProfile, contextLogGapOpenProbabilities, contextLogGapContinuationProbabilities); + + for (Allele a: haplotypeMap.keySet()) { - Haplotype haplotype = haplotypeMap.get(a); - if (stop > haplotype.getStopPosition()) - stop = haplotype.getStopPosition(); - if (start < haplotype.getStartPosition()) - start = haplotype.getStartPosition(); + if (stopLocationInRefForHaplotypes > haplotype.getStopPosition()) + stopLocationInRefForHaplotypes = haplotype.getStopPosition(); - // cut haplotype bases - long indStart = start - haplotype.getStartPosition(); - long indStop = stop - haplotype.getStartPosition(); + if (startLocationInRefForHaplotypes < haplotype.getStartPosition()) + startLocationInRefForHaplotypes = haplotype.getStartPosition(); + + final long indStart = startLocationInRefForHaplotypes - haplotype.getStartPosition(); + final long indStop = stopLocationInRefForHaplotypes - haplotype.getStartPosition(); double readLikelihood; if (DEBUG) System.out.format("indStart: %d indStop: %d WinStart:%d WinStop:%d start: %d stop: %d readLength: %d C:%s\n", - indStart, indStop, ref.getWindow().getStart(), ref.getWindow().getStop(), start, stop, read.getReadLength(), read.getCigar().toString()); + indStart, indStop, ref.getWindow().getStart(), ref.getWindow().getStop(), startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes, read.getReadLength(), read.getCigar().toString()); + + - if (indStart < 0 || indStop >= haplotype.getBases().length || indStart > indStop) { - // read spanned more than allowed reference context: we currently can't deal with this - readLikelihood =0; - } else - { final byte[] haplotypeBases = Arrays.copyOfRange(haplotype.getBases(), (int)indStart, (int)indStop); - if (matchMetricArray == null) { - final int X_METRIC_LENGTH = readBases.length+1; - final int Y_METRIC_LENGTH = haplotypeBases.length+1; + final int X_METRIC_LENGTH = readBases.length+2; + final int Y_METRIC_LENGTH 
= haplotypeBases.length+2; + if (matchMetricArray == null) { + //no need to reallocate arrays for each new haplotype, as length won't change matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + + + PairHMM.initializeArrays(matchMetricArray, XMetricArray, YMetricArray, X_METRIC_LENGTH); } - final double[] currentContextGOP = Arrays.copyOfRange(gapOpenProbabilityMap.get(a), (int)indStart, (int)indStop); - final double[] currentContextGCP = Arrays.copyOfRange(gapContProbabilityMap.get(a), (int)indStart, (int)indStop); - if (previousHaplotypeSeen == null) - startIdx = 0; - else { - final int s1 = computeFirstDifferingPosition(haplotypeBases, previousHaplotypeSeen); - final int s2 = computeFirstDifferingPosition(currentContextGOP, previousGOP); - final int s3 = computeFirstDifferingPosition(currentContextGCP, previousGCP); - startIdx = Math.min(Math.min(s1, s2), s3); - } + + int startIndexInHaplotype = 0; + if (previousHaplotypeSeen != null) + startIndexInHaplotype = computeFirstDifferingPosition(haplotypeBases, previousHaplotypeSeen); previousHaplotypeSeen = haplotypeBases.clone(); - previousGOP = currentContextGOP.clone(); - previousGCP = currentContextGCP.clone(); + readLikelihood = pairHMM.computeReadLikelihoodGivenHaplotype(haplotypeBases, readBases, readQuals, + contextLogGapOpenProbabilities, contextLogGapOpenProbabilities, contextLogGapContinuationProbabilities, + startIndexInHaplotype, matchMetricArray, XMetricArray, YMetricArray); - readLikelihood = computeReadLikelihoodGivenHaplotypeAffineGaps(haplotypeBases, readBases, readQuals, - currentContextGOP, currentContextGCP, startIdx, matchMetricArray, XMetricArray, YMetricArray); if (DEBUG) { System.out.println("H:"+new String(haplotypeBases)); System.out.println("R:"+new String(readBases)); System.out.format("L:%4.2f\n",readLikelihood); - System.out.format("StPos:%d\n", 
startIdx); + System.out.format("StPos:%d\n", startIndexInHaplotype); } - } readEl.put(a,readLikelihood); readLikelihoods[readIdx][j++] = readLikelihood; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java index 424e05c20..a831ec0a6 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java @@ -31,18 +31,13 @@ import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.commandline.RodBinding; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.filters.BadCigarFilter; -import org.broadinstitute.sting.gatk.filters.BadMateFilter; -import org.broadinstitute.sting.gatk.filters.MappingQualityZeroFilter; -import org.broadinstitute.sting.gatk.filters.Platform454Filter; +import org.broadinstitute.sting.gatk.filters.*; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.pileup.ExtendedEventPileupElement; import org.broadinstitute.sting.utils.pileup.PileupElement; -import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.variantcontext.VariantContext; @@ -99,7 +94,7 @@ import java.util.TreeSet; * * @author ebanks */ -@ReadFilters({Platform454Filter.class, MappingQualityZeroFilter.class, BadCigarFilter.class}) +@ReadFilters({MappingQualityZeroFilter.class, MappingQualityUnavailableFilter.class, 
BadMateFilter.class, Platform454Filter.class, BadCigarFilter.class}) @Reference(window=@Window(start=-1,stop=50)) @Allows(value={DataSource.READS, DataSource.REFERENCE}) @By(DataSource.REFERENCE) @@ -142,16 +137,17 @@ public class RealignerTargetCreator extends RodWalker 0.0 && mismatchThreshold <= 1.0; } public Event map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { @@ -162,17 +158,6 @@ public class RealignerTargetCreator extends RodWalker 0 ) { - hasIndel = hasInsertion = true; - // check the ends of the reads to see how far they extend - for (ExtendedEventPileupElement p : pileup.toExtendedIterable() ) - furthestStopPos = Math.max(furthestStopPos, p.getRead().getAlignmentEnd()); - } - } - // look at the rods for indels or SNPs if ( tracker != null ) { for ( VariantContext vc : tracker.getValues(known) ) { @@ -201,24 +186,24 @@ public class RealignerTargetCreator extends RodWalker 0.0 && - mismatchThreshold <= 1.0 && + if ( lookForMismatchEntropy && pileup.getNumberOfElements() >= minReadsAtLocus && (double)mismatchQualities / (double)totalQualities >= mismatchThreshold ) hasPointEvent = true; @@ -244,8 +228,6 @@ public class RealignerTargetCreator extends RodWalker samplesToPhase = null; + protected Set samplesToPhase = null; + + @Hidden + @Argument(fullName = "permitNoSampleOverlap", shortName = "permitNoSampleOverlap", doc = "Don't exit (just WARN) when the VCF and BAMs do not overlap in samples", required = false) + private boolean permitNoSampleOverlap = false; + + @Argument(fullName = "respectPhaseInInput", shortName = "respectPhaseInInput", doc = "Will only phase genotypes in cases where the resulting output will necessarily be consistent with any existing phase (for example, from trios)", required = false) + private boolean respectPhaseInInput = false; private GenomeLoc mostDownstreamLocusReached = null; @@ -205,12 +213,18 @@ public class ReadBackedPhasingWalker extends RodWalker rodNameToHeader = 
getVCFHeadersFromRods(getToolkit(), Arrays.asList(trackName)); - Set samples = new TreeSet(samplesToPhase == null ? rodNameToHeader.get(trackName).getGenotypeSamples() : samplesToPhase); - writer.writeHeader(new VCFHeader(hInfo, samples)); - } + Set vcfSamples = new TreeSet(samplesToPhase == null ? rodNameToHeader.get(trackName).getGenotypeSamples() : samplesToPhase); + writer.writeHeader(new VCFHeader(hInfo, vcfSamples)); - public boolean generateExtendedEvents() { - return false; + Set readSamples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()); + readSamples.retainAll(vcfSamples); + if (readSamples.isEmpty()) { + String noPhaseString = "No common samples in VCF and BAM headers" + (samplesToPhase == null ? "" : " (limited to sampleToPhase parameters)") + ", so nothing could possibly be phased!"; + if (permitNoSampleOverlap) + logger.warn(noPhaseString); + else + throw new UserException(noPhaseString); + } } public PhasingStats reduceInit() { @@ -257,9 +271,6 @@ public class ReadBackedPhasingWalker extends RodWalker readsAtHetSites = null; + private void clearFields() { + hetGenotypes = null; + prevHetAndInteriorIt = null; + phasingSiteIndex = -1; + readsAtHetSites = null; + } + public boolean hasPreviousHets() { return phasingSiteIndex > 0; } @@ -498,12 +516,20 @@ public class ReadBackedPhasingWalker extends RodWalker + * Can also count the number of reads matching a given criterion using read filters (see the + * --read-filter command line argument). Simplest example of a read-backed analysis. + * + * + *

Input

+ *

+ * One or more BAM files. + *

+ * + *

Output

+ *

+ * Number of bases seen (the sum of the lengths of all reads). + *

+ * + *

Examples

+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T CountBases \
+ *   -o output.txt \
+ *   -I input.bam \
+ *   [-L input.intervals]
+ * 
+ * + */ +@Requires({DataSource.READS, DataSource.REFERENCE}) +public class CountBasesWalker extends ReadWalker { + public Integer map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker tracker) { + + return read.getReadLength(); + } + + public Long reduceInit() { return 0L; } + + public Long reduce(Integer value, Long sum) { + return (long) value + sum; + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReadEventsWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReadEventsWalker.java new file mode 100755 index 000000000..c5ab0426d --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReadEventsWalker.java @@ -0,0 +1,94 @@ +package org.broadinstitute.sting.gatk.walkers.qc; + +import net.sf.samtools.CigarOperator; +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; +import org.broadinstitute.sting.gatk.report.GATKReport; +import org.broadinstitute.sting.gatk.walkers.DataSource; +import org.broadinstitute.sting.gatk.walkers.ReadWalker; +import org.broadinstitute.sting.gatk.walkers.Requires; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; + +import java.io.PrintStream; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Map; + +/** + * Walks over the input data set, counting the number of reads ending in insertions/deletions or soft-clips + * + *

Input

+ *

+ * One or more BAM files. + *

+ * + *

Output

+ *

+ * A GATKReport table ("Events") with the number of observations of each CIGAR operator per position. + *

+ * + *

Examples

+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T CountReadEvents \
+ *   -o output.grp \
+ *   -I input.bam \
+ *   [-L input.intervals]
+ * 
+ */ + + +@Requires({DataSource.READS, DataSource.REFERENCE}) +public class CountReadEventsWalker extends ReadWalker> , Map>> { + @Output (doc = "GATKReport table output") + PrintStream out; + + public Map> map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker tracker) { + return ReadUtils.getCigarOperatorForAllBases(read); + } + + public Map> reduceInit() { + return new HashMap>(); + } + + public Map> reduce(Map> value, Map> sum) { + for (Map.Entry> entry : value.entrySet()) { + CigarOperator op = entry.getKey(); + ArrayList positions = entry.getValue(); + + for (int p : positions) { + Map operatorCount = sum.get(p); + if (operatorCount == null) { + operatorCount = new HashMap(); + sum.put(p, operatorCount); + } + + Long count = operatorCount.get(op); + if (count == null) + count = 0L; + count++; + operatorCount.put(op, count); + } + } + return sum; + } + + @Override + public void onTraversalDone(Map> result) { + GATKReport report = GATKReport.newSimpleReport("Events", "Position", "Event", "Observations"); + for (Map.Entry> entry : result.entrySet()) { + int position = entry.getKey(); + Map operatorCount = entry.getValue(); + + for (Map.Entry subEntry: operatorCount.entrySet()) { + String operator = subEntry.getKey().name(); + Long observations = subEntry.getValue(); + report.addRow(position, operator, observations); + } + } + report.print(out); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountTerminusEventWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountTerminusEventWalker.java new file mode 100755 index 000000000..9208cbae8 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountTerminusEventWalker.java @@ -0,0 +1,70 @@ +package org.broadinstitute.sting.gatk.walkers.qc; + +import net.sf.samtools.CigarElement; +import net.sf.samtools.CigarOperator; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import 
org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.DataSource; +import org.broadinstitute.sting.gatk.walkers.ReadWalker; +import org.broadinstitute.sting.gatk.walkers.Requires; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; + +import java.util.List; + +/** + * Walks over the input data set, counting the number of reads ending in insertions/deletions or soft-clips + * + *

Input

+ *

+ * One or more BAM files. + *

+ * + *

Output

+ *

+ * Number of reads ending in each category. + *

+ * + *

Examples

+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T CountTerminusEvent \
+ *   -o output.txt \
+ *   -I input.bam \
+ *   [-L input.intervals]
+ * 
+ */ +@Requires({DataSource.READS, DataSource.REFERENCE}) +public class CountTerminusEventWalker extends ReadWalker, Pair> { + public Pair map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker tracker) { + List cigarElements = read.getCigar().getCigarElements(); + + CigarElement lastElement = null; + for (CigarElement element : cigarElements) { + if (element.getOperator() != CigarOperator.HARD_CLIP) + lastElement = element; + } + + if (lastElement == null) + throw new UserException.MalformedBAM(read, "read does not have any bases, it's all hard clips"); + + long endsInIndel = lastElement.getOperator() == CigarOperator.INSERTION || lastElement.getOperator() == CigarOperator.DELETION? 1 : 0; + long endsInSC = lastElement.getOperator() == CigarOperator.SOFT_CLIP ? 1 : 0; + + return new Pair(endsInIndel, endsInSC); + } + + public Pair reduceInit() { return new Pair(0L, 0L); } + + public Pair reduce(Pair value, Pair sum) { + sum.set(sum.getFirst() + value.getFirst(), sum.getSecond() + value.getSecond()); + return sum; + } + + @Override + public void onTraversalDone(Pair result) { + System.out.println(String.format("\tReads ending in indels : %d\n\tReads ending in soft-clips: %d\n", result.getFirst(), result.getSecond())); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/PrintLocusContextWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/PrintLocusContextWalker.java deleted file mode 100755 index ac0b3e7d5..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/PrintLocusContextWalker.java +++ /dev/null @@ -1,56 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.qc; - -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.LocusWalker; -import 
org.broadinstitute.sting.gatk.walkers.TreeReducible; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; - -import java.io.PrintStream; -import java.util.Arrays; -import java.util.List; - -/** - * At each locus in the input data set, prints the reference base, genomic location, and - * all aligning reads in a compact but human-readable form. - */ -public class PrintLocusContextWalker extends LocusWalker implements TreeReducible { - @Output - private PrintStream out; - - public AlignmentContext map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - out.printf( "In map: ref = %s, loc = %s %s, reads = %s%n", ref.getBaseAsChar(), - context.getLocation(), - context.hasExtendedEventPileup() ? "[extended]" : "", - Arrays.deepToString( getReadNames(context.getReads()) ) ); - return context; - } - - - - public Integer reduceInit() { return 0; } - - public Integer reduce(AlignmentContext context, Integer sum) { - return sum + 1; - } - - public Integer treeReduce(Integer lhs, Integer rhs) { - return lhs + rhs; - } - - private String[] getReadNames( List reads ) { - String[] readNames = new String[ reads.size() ]; - for( int i = 0; i < reads.size(); i++ ) { - readNames[i] = String.format("%nname = %s, start = %d, end = %d", reads.get(i).getReadName(), reads.get(i).getAlignmentStart(), reads.get(i).getAlignmentEnd()); - } - //Arrays.sort(readNames); - return readNames; - } - - @Override - public boolean generateExtendedEvents() { - return true; - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/EmpiricalQual.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/EmpiricalQual.java new file mode 100755 index 000000000..e9bfa3513 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/EmpiricalQual.java @@ -0,0 +1,55 @@ +package org.broadinstitute.sting.gatk.walkers.recalibration; + +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby 
granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/** + * Created by IntelliJ IDEA. + * User: carneiro + * Date: Mar 22, 2012 + * + * Object that holds the empirical quality and estimated reported quality values for on-the-fly recalibration. 
This is a simplification of the RecalDatum object + */ + +public class EmpiricalQual { + + private double estimatedQReported; // estimated reported quality score based on combined data's individual q-reporteds and number of observations + private double empiricalQuality; // the empirical quality for datums that have been collapsed together (by read group and reported quality, for example) + + private EmpiricalQual() {} + + public EmpiricalQual(final double estimatedQReported, final double empiricalQuality) { + this.estimatedQReported = estimatedQReported; + this.empiricalQuality = empiricalQuality; + } + + public final double getEstimatedQReported() { + return estimatedQReported; + } + + public final double getEmpiricalQuality() { + return empiricalQuality; + } +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDatum.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDatum.java index adc352b1b..aa9098549 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDatum.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDatum.java @@ -109,4 +109,10 @@ public class RecalDatum extends RecalDatumOptimized { private double qualToErrorProb( final double qual ) { return Math.pow(10.0, qual / -10.0); } + + + @Override + public String toString() { + return String.format("%d,%d,%d,%d", numObservations, numMismatches, (byte) Math.floor(getEmpiricalQuality()), (byte) Math.floor(getEstimatedQReported())); + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/GenotypeAndValidateWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/GenotypeAndValidateWalker.java index f370e2818..c985d26b9 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/GenotypeAndValidateWalker.java +++ 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/GenotypeAndValidateWalker.java @@ -379,12 +379,12 @@ public class GenotypeAndValidateWalker extends RodWalker { @Argument(doc="Only output valid sequences.",fullName="onlyOutputValidAmplicons",required=false) boolean onlyOutputValidAmplicons = false; + /** + * If ignoreComplexEvents is true, the output fasta file will contain only sequences coming from SNPs and Indels. + * Complex substitutions will be ignored. + */ + @Argument(doc="Ignore complex genomic records.",fullName="ignoreComplexEvents",required=false) + boolean ignoreComplexEvents = false; + /** * BWA single-end alignment is used as a primer specificity proxy. Low-complexity regions (that don't align back to themselves as a best hit) are lowercased. * This changes the size of the k-mer used for alignment. @@ -146,6 +153,7 @@ public class ValidationAmplicons extends RodWalker { StringBuilder rawSequence; boolean sequenceInvalid; boolean isSiteSNP; + boolean isSiteIndel; List invReason; int indelCounter; @@ -244,6 +252,7 @@ public class ValidationAmplicons extends RodWalker { } else if ( validate != null ) { // record variant type in case it's needed in output format isSiteSNP = (validate.isSNP()); + isSiteIndel = (validate.isIndel()); // doesn't matter if there's a mask here too -- this is what we want to validate if ( validate.isFiltered() ) { logger.warn("You are attempting to validate a filtered site. Why are you attempting to validate a filtered site? You should not be attempting to validate a filtered site."); @@ -504,6 +513,9 @@ public class ValidationAmplicons extends RodWalker { } + if (ignoreComplexEvents && !isSiteIndel && !isSiteSNP) + return; + if (!onlyOutputValidAmplicons || !sequenceInvalid) { String seqIdentity = sequence.toString().replace('n', 'N').replace('i', 'I').replace('d', 'D'); if (sequenomOutput) { @@ -512,7 +524,7 @@ public class ValidationAmplicons extends RodWalker { out.printf("%s_%s %s%n", allelePos != null ? 
allelePos.toString() : "multiple", probeName, seqIdentity); } else if (ilmnOutput) { - String type = isSiteSNP?"SNP":"INDEL"; + String type = isSiteSNP?"SNP":(isSiteIndel?"INDEL":"OTHER"); seqIdentity = seqIdentity.replace("*",""); // no * in ref allele out.printf("%s,%s,%s,%s,%d,37,1000G,ExomePhase1,Forward,Plus,FALSE%n",probeName,type,seqIdentity,allelePos.getContig(),allelePos.getStart()); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GLBasedSampleSelector.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GLBasedSampleSelector.java index ff3fe6506..3e48520a7 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GLBasedSampleSelector.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GLBasedSampleSelector.java @@ -25,19 +25,13 @@ package org.broadinstitute.sting.gatk.walkers.validation.validationsiteselector; import org.broadinstitute.sting.gatk.walkers.genotyper.AlleleFrequencyCalculationResult; import org.broadinstitute.sting.gatk.walkers.genotyper.ExactAFCalculationModel; -import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import java.util.HashMap; -import java.util.List; -import java.util.Map; import java.util.TreeSet; public class GLBasedSampleSelector extends SampleSelector { - Map numAllelePriorMatrix = new HashMap(); + double[] flatPriors = null; double referenceLikelihood; public GLBasedSampleSelector(TreeSet sm, double refLik) { super(sm); @@ -53,9 +47,11 @@ public class GLBasedSampleSelector extends SampleSelector { // now check to see (using EXACT model) whether this should be variant // do we want to apply a prior? maybe user-spec? 
- double[][] flatPrior = createFlatPrior(vc.getAlleles()); - AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(vc.getAlternateAlleles().size(),2*samples.size()); - ExactAFCalculationModel.linearExactMultiAllelic(subContext.getGenotypes(),vc.getAlternateAlleles().size(),flatPrior,result,true); + if ( flatPriors == null ) { + flatPriors = new double[1+2*samples.size()]; + } + AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(vc.getAlternateAlleles().size()); + ExactAFCalculationModel.linearExactMultiAllelic(subContext.getGenotypes(),vc.getAlternateAlleles().size(),flatPriors,result); // do we want to let this qual go up or down? if ( result.getLog10PosteriorOfAFzero() < referenceLikelihood ) { return true; @@ -63,12 +59,4 @@ public class GLBasedSampleSelector extends SampleSelector { return false; } - - private double[][] createFlatPrior(List alleles) { - if ( ! numAllelePriorMatrix.containsKey(alleles.size()) ) { - numAllelePriorMatrix.put(alleles.size(), new double[alleles.size()][1+2*samples.size()]); - } - - return numAllelePriorMatrix.get(alleles.size()); - } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalReportWriter.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalReportWriter.java new file mode 100644 index 000000000..8887e3c4f --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalReportWriter.java @@ -0,0 +1,200 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do 
so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.varianteval; + +import org.broadinstitute.sting.gatk.report.GATKReport; +import org.broadinstitute.sting.gatk.report.GATKReportTable; +import org.broadinstitute.sting.gatk.walkers.varianteval.evaluators.VariantEvaluator; +import org.broadinstitute.sting.gatk.walkers.varianteval.stratifications.VariantStratifier; +import org.broadinstitute.sting.gatk.walkers.varianteval.stratifications.manager.StratificationManager; +import org.broadinstitute.sting.gatk.walkers.varianteval.util.Analysis; +import org.broadinstitute.sting.gatk.walkers.varianteval.util.AnalysisModuleScanner; +import org.broadinstitute.sting.gatk.walkers.varianteval.util.DataPoint; +import org.broadinstitute.sting.gatk.walkers.varianteval.util.EvaluationContext; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.StingException; + +import java.io.PrintStream; +import java.lang.reflect.Field; +import java.util.Collection; +import java.util.List; +import java.util.Map; + +/** + * Class for writing the GATKReport for VariantEval + * + * Accepts a fulled evaluated (i.e., there's no more data coming) set of stratifications and evaluators + * and supports writing 
out the data in these evaluators to a GATKReport. + */ +public class VariantEvalReportWriter { + private final GATKReport report; + private final StratificationManager stratManager; + + public VariantEvalReportWriter(final StratificationManager stratManager, + final Collection stratifiers, + final Collection evaluators) { + this.stratManager = stratManager; + this.report = initializeGATKReport(stratifiers, evaluators); + } + + /** + * The business end of the class. Writes out the data in the provided stratManager + * to the PrintStream out + * + * @param out + */ + public final void writeReport(final PrintStream out) { + for ( int key = 0; key < stratManager.size(); key++ ) { + final String stratStateString = stratManager.getStratsAndStatesStringForKey(key); + final List> stratsAndStates = stratManager.getStratsAndStatesForKey(key); + final EvaluationContext nec = stratManager.get(key); + + for ( final VariantEvaluator ve : nec.getVariantEvaluators() ) { + final GATKReportTable table = report.getTable(ve.getSimpleName()); + + final AnalysisModuleScanner scanner = new AnalysisModuleScanner(ve); + final Map datamap = scanner.getData(); + try { + if ( scanner.hasMoltenField() ) { + final Field field = scanner.getMoltenField(); + final Object fieldValue = field.get(ve); + + if ( fieldValue == null || ! 
(fieldValue instanceof Map) ) + throw new ReviewedStingException("BUG field " + field.getName() + " must be a non-null instance of Map in " + scanner.getAnalysis().name()); + final Map map = (Map)fieldValue; + if ( map.isEmpty() ) + throw new ReviewedStingException("BUG: map is null or empty in analysis " + scanner.getAnalysis()); + + int counter = 0; // counter is used to ensure printing order is as defined by entrySet + for ( Map.Entry keyValue : map.entrySet() ) { + // "%05d" is a terrible hack to ensure sort order + final String moltenStratStateString = stratStateString + String.format("%05d", counter++); + setStratificationColumns(table, moltenStratStateString, stratsAndStates); + table.set(moltenStratStateString, scanner.getMoltenAnnotation().variableName(), keyValue.getKey()); + table.set(moltenStratStateString, scanner.getMoltenAnnotation().valueName(), keyValue.getValue()); + } + } else { + setStratificationColumns(table, stratStateString, stratsAndStates); + for ( final Field field : datamap.keySet()) { + table.set(stratStateString, field.getName(), field.get(ve)); + } + } + } catch (IllegalAccessException e) { + throw new ReviewedStingException("BUG: analysis field not public: " + e); + } + } + } + + report.print(out); + } + + /** + * Common utility to configure a GATKReportTable columns + * + * Sets the column names to the strat names in stratsAndStates for the primary key in table + * + * @param table + * @param primaryKey + * @param stratsAndStates + */ + private void setStratificationColumns(final GATKReportTable table, + final String primaryKey, + final List> stratsAndStates) { + for ( final Pair stratAndState : stratsAndStates ) { + final VariantStratifier vs = stratAndState.getFirst(); + final String columnName = vs.getName(); + final Object strat = stratAndState.getSecond(); + if ( columnName == null || strat == null ) + throw new ReviewedStingException("Unexpected null variant stratifier state at " + table + " key = " + primaryKey); + 
table.set(primaryKey, columnName, strat); + } + } + + /** + * Initialize the output report + * + * We have a set of stratifiers and evaluation objects. We need to create tables that look like: + * + * strat1 strat2 ... stratN eval1.field1 eval1.field2 ... eval1.fieldM + * + * for each eval. + * + * Note that this procedure doesn't support the creation of the old TableType system. As the + * VariantEvaluators are effectively tables themselves, we require authors to just create new + * evaluation modules externally instead of allow them to embed them in other evaluation modules + * + * @return an initialized report object + */ + private GATKReport initializeGATKReport(final Collection stratifiers, + final Collection evaluators) { + final GATKReport report = new GATKReport(); + + for (final VariantEvaluator ve : evaluators) { + // create the table + final String tableName = ve.getSimpleName(); + final String tableDesc = ve.getClass().getAnnotation(Analysis.class).description(); + report.addTable(tableName, tableDesc, true); + + // grab the table, and add the columns we need to it + final GATKReportTable table = report.getTable(tableName); + table.addPrimaryKey("entry", false); + table.addColumn(tableName, tableName); + + // first create a column to hold each stratifier state + for (final VariantStratifier vs : stratifiers) { + final String columnName = vs.getName(); + table.addColumn(columnName, null, vs.getFormat()); + } + + final AnalysisModuleScanner scanner = new AnalysisModuleScanner(ve); + final Map datamap = scanner.getData(); + + if ( scanner.hasMoltenField() ) { + // deal with molten data + table.addColumn(scanner.getMoltenAnnotation().variableName(), true, scanner.getMoltenAnnotation().variableFormat()); + table.addColumn(scanner.getMoltenAnnotation().valueName(), true, scanner.getMoltenAnnotation().valueFormat()); + } else { + if ( datamap.isEmpty() ) + throw new ReviewedStingException("Datamap is empty for analysis " + scanner.getAnalysis()); + + // add 
DataPoint's for each field marked as such + for (final Field field : datamap.keySet()) { + try { + field.setAccessible(true); + + // this is an atomic value, add a column for it + final String format = datamap.get(field).format(); + table.addColumn(field.getName(), true, format); + } catch (SecurityException e) { + throw new StingException("SecurityException: " + e); + } + } + } + } + + return report; + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java index d18c7e10a..a73bc2c70 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java @@ -12,23 +12,23 @@ import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.report.GATKReport; -import org.broadinstitute.sting.gatk.report.GATKReportTable; import org.broadinstitute.sting.gatk.walkers.Reference; import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.gatk.walkers.TreeReducible; import org.broadinstitute.sting.gatk.walkers.Window; import org.broadinstitute.sting.gatk.walkers.varianteval.evaluators.VariantEvaluator; +import org.broadinstitute.sting.gatk.walkers.varianteval.stratifications.DynamicStratification; import org.broadinstitute.sting.gatk.walkers.varianteval.stratifications.IntervalStratification; import org.broadinstitute.sting.gatk.walkers.varianteval.stratifications.VariantStratifier; -import org.broadinstitute.sting.gatk.walkers.varianteval.util.*; +import 
org.broadinstitute.sting.gatk.walkers.varianteval.stratifications.manager.StratificationManager; +import org.broadinstitute.sting.gatk.walkers.varianteval.util.EvaluationContext; +import org.broadinstitute.sting.gatk.walkers.varianteval.util.SortableJexlVCMatchExp; +import org.broadinstitute.sting.gatk.walkers.varianteval.util.VariantEvalUtils; import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.StingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.VariantContext; @@ -38,7 +38,6 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; import java.io.File; import java.io.FileNotFoundException; import java.io.PrintStream; -import java.lang.reflect.Field; import java.util.*; /** @@ -93,6 +92,7 @@ import java.util.*; */ @Reference(window=@Window(start=-50, stop=50)) public class VariantEvalWalker extends RodWalker implements TreeReducible { + public static final String IS_SINGLETON_KEY = "ISSINGLETON"; @Output protected PrintStream out; @@ -117,6 +117,15 @@ public class VariantEvalWalker extends RodWalker implements Tr @ArgumentCollection protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection(); + /** + * Some analyses want to count overlap not with dbSNP (which is in general very open) but + * actually want to itemize their overlap specifically with a set of gold standard sites + * such as HapMap, OMNI, or the gold standard indels. 
Theis argument provides a mechanism + * for communicating which file to use + */ + @Input(fullName="goldStandard", shortName = "gold", doc="Evaluations that count calls at sites of true variation (e.g., indel calls) will use this argument as their gold standard for comparison", required=false) + public RodBinding goldStandard = null; + // Help arguments @Argument(fullName="list", shortName="ls", doc="List the available eval modules and exit", required=false) protected Boolean LIST = false; @@ -154,10 +163,6 @@ public class VariantEvalWalker extends RodWalker implements Tr @Argument(fullName="doNotUseAllStandardModules", shortName="noEV", doc="Do not use the standard modules by default (instead, only those that are specified with the -EV option)", required=false) protected Boolean NO_STANDARD_MODULES = false; - // Other arguments - @Argument(fullName="numSamples", shortName="ns", doc="Number of samples (used if no samples are available in the VCF file", required=false) - protected Integer NUM_SAMPLES = 0; - @Argument(fullName="minPhaseQuality", shortName="mpq", doc="Minimum phasing quality", required=false) protected double MIN_PHASE_QUALITY = 10.0; @@ -170,6 +175,9 @@ public class VariantEvalWalker extends RodWalker implements Tr @Argument(fullName="requireStrictAlleleMatch", shortName="strict", doc="If provided only comp and eval tracks with exactly matching reference and alternate alleles will be counted as overlapping", required=false) private boolean requireStrictAlleleMatch = false; + @Argument(fullName="keepAC0", shortName="keepAC0", doc="If provided, modules that track polymorphic sites will not require that a site have AC > 0 when the input eval has genotypes", required=false) + private boolean keepSitesWithAC0 = false; + /** * If true, VariantEval will treat -eval 1 -eval 2 as separate tracks from the same underlying * variant set, and evaluate the union of the results. Useful when you want to do -eval chr1.vcf -eval chr2.vcf etc. 
@@ -195,30 +203,27 @@ public class VariantEvalWalker extends RodWalker implements Tr private Set sampleNamesForEvaluation = new TreeSet(); private Set sampleNamesForStratification = new TreeSet(); - private int numSamples = 0; - - // The list of stratifiers and evaluators to use - private TreeSet stratificationObjects = null; - - // The set of all possible evaluation contexts - private HashMap evaluationContexts = null; // important stratifications private boolean byFilterIsEnabled = false; private boolean perSampleIsEnabled = false; - // Output report - private GATKReport report = null; - // Public constants private static String ALL_SAMPLE_NAME = "all"; + // the number of processed bp for this walker + long nProcessedLoci = 0; + // Utility class private final VariantEvalUtils variantEvalUtils = new VariantEvalUtils(this); // Ancestral alignments private IndexedFastaSequenceFile ancestralAlignments = null; + // The set of all possible evaluation contexts + StratificationManager stratManager; + //Set dynamicStratifications = Collections.emptySet(); + /** * Initialize the stratifications, evaluations, evaluation contexts, and reporting object */ @@ -249,7 +254,6 @@ public class VariantEvalWalker extends RodWalker implements Tr // Load the sample list sampleNamesForEvaluation.addAll(SampleUtils.getSamplesFromCommandLineInput(vcfSamples, SAMPLE_EXPRESSIONS)); - numSamples = NUM_SAMPLES > 0 ? 
NUM_SAMPLES : sampleNamesForEvaluation.size(); if (Arrays.asList(STRATIFICATIONS_TO_USE).contains("Sample")) { sampleNamesForStratification.addAll(sampleNamesForEvaluation); @@ -263,9 +267,13 @@ public class VariantEvalWalker extends RodWalker implements Tr } // Initialize the set of stratifications and evaluations to use - stratificationObjects = variantEvalUtils.initializeStratificationObjects(this, NO_STANDARD_STRATIFICATIONS, STRATIFICATIONS_TO_USE); - Set> evaluationObjects = variantEvalUtils.initializeEvaluationObjects(NO_STANDARD_MODULES, MODULES_TO_USE); - for ( VariantStratifier vs : getStratificationObjects() ) { + // The list of stratifiers and evaluators to use + final List stratificationObjects = variantEvalUtils.initializeStratificationObjects(NO_STANDARD_STRATIFICATIONS, STRATIFICATIONS_TO_USE); + final Set> evaluationClasses = variantEvalUtils.initializeEvaluationObjects(NO_STANDARD_MODULES, MODULES_TO_USE); + + checkForIncompatibleEvaluatorsAndStratifiers(stratificationObjects, evaluationClasses); + + for ( VariantStratifier vs : stratificationObjects ) { if ( vs.getName().equals("Filter") ) byFilterIsEnabled = true; else if ( vs.getName().equals("Sample") ) @@ -283,10 +291,7 @@ public class VariantEvalWalker extends RodWalker implements Tr } // Initialize the evaluation contexts - evaluationContexts = variantEvalUtils.initializeEvaluationContexts(stratificationObjects, evaluationObjects, null, null); - - // Initialize report table - report = variantEvalUtils.initializeGATKReport(stratificationObjects, evaluationObjects); + createStratificationStates(stratificationObjects, evaluationClasses); // Load ancestral alignments if (ancestralAlignmentsFile != null) { @@ -297,13 +302,36 @@ public class VariantEvalWalker extends RodWalker implements Tr } } - // initialize CNVs if ( knownCNVsFile != null ) { knownCNVsByContig = createIntervalTreeByContig(knownCNVsFile); } } + final void checkForIncompatibleEvaluatorsAndStratifiers( final List 
stratificationObjects, + Set> evaluationClasses) { + for ( final VariantStratifier vs : stratificationObjects ) { + for ( Class ec : evaluationClasses ) + if ( vs.getIncompatibleEvaluators().contains(ec) ) + throw new UserException.BadArgumentValue("ST and ET", + "The selected stratification " + vs.getName() + + " and evaluator " + ec.getSimpleName() + + " are incompatible due to combinatorial memory requirements." + + " Please disable one"); + } + } + + final void createStratificationStates(final List stratificationObjects, final Set> evaluationObjects) { + final List strats = new ArrayList(stratificationObjects); + stratManager = new StratificationManager(strats); + + logger.info("Creating " + stratManager.size() + " combinatorial stratification states"); + for ( int i = 0; i < stratManager.size(); i++ ) { + EvaluationContext ec = new EvaluationContext(this, evaluationObjects); + stratManager.set(i, ec); + } + } + public final Map> createIntervalTreeByContig(final IntervalBinding intervals) { final Map> byContig = new HashMap>(); @@ -325,15 +353,23 @@ public class VariantEvalWalker extends RodWalker implements Tr */ @Override public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - for ( NewEvaluationContext nec : evaluationContexts.values() ) { - synchronized (nec) { - nec.update0(tracker, ref, context); - } + // we track the processed bp and expose this for modules instead of wasting CPU power on calculating + // the same thing over and over in evals that want the processed bp + synchronized (this) { + nProcessedLoci += context.getSkippedBases() + (ref == null ? 0 : 1); } if (tracker != null) { String aastr = (ancestralAlignments == null) ? 
null : new String(ancestralAlignments.getSubsequenceAt(ref.getLocus().getContig(), ref.getLocus().getStart(), ref.getLocus().getStop()).getBases()); +// // update the dynamic stratifications +// for (final VariantContext vc : tracker.getValues(evals, ref.getLocus())) { +// // don't worry -- DynamicStratification only work with one eval object +// for ( final DynamicStratification ds : dynamicStratifications ) { +// ds.update(vc); +// } +// } + // --------- track --------- sample - VariantContexts - HashMap, HashMap>> evalVCs = variantEvalUtils.bindVariantContexts(tracker, ref, evals, byFilterIsEnabled, true, perSampleIsEnabled, mergeEvals); HashMap, HashMap>> compVCs = variantEvalUtils.bindVariantContexts(tracker, ref, comps, byFilterIsEnabled, false, false, false); @@ -367,19 +403,7 @@ public class VariantEvalWalker extends RodWalker implements Tr // find the comp final VariantContext comp = findMatchingComp(eval, compSet); - HashMap> stateMap = new HashMap>(); - for ( VariantStratifier vs : stratificationObjects ) { - List states = vs.getRelevantStates(ref, tracker, comp, compRod.getName(), eval, evalRod.getName(), sampleName); - stateMap.put(vs, states); - } - - ArrayList stateKeys = new ArrayList(); - variantEvalUtils.initializeStateKeys(stateMap, null, null, stateKeys); - - HashSet stateKeysHash = new HashSet(stateKeys); - - for ( StateKey stateKey : stateKeysHash ) { - NewEvaluationContext nec = evaluationContexts.get(stateKey); + for ( EvaluationContext nec : getEvaluationContexts(tracker, ref, eval, evalRod.getName(), comp, compRod.getName(), sampleName) ) { // eval against the comp synchronized (nec) { @@ -406,6 +430,57 @@ public class VariantEvalWalker extends RodWalker implements Tr return null; } + /** + * Given specific eval and comp VCs and the sample name, return an iterable + * over all of the applicable state keys. + * + * this code isn't structured yet for efficiency. 
Here we currently are + * doing the following inefficient algorithm: + * + * for each strat: + * get list of relevant states that eval and comp according to strat + * add this list of states to a list of list states + * + * then + * + * ask the strat manager to look up all of the keys associated with the combinations + * of these states. For example, suppose we have a single variant S. We have active + * strats EvalRod, CompRod, and Novelty. We produce a list that looks like: + * + * L = [[Eval], [Comp], [All, Novel]] + * + * We then go through the strat manager tree to produce the keys associated with these states: + * + * K = [0, 1] where EVAL x COMP x ALL = 0 and EVAL x COMP x NOVEL = 1 + * + * It's clear that a better + * + * TODO -- create an inline version that doesn't create the intermediate list of list + * + * @param tracker + * @param ref + * @param eval + * @param evalName + * @param comp + * @param compName + * @param sampleName + * @return + */ + protected Collection getEvaluationContexts(final RefMetaDataTracker tracker, + final ReferenceContext ref, + final VariantContext eval, + final String evalName, + final VariantContext comp, + final String compName, + final String sampleName ) { + final List> states = new LinkedList>(); + for ( final VariantStratifier vs : stratManager.getStratifiers() ) { + states.add(vs.getRelevantStates(ref, tracker, comp, compName, eval, evalName, sampleName)); + } + return stratManager.values(states); + } + + @Requires({"comp != null", "evals != null"}) private boolean compHasMatchingEval(final VariantContext comp, final Collection evals) { // find all of the matching comps @@ -454,7 +529,7 @@ public class VariantEvalWalker extends RodWalker implements Tr if ( lenientMatch == null ) lenientMatch = comp; break; case NO_MATCH: - ; + // do nothing } } @@ -477,109 +552,23 @@ public class VariantEvalWalker extends RodWalker implements Tr */ public void onTraversalDone(Integer result) { logger.info("Finalizing variant report"); 
- - for ( StateKey stateKey : evaluationContexts.keySet() ) { - NewEvaluationContext nec = evaluationContexts.get(stateKey); - - for ( VariantEvaluator ve : nec.getEvaluationClassList().values() ) { + + // go through the evaluations and finalize them + for ( final EvaluationContext nec : stratManager.values() ) + for ( final VariantEvaluator ve : nec.getVariantEvaluators() ) ve.finalizeEvaluation(); - - AnalysisModuleScanner scanner = new AnalysisModuleScanner(ve); - Map datamap = scanner.getData(); - - for (Field field : datamap.keySet()) { - try { - field.setAccessible(true); - - if (field.get(ve) instanceof TableType) { - TableType t = (TableType) field.get(ve); - - String subTableName = ve.getClass().getSimpleName() + "." + field.getName(); - final DataPoint dataPointAnn = datamap.get(field); - - GATKReportTable table; - if (!report.hasTable(subTableName)) { - report.addTable(subTableName, dataPointAnn.description()); - table = report.getTable(subTableName); - - table.addPrimaryKey("entry", false); - table.addColumn(subTableName, subTableName); - - for ( VariantStratifier vs : stratificationObjects ) { - table.addColumn(vs.getName(), "unknown"); - } - - table.addColumn("row", "unknown"); - - for ( Object o : t.getColumnKeys() ) { - String c; - - if (o instanceof String) { - c = (String) o; - } else { - c = o.toString(); - } - - table.addColumn(c, 0.0); - } - } else { - table = report.getTable(subTableName); - } - - for (int row = 0; row < t.getRowKeys().length; row++) { - String r = (String) t.getRowKeys()[row]; - - for ( VariantStratifier vs : stratificationObjects ) { - final String columnName = vs.getName(); - table.set(stateKey.toString() + r, columnName, stateKey.get(columnName)); - } - - for (int col = 0; col < t.getColumnKeys().length; col++) { - String c; - if (t.getColumnKeys()[col] instanceof String) { - c = (String) t.getColumnKeys()[col]; - } else { - c = t.getColumnKeys()[col].toString(); - } - - String newStateKey = stateKey.toString() + r; - 
table.set(newStateKey, c, t.getCell(row, col)); - - table.set(newStateKey, "row", r); - } - } - } else { - GATKReportTable table = report.getTable(ve.getClass().getSimpleName()); - - for ( VariantStratifier vs : stratificationObjects ) { - String columnName = vs.getName(); - - table.set(stateKey.toString(), columnName, stateKey.get(vs.getName())); - } - - table.set(stateKey.toString(), field.getName(), field.get(ve)); - } - } catch (IllegalAccessException e) { - throw new StingException("IllegalAccessException: " + e); - } - } - } - } - - report.print(out); + + final VariantEvalReportWriter writer = new VariantEvalReportWriter(stratManager, stratManager.getStratifiers(), stratManager.get(0).getVariantEvaluators()); + writer.writeReport(out); } // Accessors public Logger getLogger() { return logger; } - public int getNumSamples() { return numSamples; } - public double getMinPhaseQuality() { return MIN_PHASE_QUALITY; } public double getMendelianViolationQualThreshold() { return MENDELIAN_VIOLATION_QUAL_THRESHOLD; } - public TreeSet getStratificationObjects() { return stratificationObjects; } - public static String getAllSampleName() { return ALL_SAMPLE_NAME; } public List> getKnowns() { return knowns; } @@ -594,6 +583,10 @@ public class VariantEvalWalker extends RodWalker implements Tr public Set getJexlExpressions() { return jexlExpressions; } + public long getnProcessedLoci() { + return nProcessedLoci; + } + public Set getContigNames() { final TreeSet contigs = new TreeSet(); for( final SAMSequenceRecord r : getToolkit().getReferenceDataSource().getReference().getSequenceDictionary().getSequences()) { @@ -602,11 +595,15 @@ public class VariantEvalWalker extends RodWalker implements Tr return contigs; } - public GenomeLocParser getGenomeLocParser() { - return getToolkit().getGenomeLocParser(); - } - + /** + * getToolkit is protected, so we have to pseudo-overload it here so eval / strats can get the toolkit + * @return + */ public GenomeAnalysisEngine getToolkit() { 
return super.getToolkit(); } + + public boolean ignoreAC0Sites() { + return ! keepSitesWithAC0; + } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CompOverlap.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CompOverlap.java index 89d137ea9..c14754715 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CompOverlap.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CompOverlap.java @@ -19,23 +19,23 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContext; */ @Analysis(description = "The overlap between eval and comp sites") public class CompOverlap extends VariantEvaluator implements StandardEval { - @DataPoint(description = "number of eval SNP sites") - long nEvalVariants = 0; + @DataPoint(description = "number of eval variant sites", format = "%d") + public long nEvalVariants = 0; - @DataPoint(description = "number of eval sites outside of comp sites") - long novelSites = 0; + @DataPoint(description = "number of eval sites outside of comp sites", format = "%d") + public long novelSites = 0; - @DataPoint(description = "number of eval sites at comp sites") - long nVariantsAtComp = 0; + @DataPoint(description = "number of eval sites at comp sites", format = "%d") + public long nVariantsAtComp = 0; @DataPoint(description = "percentage of eval sites at comp sites", format = "%.2f" ) - double compRate = 0.0; + public double compRate = 0.0; - @DataPoint(description = "number of concordant sites") - long nConcordant = 0; + @DataPoint(description = "number of concordant sites", format = "%d") + public long nConcordant = 0; @DataPoint(description = "the concordance rate", format = "%.2f") - double concordantRate = 0.0; + public double concordantRate = 0.0; public int getComparisonOrder() { return 2; // we need to see each eval track and each comp track @@ -51,10 +51,6 @@ public 
class CompOverlap extends VariantEvaluator implements StandardEval { novelSites = nNovelSites(); } - public boolean enabled() { - return true; - } - /** * Returns true if every allele in eval is also in comp * @@ -71,7 +67,7 @@ public class CompOverlap extends VariantEvaluator implements StandardEval { return false; } - public String update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + public void update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { boolean evalIsGood = eval != null && eval.isPolymorphicInSamples(); boolean compIsGood = comp != null && comp.isNotFiltered(); @@ -84,7 +80,5 @@ public class CompOverlap extends VariantEvaluator implements StandardEval { nConcordant++; } } - - return null; // we don't capture any interesting sites } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java index e5e8dfaf5..c7392cff0 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java @@ -11,54 +11,51 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContext; @Analysis(description = "Counts different classes of variants in the sample") public class CountVariants extends VariantEvaluator implements StandardEval { - // the following fields are in output order: // basic counts on various rates found - @DataPoint(description = "Number of processed loci") + @DataPoint(description = "Number of processed loci", format = "%d") public long nProcessedLoci = 0; - @DataPoint(description = "Number of called loci") + @DataPoint(description = "Number of called loci", format = "%d") public long nCalledLoci = 0; - 
@DataPoint(description = "Number of reference loci") + @DataPoint(description = "Number of reference loci", format = "%d") public long nRefLoci = 0; - @DataPoint(description = "Number of variant loci") + @DataPoint(description = "Number of variant loci", format = "%d") public long nVariantLoci = 0; // the following two calculations get set in the finalizeEvaluation - @DataPoint(description = "Variants per loci rate") + @DataPoint(description = "Variants per loci rate", format = "%.8f") public double variantRate = 0; - @DataPoint(description = "Number of variants per base") + @DataPoint(description = "Number of variants per base", format = "%.8f") public double variantRatePerBp = 0; - - @DataPoint(description = "Number of snp loci") + @DataPoint(description = "Number of snp loci", format = "%d") public long nSNPs = 0; - @DataPoint(description = "Number of mnp loci") + @DataPoint(description = "Number of mnp loci", format = "%d") public long nMNPs = 0; - @DataPoint(description = "Number of insertions") + @DataPoint(description = "Number of insertions", format = "%d") public long nInsertions = 0; - @DataPoint(description = "Number of deletions") + @DataPoint(description = "Number of deletions", format = "%d") public long nDeletions = 0; - @DataPoint(description = "Number of complex indels") + @DataPoint(description = "Number of complex indels", format = "%d") public long nComplex = 0; - @DataPoint(description = "Number of symbolic events") + @DataPoint(description = "Number of symbolic events", format = "%d") public long nSymbolic = 0; - @DataPoint(description = "Number of mixed loci (loci that can't be classified as a SNP, Indel or MNP)") + @DataPoint(description = "Number of mixed loci (loci that can't be classified as a SNP, Indel or MNP)", format = "%d") public long nMixed = 0; - - @DataPoint(description = "Number of no calls loci") + @DataPoint(description = "Number of no calls loci", format = "%d") public long nNoCalls = 0; - @DataPoint(description = "Number of 
het loci") + @DataPoint(description = "Number of het loci", format = "%d") public long nHets = 0; - @DataPoint(description = "Number of hom ref loci") + @DataPoint(description = "Number of hom ref loci", format = "%d") public long nHomRef = 0; - @DataPoint(description = "Number of hom var loci") + @DataPoint(description = "Number of hom var loci", format = "%d") public long nHomVar = 0; - @DataPoint(description = "Number of singletons") + @DataPoint(description = "Number of singletons", format = "%d") public long nSingletons = 0; - @DataPoint(description = "Number of derived homozygotes") + @DataPoint(description = "Number of derived homozygotes", format = "%d") public long nHomDerived = 0; // calculations that get set in the finalizeEvaluation method @@ -72,8 +69,8 @@ public class CountVariants extends VariantEvaluator implements StandardEval { public double indelRate = 0; @DataPoint(description = "indel rate per base pair", format = "%.2f") public double indelRatePerBp = 0; - @DataPoint(description = "deletion to insertion ratio", format = "%.2f") - public double deletionInsertionRatio = 0; + @DataPoint(description = "insertion to deletion ratio", format = "%.2f") + public double insertionDeletionRatio = 0; private double perLocusRate(long n) { return rate(n, nProcessedLoci); @@ -83,19 +80,12 @@ public class CountVariants extends VariantEvaluator implements StandardEval { return inverseRate(n, nProcessedLoci); } - public boolean enabled() { - return true; - } public int getComparisonOrder() { return 1; // we only need to see each eval track } - public void update0(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - nProcessedLoci += context.getSkippedBases() + (ref == null ? 
0 : 1); - } - - public String update1(VariantContext vc1, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + public void update1(VariantContext vc1, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { nCalledLoci++; // Note from Eric: @@ -103,7 +93,7 @@ public class CountVariants extends VariantEvaluator implements StandardEval { // So in order to maintain consistency with the previous implementation (and the intention of the original author), I've // added in a proxy check for monomorphic status here. // Protect against case when vc only as no-calls too - can happen if we strafity by sample and sample as a single no-call. - if ( vc1.isMonomorphicInSamples() ) { + if ( getWalker().ignoreAC0Sites() && vc1.isMonomorphicInSamples() ) { nRefLoci++; } else { switch (vc1.getType()) { @@ -113,12 +103,12 @@ public class CountVariants extends VariantEvaluator implements StandardEval { case SNP: nVariantLoci++; nSNPs++; - if (vc1.getAttributeAsBoolean("ISSINGLETON", false)) nSingletons++; + if (variantWasSingleton(vc1)) nSingletons++; break; case MNP: nVariantLoci++; nMNPs++; - if (vc1.getAttributeAsBoolean("ISSINGLETON", false)) nSingletons++; + if (variantWasSingleton(vc1)) nSingletons++; break; case INDEL: nVariantLoci++; @@ -141,12 +131,9 @@ public class CountVariants extends VariantEvaluator implements StandardEval { } } - String refStr = vc1.getReference().getBaseString().toUpperCase(); - - String aaStr = vc1.hasAttribute("ANCESTRALALLELE") ? vc1.getAttributeAsString("ANCESTRALALLELE", null).toUpperCase() : null; -// if (aaStr.equals(".")) { -// aaStr = refStr; -// } + // these operations are ordered to ensure that we don't get the base string of the ref unless we need it + final String aaStr = vc1.hasAttribute("ANCESTRALALLELE") ? vc1.getAttributeAsString("ANCESTRALALLELE", null).toUpperCase() : null; + final String refStr = aaStr != null ? 
vc1.getReference().getBaseString().toUpperCase() : null; // ref aa alt class // A C A der homozygote @@ -189,11 +176,10 @@ public class CountVariants extends VariantEvaluator implements StandardEval { throw new ReviewedStingException("BUG: Unexpected genotype type: " + g); } } - - return null; // we don't capture any interesting sites } public void finalizeEvaluation() { + nProcessedLoci = getWalker().getnProcessedLoci(); variantRate = perLocusRate(nVariantLoci); variantRatePerBp = perLocusRInverseRate(nVariantLoci); heterozygosity = perLocusRate(nHets); @@ -201,6 +187,6 @@ public class CountVariants extends VariantEvaluator implements StandardEval { hetHomRatio = ratio(nHets, nHomVar); indelRate = perLocusRate(nDeletions + nInsertions + nComplex); indelRatePerBp = perLocusRInverseRate(nDeletions + nInsertions + nComplex); - deletionInsertionRatio = ratio(nDeletions, nInsertions); + insertionDeletionRatio = ratio(nInsertions, nDeletions); } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/GenotypeConcordance.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/GenotypeConcordance.java old mode 100755 new mode 100644 index 4f5aeed61..09315db73 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/GenotypeConcordance.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/GenotypeConcordance.java @@ -5,12 +5,8 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.varianteval.util.Analysis; -import org.broadinstitute.sting.gatk.walkers.varianteval.util.DataPoint; -import org.broadinstitute.sting.gatk.walkers.varianteval.util.TableType; -import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; -import 
org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.StingException; -import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.gatk.walkers.varianteval.util.Molten; +import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.VariantContext; @@ -41,271 +37,67 @@ import java.util.*; * THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -@Analysis(name = "Genotype Concordance", description = "Determine the genotype concordance between the genotypes in difference tracks") +/** + * a table of sample names to genotype concordance figures + */ +@Analysis(name = "Genotype Concordance Detailed", description = "Determine the genotype concordance between the genotypes in difference tracks, and concordance statistics") public class GenotypeConcordance extends VariantEvaluator { - private static final boolean PRINT_INTERESTING_SITES = true; - protected final static Logger logger = Logger.getLogger(GenotypeConcordance.class); - // a mapping from sample to stats - @DataPoint(description = "the detailed concordance statistics for each sample") - SampleStats detailedStats = null; + @Molten(variableFormat = "%s", valueFormat = "%s") + public final Map map = new TreeMap(); - // a mapping from sample to stats summary - @DataPoint(description = "the simplified concordance statistics for each sample") - SampleSummaryStats simplifiedStats = null; + // concordance counts + private final long[][] truthByCalledGenotypeCounts; - private static final int MAX_MISSED_VALIDATION_DATA = 100; - - private boolean discordantInteresting = false; - - static class FrequencyStats implements TableType { - class Stats { - public Stats(int found, int missed) { nFound = found; nMissed = missed; } - public long nFound = 0; - public long nMissed = 0; - } - public HashMap foundMissedByAC = new HashMap(); - - public 
Object[] getRowKeys() { - String rows[] = new String[foundMissedByAC.size()]; - int index = 0; - for (int i : foundMissedByAC.keySet()) rows[index++] = "AlleleCount_" + i; - return rows; - } - - public Object[] getColumnKeys() { - return new String[]{"number_found", "number_missing"}; - } - - public String getName() { - return "FrequencyStats"; - } - - public String getCell(int x, int y) { - if (x >= foundMissedByAC.size()) throw new IllegalStateException(x + " is greater than the max index of " + (foundMissedByAC.size()-1)); - if (y == 0) return String.valueOf(foundMissedByAC.get(foundMissedByAC.keySet().toArray(new Integer[foundMissedByAC.size()])[x]).nFound); - else return String.valueOf(foundMissedByAC.get(foundMissedByAC.keySet().toArray(new Integer[foundMissedByAC.size()])[x]).nMissed); - } - - public void incrementFoundCount(int alleleFreq) { - if (!foundMissedByAC.containsKey(alleleFreq)) - foundMissedByAC.put(alleleFreq,new Stats(1,0)); - else - foundMissedByAC.get(alleleFreq).nFound++; - } - - public void incrementMissedCount(int alleleFreq) { - if (!foundMissedByAC.containsKey(alleleFreq)) - foundMissedByAC.put(alleleFreq,new Stats(0,1)); - else - foundMissedByAC.get(alleleFreq).nMissed++; - } - } - - static class QualityScoreHistograms implements TableType { - final static int NUM_BINS = 20; - final HashMap truePositiveQualityScoreMap = new HashMap(); // A HashMap holds all the quality scores until we are able to bin them appropriately - final HashMap falsePositiveQualityScoreMap = new HashMap(); - final int truePositiveHist[] = new int[NUM_BINS]; // the final histograms that get reported out - final int falsePositiveHist[] = new int[NUM_BINS]; - final String[] rowKeys = new String[]{"true_positive_hist", "false_positive_hist"}; - - public Object[] getRowKeys() { - return rowKeys; - } - - public Object[] getColumnKeys() { - final String columnKeys[] = new String[NUM_BINS]; - for( int iii = 0; iii < NUM_BINS; iii++ ) { - columnKeys[iii] = "histBin" + 
iii; - } - return columnKeys; - } - - public String getName() { - return "QualityScoreHistogram"; - } - - public String getCell(int x, int y) { - if( x == 0 ) { - return String.valueOf(truePositiveHist[y]); - } else if ( x == 1 ) { - return String.valueOf(falsePositiveHist[y]); - } else { - throw new ReviewedStingException( "Unknown row in " + getName() + ", row = " + x ); - } - } - - public String toString() { - String returnString = ""; - // output both histogram arrays - returnString += "TP: "; - for( int iii = 0; iii < NUM_BINS; iii++ ) { - returnString += truePositiveHist[iii] + " "; - } - returnString += "\nFP: "; - for( int iii = 0; iii < NUM_BINS; iii++ ) { - returnString += falsePositiveHist[iii] + " "; - } - return returnString; - } - - public void incrValue( final double qual, final boolean isTruePositiveCall ) { - HashMap qualScoreMap; - if( isTruePositiveCall ) { - qualScoreMap = truePositiveQualityScoreMap; - } else { - qualScoreMap = falsePositiveQualityScoreMap; - } - final Integer qualKey = Math.round((float) qual); - if( qualScoreMap.containsKey(qualKey) ) { - qualScoreMap.put(qualKey, qualScoreMap.get(qualKey) + 1); - } else { - qualScoreMap.put(qualKey, 1); - } - } - - public void organizeHistogramTables() { - for( int iii = 0; iii < NUM_BINS; iii++ ) { - truePositiveHist[iii] = 0; - falsePositiveHist[iii] = 0; - } - - int maxQual = 0; - - // Calculate the maximum quality score for both TP and FP calls in order to normalize and histogram - for( final Integer qual : truePositiveQualityScoreMap.keySet()) { - if( qual > maxQual ) { - maxQual = qual; - } - } - for( final Integer qual : falsePositiveQualityScoreMap.keySet()) { - if( qual > maxQual ) { - maxQual = qual; - } - } - - final double binSize = ((double)maxQual) / ((double) (NUM_BINS-1)); //BUGBUG: should be normalized max to min, not max to 0 - - for( final Integer qual : truePositiveQualityScoreMap.keySet()) { - final int index = (int)Math.floor( ((double)qual) / binSize ); - if(index >= 
0) { //BUGBUG: problem when maxQual is zero? - truePositiveHist[ index ] += truePositiveQualityScoreMap.get(qual); - } - } - for( final Integer qual : falsePositiveQualityScoreMap.keySet()) { - final int index = (int)Math.floor( ((double)qual) / binSize ); - if(index >= 0) { - falsePositiveHist[ index ] += falsePositiveQualityScoreMap.get(qual); - } - } - } - } - - // keep a list of the validation data we saw before the first eval data - private HashSet missedValidationData = new HashSet(); - - - //public GenotypeConcordance(VariantEvalWalker parent) { - // super(parent); - // discordantInteresting = parent.DISCORDANT_INTERESTING; - //} - - public String getName() { - return "genotypeConcordance"; + /** + * Initialize this object + */ + public GenotypeConcordance() { + final int nGenotypeTypes = Genotype.Type.values().length; + truthByCalledGenotypeCounts = new long[nGenotypeTypes][nGenotypeTypes]; } + @Override public int getComparisonOrder() { - return 2; // we need to see each eval track and each comp track + return 2; } - public boolean enabled() { - return true; - } - - public String toString() { - return getName() + ":
"; - } - - private boolean warnedAboutValidationData = false; - - public String update2(VariantContext eval, VariantContext validation, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - String interesting = null; - + @Override + public void update2(VariantContext eval, VariantContext validation, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { // sanity check that we at least have either eval or validation data if ( (validation != null && !validation.hasGenotypes()) || eval == null && !isValidVC(validation)) { - return interesting; - } + return; + } else { + final boolean validationIsValidVC = isValidVC(validation); - if (detailedStats == null) { + // determine concordance for eval data if (eval != null) { - // initialize the concordance table - detailedStats = new SampleStats(eval,Genotype.Type.values().length); - simplifiedStats = new SampleSummaryStats(eval); - for (final VariantContext vc : missedValidationData) { - determineStats(null, vc); - } - missedValidationData = null; - } else { - // todo -- Eric, this results in a memory problem when eval is WEx data but you are using CG calls genome-wide - // todo -- perhaps you need should extend the evaluators with an initialize - // todo -- method that gets the header (or samples) for the first eval sites? 
- if (missedValidationData.size() > MAX_MISSED_VALIDATION_DATA) { - if (!warnedAboutValidationData) { - //logger.warn("Too many genotype sites missed before eval site appeared; ignoring"); - warnedAboutValidationData = true; + for (final Genotype g : eval.getGenotypes() ) { + final String sample = g.getSampleName(); + final Genotype.Type called = g.getType(); + final Genotype.Type truth; + + if (!validationIsValidVC || !validation.hasGenotype(sample)) { + truth = Genotype.Type.NO_CALL; + } else { + truth = validation.getGenotype(sample).getType(); } - } else { - missedValidationData.add(validation); + + incrValue(truth, called); } - return interesting; } - } - interesting = determineStats(eval, validation); + // otherwise, mark no-calls for all samples + else { + final Genotype.Type called = Genotype.Type.NO_CALL; - return interesting; // we don't capture any interesting sites - } + for (final Genotype g : validation.getGenotypes()) { + final Genotype.Type truth = g.getType(); + incrValue(truth, called); - private String determineStats(final VariantContext eval, final VariantContext validation) { - String interesting = null; - - final boolean validationIsValidVC = isValidVC(validation); - final String evalAC = ( vcHasGoodAC(eval) ) ? String.format("evalAC%d",getAC(eval)) : null ; - final String validationAC = ( vcHasGoodAC(validation) ) ? 
String.format("compAC%d",getAC(validation)) : null; - - // determine concordance for eval data - if (eval != null) { - for (final Genotype g : eval.getGenotypes() ) { - final String sample = g.getSampleName(); - final Genotype.Type called = g.getType(); - final Genotype.Type truth; - - if (!validationIsValidVC || !validation.hasGenotype(sample)) { - truth = Genotype.Type.NO_CALL; - } else { - truth = validation.getGenotype(sample).getType(); - // interesting = "ConcordanceStatus=FP"; - if (discordantInteresting && truth.ordinal() != called.ordinal()) - { - interesting = "ConcordanceStatus=" + truth + "/" + called; - } - } - - detailedStats.incrValue(sample, truth, called); - } - } - // otherwise, mark no-calls for all samples - else { - final Genotype.Type called = Genotype.Type.NO_CALL; - - for (final Genotype g : validation.getGenotypes()) { - final Genotype.Type truth = g.getType(); - detailedStats.incrValue(g.getSampleName(), truth, called); - - // print out interesting sites - /* + // print out interesting sites + /* if ( PRINT_INTERESTING_SITES && super.getVEWalker().gcLog != null ) { if ( (truth == Genotype.Type.HOM_VAR || truth == Genotype.Type.HET) && called == Genotype.Type.NO_CALL ) { super.getVEWalker().gcLog.printf("%s FN %s%n", group, validation); @@ -315,292 +107,120 @@ public class GenotypeConcordance extends VariantEvaluator { } } */ + } } } - - return interesting; } private static boolean isValidVC(final VariantContext vc) { return (vc != null && !vc.isFiltered()); } - public void finalizeEvaluation() { - if( simplifiedStats != null && detailedStats != null ) { - simplifiedStats.generateSampleSummaryStats(detailedStats); - } - } - - private boolean vcHasGoodAC(VariantContext vc) { - return ( vc != null && vc.getAlternateAlleles().size() == 1 && vc.hasAttribute(VCFConstants.ALLELE_COUNT_KEY) ); - - } - - private int getAC(VariantContext vc) { - if ( List.class.isAssignableFrom(vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY).getClass()) ) { - return 
((List) vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY)).get(0); - } else if ( Integer.class.isAssignableFrom(vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY).getClass())) { - return (Integer) vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY); - } else if ( String.class.isAssignableFrom(vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY).getClass()) ) { - // two ways of parsing - String ac = (String) vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY); - if ( ac.startsWith("[") ) { - return Integer.parseInt(ac.replaceAll("\\[","").replaceAll("\\]","")); - } else { - try { - return Integer.parseInt(ac); - } catch ( NumberFormatException e ) { - throw new UserException(String.format("The format of the AC field is improperly formatted: AC=%s",ac)); - } - } - } else { - throw new UserException(String.format("The format of the AC field does not appear to be of integer-list or String format, class was %s",vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY).getClass())); - } - } -} - -/** - * a table of sample names to genotype concordance figures - */ -class SampleStats implements TableType { - private final int nGenotypeTypes; - - // sample to concordance stats object - public final HashMap concordanceStats = new HashMap(); - - /** - * - * @return one row per sample - */ - public Object[] getRowKeys() { - return concordanceStats.keySet().toArray(new String[concordanceStats.size()]); - } - /** * increment the specified value - * @param sample the sample name * @param truth the truth type * @param called the called type */ - public void incrValue(String sample, Genotype.Type truth, Genotype.Type called) { - if ( concordanceStats.containsKey(sample) ) - concordanceStats.get(sample)[truth.ordinal()][called.ordinal()]++; - else if ( called != Genotype.Type.NO_CALL ) - throw new UserException.CommandLineException("Sample " + sample + " has not been seen in a previous eval; this analysis module assumes that all samples are present in each variant context"); + private void incrValue(final 
Genotype.Type truth, final Genotype.Type called) { + truthByCalledGenotypeCounts[truth.ordinal()][called.ordinal()]++; } - /** - * get the column keys - * @return a list of objects, in this case strings, that are the column names - */ - public Object[] getColumnKeys() { -// return new String[]{"total_true_ref","%_ref/ref","n_ref/no-call", -// "n_ref/ref","n_ref/het","n_ref/hom", -// "total_true_het","%_het/het","n_het/no-call", -// "n_het/ref","n_het/het","n_het/hom", -// "total_true_hom","%_hom/hom","n_hom/no-call", -// "n_hom/ref","n_hom/het","n_hom/hom"}; - return new String[]{"total_true_ref","pct_ref_vs_ref","n_ref_vs_no_call", - "n_ref_vs_ref","n_ref_vs_het","n_ref_vs_hom", - "total_true_het","pct_het_vs_het","n_het_vs_no_call", - "n_het_vs_ref","n_het_vs_het","n_het_vs_hom", - "total_true_hom","pct_hom_vs_hom","n_hom_vs_no_call", - "n_hom_vs_ref","n_hom_vs_het","n_hom_vs_hom"}; + private long count(final Genotype.Type truth, final Genotype.Type called) { + return truthByCalledGenotypeCounts[truth.ordinal()][called.ordinal()]; } - - public SampleStats(VariantContext vc, int nGenotypeTypes) { - this.nGenotypeTypes = nGenotypeTypes; - for (final Genotype g : vc.getGenotypes()) - concordanceStats.put(g.getSampleName(), new long[nGenotypeTypes][nGenotypeTypes]); + private long count(final EnumSet truth, final Genotype.Type called) { + return count(truth, EnumSet.of(called)); } - public SampleStats(int genotypeTypes) { - nGenotypeTypes = genotypeTypes; + private long count(final Genotype.Type truth, final EnumSet called) { + return count(EnumSet.of(truth), called); } - public Object getCell(int x, int y) { - // we have three rows of 6 right now for output (rows: ref, het, hom) - Genotype.Type type = Genotype.Type.values()[(y/6)+1]; // get the row type - // save some repeat work, get the total every time - long total = 0; - Object[] rowKeys = getRowKeys(); - for (int called = 0; called < nGenotypeTypes; called++) { - total += 
concordanceStats.get(rowKeys[x])[type.ordinal()][called]; - } - - // now get the cell they're interested in - switch (y % 6) { - case (0): // get the total_true for this type - return total; - case (1): - return total == 0 ? 0.0 : (100.0 * (double) concordanceStats.get(rowKeys[x])[type.ordinal()][type.ordinal()] / (double) total); - default: - return concordanceStats.get(rowKeys[x])[type.ordinal()][(y % 6) - 2]; + private long count(final EnumSet truth, final EnumSet called) { + long sum = 0; + for ( final Genotype.Type truth1 : truth ) { + for ( final Genotype.Type called1 : called ) { + sum += count(truth1, called1); + } } + return sum; } - public String getName() { - return "Sample Statistics"; - } -} - -/** - * a table of sample names to genotype concordance summary statistics - */ -class SampleSummaryStats implements TableType { - protected final static String ALL_SAMPLES_KEY = "allSamples"; - protected final static String[] COLUMN_KEYS = new String[]{ - "percent_comp_ref_called_ref", - "percent_comp_het_called_het", - "percent_comp_hom_called_hom", - "percent_non_reference_sensitivity", - "percent_overall_genotype_concordance", - "percent_non_reference_discrepancy_rate"}; - - // sample to concordance stats object - protected final HashMap concordanceSummary = new HashMap(); - - /** - * - * @return one row per sample - */ - public Object[] getRowKeys() { - return concordanceSummary.keySet().toArray(new String[concordanceSummary.size()]); - } - - /** - * get the column keys - * @return a list of objects, in this case strings, that are the column names - */ - public Object[] getColumnKeys() { - return COLUMN_KEYS; - } - - public SampleSummaryStats(final VariantContext vc) { - concordanceSummary.put(ALL_SAMPLES_KEY, new double[COLUMN_KEYS.length]); - for( final Genotype g : vc.getGenotypes() ) { - concordanceSummary.put(g.getSampleName(), new double[COLUMN_KEYS.length]); - } - } - - public SampleSummaryStats() { - - } - - public Object getCell(int x, int y) { - 
final Object[] rowKeys = getRowKeys(); - return String.format("%.2f",concordanceSummary.get(rowKeys[x])[y]); - } - - /** - * Helper routine that sums up all columns / rows found in stats specified by all pairs in d1 x d2 - * - * @param stats - * @param d1 - * @param d2 - * @return - */ - private long sumStatsAllPairs( final long[][] stats, EnumSet d1, EnumSet d2 ) { - long sum = 0L; + private long countDiag( final EnumSet d1 ) { + long sum = 0; for(final Genotype.Type e1 : d1 ) { - for(final Genotype.Type e2 : d2 ) { - sum += stats[e1.ordinal()][e2.ordinal()]; + sum += truthByCalledGenotypeCounts[e1.ordinal()][e1.ordinal()]; + } + + return sum; + } + + @Override + public void finalizeEvaluation() { + final EnumSet allVariantGenotypes = EnumSet.of(Genotype.Type.HOM_VAR, Genotype.Type.HET); + final EnumSet allCalledGenotypes = EnumSet.of(Genotype.Type.HOM_VAR, Genotype.Type.HET, Genotype.Type.HOM_REF); + final EnumSet allGenotypes = EnumSet.allOf(Genotype.Type.class); + + // exact values of the table + for ( final Genotype.Type truth : Genotype.Type.values() ) { + for ( final Genotype.Type called : Genotype.Type.values() ) { + final String field = String.format("n_true_%s_called_%s", truth, called); + final Long value = count(truth, called); + map.put(field, value.toString()); } } - return sum; - } - - private long sumStatsDiag( final long[][] stats, EnumSet d1) { - long sum = 0L; - - for(final Genotype.Type e1 : d1 ) { - sum += stats[e1.ordinal()][e1.ordinal()]; + // counts of called genotypes + for ( final Genotype.Type called : Genotype.Type.values() ) { + final String field = String.format("total_called_%s", called); + final Long value = count(allGenotypes, called); + map.put(field, value.toString()); } - return sum; - } + // counts of true genotypes + for ( final Genotype.Type truth : Genotype.Type.values() ) { + final String field = String.format("total_true_%s", truth); + final Long value = count(truth, allGenotypes); + map.put(field, value.toString()); + } - 
private double ratio(long numer, long denom) { - return denom != 0L ? 100.0 * ( ((double)numer) / ((double)denom) ) : 0.0; - } + for ( final Genotype.Type genotype : Genotype.Type.values() ) { + final String field = String.format("percent_%s_called_%s", genotype, genotype); + long numer = count(genotype, genotype); + long denom = count(EnumSet.of(genotype), allGenotypes); + map.put(field, Utils.formattedPercent(numer, denom)); + } - final long[] allSamplesNumerators = new long[COLUMN_KEYS.length]; - final long[] allSamplesDenominators = new long[COLUMN_KEYS.length]; - - private void updateSummaries(int i, double[] summary, long numer, long denom ) { - allSamplesNumerators[i] += numer; - allSamplesDenominators[i] += denom; - summary[i] = ratio(numer, denom); - } - - - /** - * Calculate the five summary stats per sample - * @param sampleStats The Map which holds concordance values per sample - */ - public void generateSampleSummaryStats( final SampleStats sampleStats ) { - EnumSet allVariantGenotypes = EnumSet.of(Genotype.Type.HOM_VAR, Genotype.Type.HET); - EnumSet allCalledGenotypes = EnumSet.of(Genotype.Type.HOM_VAR, Genotype.Type.HET, Genotype.Type.HOM_REF); - EnumSet allGenotypes = EnumSet.allOf(Genotype.Type.class); - - for( final String sample : concordanceSummary.keySet() ) { - if ( sample.equals(ALL_SAMPLES_KEY) ) continue; - - final long[][] stats = sampleStats.concordanceStats.get(sample); - final double[] summary = concordanceSummary.get(sample); - if( stats == null ) { throw new ReviewedStingException( "SampleStats and SampleSummaryStats contain different samples! 
sample = " + sample ); } - - long numer, denom; - - // Summary 0: % ref called as ref - numer = stats[Genotype.Type.HOM_REF.ordinal()][Genotype.Type.HOM_REF.ordinal()]; - denom = sumStatsAllPairs(stats, EnumSet.of(Genotype.Type.HOM_REF), allGenotypes); - updateSummaries(0, summary, numer, denom); - - // Summary 1: % het called as het - numer = stats[Genotype.Type.HET.ordinal()][Genotype.Type.HET.ordinal()]; - denom = sumStatsAllPairs(stats, EnumSet.of(Genotype.Type.HET), allGenotypes); - updateSummaries(1, summary, numer, denom); - - // Summary 2: % homVar called as homVar - numer = stats[Genotype.Type.HOM_VAR.ordinal()][Genotype.Type.HOM_VAR.ordinal()]; - denom = sumStatsAllPairs(stats, EnumSet.of(Genotype.Type.HOM_VAR), allGenotypes); - updateSummaries(2, summary, numer, denom); - - // Summary 3: % non-ref called as non-ref + { + // % non-ref called as non-ref // MAD: this is known as the non-reference sensitivity (# non-ref according to comp found in eval / # non-ref in comp) - numer = sumStatsAllPairs(stats, allVariantGenotypes, allVariantGenotypes); - denom = sumStatsAllPairs(stats, allVariantGenotypes, allGenotypes); - updateSummaries(3, summary, numer, denom); + final String field = "percent_non_reference_sensitivity"; + long numer = count(allVariantGenotypes, allVariantGenotypes); + long denom = count(allVariantGenotypes, allGenotypes); + map.put(field, Utils.formattedPercent(numer, denom)); + } - // Summary 4: overall genotype concordance of sites called in eval track + { + // overall genotype concordance of sites called in eval track // MAD: this is the tradition genotype concordance - numer = sumStatsDiag(stats, allCalledGenotypes); - denom = sumStatsAllPairs(stats, allCalledGenotypes, allCalledGenotypes); - updateSummaries(4, summary, numer, denom); - - // Summary 5: overall genotype concordance of sites called non-ref in eval track - long homrefConcords = stats[Genotype.Type.HOM_REF.ordinal()][Genotype.Type.HOM_REF.ordinal()]; - long diag = 
sumStatsDiag(stats, allVariantGenotypes); - long allNoHomRef = sumStatsAllPairs(stats, allCalledGenotypes, allCalledGenotypes) - homrefConcords; - numer = allNoHomRef - diag; - denom = allNoHomRef; - updateSummaries(5, summary, numer, denom); + final String field = "percent_overall_genotype_concordance"; + long numer = countDiag(allCalledGenotypes); + long denom = count(allCalledGenotypes, allCalledGenotypes); + map.put(field, Utils.formattedPercent(numer, denom)); } - // update the final summary stats - final double[] allSamplesSummary = concordanceSummary.get(ALL_SAMPLES_KEY); - for ( int i = 0; i < allSamplesSummary.length; i++) { - allSamplesSummary[i] = ratio(allSamplesNumerators[i], allSamplesDenominators[i]); + { + // overall genotype concordance of sites called non-ref in eval track + // MAD: this is the non-reference discrepancy rate + final String field = "percent_non_reference_discrepancy_rate"; + long homrefConcords = count(Genotype.Type.HOM_REF, Genotype.Type.HOM_REF); + long allNoHomRef = count(allCalledGenotypes, allCalledGenotypes) - homrefConcords; + long numer = allNoHomRef - countDiag(allVariantGenotypes); + long denom = count(allCalledGenotypes, allCalledGenotypes) - homrefConcords; + map.put(field, Utils.formattedPercent(numer, denom)); } - - } - - public String getName() { - return "Sample Summary Statistics"; } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/GenotypePhasingEvaluator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/GenotypePhasingEvaluator.java deleted file mode 100755 index f4369401b..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/GenotypePhasingEvaluator.java +++ /dev/null @@ -1,433 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.varianteval.evaluators; - -import org.apache.log4j.Logger; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import 
org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.phasing.AllelePair; -import org.broadinstitute.sting.gatk.walkers.phasing.ReadBackedPhasingWalker; -import org.broadinstitute.sting.gatk.walkers.varianteval.VariantEvalWalker; -import org.broadinstitute.sting.gatk.walkers.varianteval.util.Analysis; -import org.broadinstitute.sting.gatk.walkers.varianteval.util.DataPoint; -import org.broadinstitute.sting.gatk.walkers.varianteval.util.NewEvaluationContext; -import org.broadinstitute.sting.gatk.walkers.varianteval.util.TableType; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.GenotypesContext; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; - -import java.util.HashMap; -import java.util.HashSet; -import java.util.Map; -import java.util.Set; - -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -@Analysis(name = "Genotype Phasing Evaluation", description = "Evaluates the phasing of genotypes in different tracks") -public class GenotypePhasingEvaluator extends VariantEvaluator { - protected final static Logger logger = Logger.getLogger(GenotypePhasingEvaluator.class); - - // a mapping from sample to stats - @DataPoint(description = "the phasing statistics for each sample") - SamplePhasingStatistics samplePhasingStatistics = null; - - SamplePreviousGenotypes samplePrevGenotypes = null; - - double minPhaseQuality = 10.0; - - public void initialize(VariantEvalWalker walker) { - this.samplePhasingStatistics = new SamplePhasingStatistics(walker.getMinPhaseQuality()); - this.samplePrevGenotypes = new SamplePreviousGenotypes(); - } - - public String getName() { - return "GenotypePhasingEvaluator"; - } - - public int getComparisonOrder() { - return 2; // we only need to see pairs of (comp, eval) - } - - public boolean enabled() { - return true; - } - - public String toString() { - return getName() + ":
"; - } - - public String update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - return update2(eval,comp,tracker,ref,context,null); - } - - public String update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context, NewEvaluationContext group) { - //public String update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context, VariantEvalWalker.EvaluationContext group) { - Reasons interesting = new Reasons(); - if (ref == null) - return interesting.toString(); - GenomeLoc curLocus = ref.getLocus(); - - logger.debug("update2() locus: " + curLocus); - logger.debug("comp = " + comp + " eval = " + eval); - - Set allSamples = new HashSet(); - - GenotypesContext compSampGenotypes = null; - if (isRelevantToPhasing(comp)) { - allSamples.addAll(comp.getSampleNames()); - compSampGenotypes = comp.getGenotypes(); - } - - GenotypesContext evalSampGenotypes = null; - if (isRelevantToPhasing(eval)) { - allSamples.addAll(eval.getSampleNames()); - evalSampGenotypes = eval.getGenotypes(); - } - - for (String samp : allSamples) { - logger.debug("sample = " + samp); - - Genotype compSampGt = null; - if (compSampGenotypes != null) - compSampGt = compSampGenotypes.get(samp); - - Genotype evalSampGt = null; - if (evalSampGenotypes != null) - evalSampGt = evalSampGenotypes.get(samp); - - if (compSampGt == null || evalSampGt == null || compSampGt.isNoCall() || evalSampGt.isNoCall()) { // Since either comp or eval (or both) are missing the site, the best we can do is hope to preserve phase [if the non-missing one preserves phase] - // Having an unphased site breaks the phasing for the sample [does NOT permit "transitive phasing"] - hence, must reset phasing knowledge for both comp and eval [put a null CompEvalGenotypes]: - if (isNonNullButUnphased(compSampGt) || 
isNonNullButUnphased(evalSampGt)) - samplePrevGenotypes.put(samp, null); - } - else { // Both comp and eval have a non-null Genotype at this site: - AllelePair compAllelePair = new AllelePair(compSampGt); - AllelePair evalAllelePair = new AllelePair(evalSampGt); - - boolean breakPhasing = false; - if (compSampGt.isHet() != evalSampGt.isHet() || compSampGt.isHom() != evalSampGt.isHom()) - breakPhasing = true; // since they are not both het or both hom - else { // both are het, or both are hom: - boolean topMatchesTopAndBottomMatchesBottom = (topMatchesTop(compAllelePair, evalAllelePair) && bottomMatchesBottom(compAllelePair, evalAllelePair)); - boolean topMatchesBottomAndBottomMatchesTop = (topMatchesBottom(compAllelePair, evalAllelePair) && bottomMatchesTop(compAllelePair, evalAllelePair)); - if (!topMatchesTopAndBottomMatchesBottom && !topMatchesBottomAndBottomMatchesTop) - breakPhasing = true; // since the 2 VCFs have different diploid genotypes for this sample - } - - if (breakPhasing) { - samplePrevGenotypes.put(samp, null); // nothing to do for this site, AND must remove any history for the future - } - else if (compSampGt.isHet() && evalSampGt.isHet()) { - /* comp and eval have the HET same Genotype at this site: - [Note that if both are hom, then nothing is done here, but the het history IS preserved]. 
- */ - CompEvalGenotypes prevCompAndEval = samplePrevGenotypes.get(samp); - if (prevCompAndEval != null && !prevCompAndEval.getLocus().onSameContig(curLocus)) // exclude curLocus if it is "phased" relative to a different chromosome - prevCompAndEval = null; - - // Replace the previous hets with the current hets: - samplePrevGenotypes.put(samp, curLocus, compSampGt, evalSampGt); - - if (prevCompAndEval != null) { - GenomeLoc prevLocus = prevCompAndEval.getLocus(); - logger.debug("Potentially phaseable het locus: " + curLocus + " [relative to previous het locus: " + prevLocus + "]"); - PhaseStats ps = samplePhasingStatistics.ensureSampleStats(samp); - - boolean compSampIsPhased = genotypesArePhasedAboveThreshold(compSampGt); - boolean evalSampIsPhased = genotypesArePhasedAboveThreshold(evalSampGt); - if (compSampIsPhased || evalSampIsPhased) { - if (!evalSampIsPhased) { - ps.onlyCompPhased++; - //interesting.addReason("ONLY_COMP", samp, group, prevLocus, ""); - } - else if (!compSampIsPhased) { - ps.onlyEvalPhased++; - //interesting.addReason("ONLY_EVAL", samp, group, prevLocus, ""); - } - else { // both comp and eval are phased: - AllelePair prevCompAllelePair = new AllelePair(prevCompAndEval.getCompGenotpye()); - AllelePair prevEvalAllelePair = new AllelePair(prevCompAndEval.getEvalGenotype()); - - // Sufficient to check only the top of comp, since we ensured that comp and eval have the same diploid genotypes for this sample: - boolean topsMatch = (topMatchesTop(prevCompAllelePair, prevEvalAllelePair) && topMatchesTop(compAllelePair, evalAllelePair)); - boolean topMatchesBottom = (topMatchesBottom(prevCompAllelePair, prevEvalAllelePair) && topMatchesBottom(compAllelePair, evalAllelePair)); - - if (topsMatch || topMatchesBottom) { - ps.phasesAgree++; - - Double compPQ = getPQ(compSampGt); - Double evalPQ = getPQ(evalSampGt); - if (compPQ != null && evalPQ != null && MathUtils.compareDoubles(compPQ, evalPQ) != 0) { - //interesting.addReason("PQ_CHANGE", samp, group, 
prevLocus, compPQ + " -> " + evalPQ); - } - } - else { - ps.phasesDisagree++; - logger.debug("SWITCHED locus: " + curLocus); - //interesting.addReason("SWITCH", samp, group, prevLocus, toString(prevCompAllelePair, compAllelePair) + " -> " + toString(prevEvalAllelePair, evalAllelePair)); - } - } - } - else { - ps.neitherPhased++; - } - } - } - } - } - logger.debug("\n" + samplePhasingStatistics + "\n"); - - return interesting.toString(); - } - - public static boolean isRelevantToPhasing(VariantContext vc) { - return (vc != null && !vc.isFiltered()); - } - - public boolean isNonNullButUnphased(Genotype gt) { - return (gt != null && !gt.isNoCall() && !genotypesArePhasedAboveThreshold(gt)); - } - - public boolean genotypesArePhasedAboveThreshold(Genotype gt) { - if (gt.isHom()) // Can always consider a hom site to be phased to its predecessor, since its successor will only be phased to it if it's hom or "truly" phased - return true; - - if (!gt.isPhased()) - return false; - - Double pq = getPQ(gt); - return (pq == null || pq >= minPhaseQuality); - } - - public static Double getPQ(Genotype gt) { - Double d = gt.getAttributeAsDouble(ReadBackedPhasingWalker.PQ_KEY, -1); - return d == -1 ? 
null : d; - } - - public static boolean topMatchesTop(AllelePair b1, AllelePair b2) { - return b1.getTopAllele().equals(b2.getTopAllele()); - } - - public static boolean topMatchesBottom(AllelePair b1, AllelePair b2) { - return b1.getTopAllele().equals(b2.getBottomAllele()); - } - - public static boolean bottomMatchesTop(AllelePair b1, AllelePair b2) { - return topMatchesBottom(b2, b1); - } - - public static boolean bottomMatchesBottom(AllelePair b1, AllelePair b2) { - return b1.getBottomAllele().equals(b2.getBottomAllele()); - } - - public String toString(AllelePair prev, AllelePair cur) { - return prev.getTopAllele().getBaseString() + "+" + cur.getTopAllele().getBaseString() + "|" + prev.getBottomAllele().getBaseString() + "+" + cur.getBottomAllele().getBaseString(); - } - - public void finalizeEvaluation() { - } - - private static class Reasons { - private StringBuilder sb; - - public Reasons() { - sb = new StringBuilder(); - } - -// public void addReason(String category, String sample, VariantEvalWalker.EvaluationContext evalGroup, GenomeLoc prevLoc, String reason) { -// sb.append(category + "(" + sample + ", previous: " + prevLoc + " [" + evalGroup.compTrackName + ", " + evalGroup.evalTrackName + "]): " + reason + ";"); -// } - - public String toString() { - if (sb.length() == 0) - return null; - - return "reasons=" + sb.toString(); - } - } -} - - - -class CompEvalGenotypes { - private GenomeLoc loc; - private Genotype compGt; - private Genotype evalGt; - - public CompEvalGenotypes(GenomeLoc loc, Genotype compGt, Genotype evalGt) { - this.loc = loc; - this.compGt = compGt; - this.evalGt = evalGt; - } - - public GenomeLoc getLocus() { - return loc; - } - - public Genotype getCompGenotpye() { - return compGt; - } - public Genotype getEvalGenotype() { - return evalGt; - } - - public void setCompGenotype(Genotype compGt) { - this.compGt = compGt; - } - - public void setEvalGenotype(Genotype evalGt) { - this.evalGt = evalGt; - } -} - -class SamplePreviousGenotypes 
{ - private HashMap sampleGenotypes = null; - - public SamplePreviousGenotypes() { - this.sampleGenotypes = new HashMap(); - } - - public CompEvalGenotypes get(String sample) { - return sampleGenotypes.get(sample); - } - - public void put(String sample, CompEvalGenotypes compEvalGts) { - sampleGenotypes.put(sample, compEvalGts); - } - - public void put(String sample, GenomeLoc locus, Genotype compGt, Genotype evalGt) { - sampleGenotypes.put(sample, new CompEvalGenotypes(locus, compGt, evalGt)); - } -} - -class PhaseStats { - public int neitherPhased; - public int onlyCompPhased; - public int onlyEvalPhased; - public int phasesAgree; - public int phasesDisagree; - - public PhaseStats() { - this.neitherPhased = 0; - this.onlyCompPhased = 0; - this.onlyEvalPhased = 0; - this.phasesAgree = 0; - this.phasesDisagree = 0; - } - - public String toString() { - StringBuilder sb = new StringBuilder(); - sb.append("Neither phased: " + neitherPhased + "\tOnly Comp: " + onlyCompPhased + "\tOnly Eval: " + onlyEvalPhased + "\tSame phase: " + phasesAgree + "\tOpposite phase: " + phasesDisagree); - return sb.toString(); - } - - public static String[] getFieldNamesArray() { - return new String[]{"total", "neither", "only_comp", "only_eval", "both", "match", "switch", "switch_rate"}; - } - - public Object getField(int index) { - switch (index) { - case (0): - return (neitherPhased + onlyCompPhased + onlyEvalPhased + phasesAgree + phasesDisagree); - case (1): - return neitherPhased; - case (2): - return onlyCompPhased; - case (3): - return onlyEvalPhased; - case (4): - return (phasesAgree + phasesDisagree); - case (5): - return phasesAgree; - case (6): - return phasesDisagree; - case (7): - return ((phasesDisagree == 0) ? 
0 : ((double) phasesDisagree) / (phasesAgree + phasesDisagree)); - default: - return -1; - } - } -} - -/** - * a table of sample names to genotype phasing statistics - */ -class SamplePhasingStatistics implements TableType { - private HashMap sampleStats = null; - private double minPhaseQuality; - - public SamplePhasingStatistics(double minPhaseQuality) { - this.sampleStats = new HashMap(); - this.minPhaseQuality = minPhaseQuality; - } - - public PhaseStats ensureSampleStats(String samp) { - PhaseStats ps = sampleStats.get(samp); - if (ps == null) { - ps = new PhaseStats(); - sampleStats.put(samp, ps); - } - return ps; - } - - /** - * @return one row per sample - */ - public String[] getRowKeys() { - return sampleStats.keySet().toArray(new String[sampleStats.size()]); - } - - /** - * get the column keys - * - * @return a list of objects, in this case strings, that are the column names - */ - public String[] getColumnKeys() { - return PhaseStats.getFieldNamesArray(); - } - - public Object getCell(int x, int y) { - String[] rowKeys = getRowKeys(); - PhaseStats ps = sampleStats.get(rowKeys[x]); - return ps.getField(y); - } - - public String getName() { - return "Sample Phasing Statistics (for PQ >= " + minPhaseQuality + ")"; - } - - public String toString() { - StringBuilder sb = new StringBuilder(); - for (Map.Entry sampPhaseStatsEnt : sampleStats.entrySet()) { - String sample = sampPhaseStatsEnt.getKey(); - PhaseStats ps = sampPhaseStatsEnt.getValue(); - - sb.append(sample + "\t" + ps); - } - return sb.toString(); - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelLengthHistogram.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelLengthHistogram.java old mode 100755 new mode 100644 index 6cf8b7c2c..0b17c7adb --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelLengthHistogram.java +++ 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelLengthHistogram.java @@ -1,111 +1,122 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + package org.broadinstitute.sting.gatk.walkers.varianteval.evaluators; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.varianteval.util.Analysis; -import org.broadinstitute.sting.gatk.walkers.varianteval.util.DataPoint; -import org.broadinstitute.sting.gatk.walkers.varianteval.util.TableType; +import org.broadinstitute.sting.gatk.walkers.varianteval.util.Molten; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import java.util.*; + /** - * IF THERE IS NO JAVADOC RIGHT HERE, YELL AT chartl + * Simple utility for histogramming indel lengths * - * @Author chartl - * @Date May 26, 2010 + * Based on code from chartl + * + * @author Mark DePristo + * @since 3/21/12 */ -@Analysis(name = "Indel length histograms", description = "Shows the distribution of insertion/deletion event lengths (negative for deletion, positive for insertion)") -public class IndelLengthHistogram extends VariantEvaluator { - private static final int SIZE_LIMIT = 100; - @DataPoint(description="Histogram of indel lengths") - IndelHistogram indelHistogram = new IndelHistogram(SIZE_LIMIT); +@Analysis(description = "Indel length histogram", molten = true) +public class IndelLengthHistogram extends VariantEvaluator implements StandardEval { + private final Map counts = new HashMap(); + private final static boolean asFrequencies = true; + int nIndels = 0; - /* - * Indel length histogram table object - */ + @Molten(variableName = "Length", valueName = "Freq", variableFormat = "%d", valueFormat = "%.2f") + public TreeMap results; + + public final static int MAX_SIZE_FOR_HISTOGRAM = 10; + private final static boolean INCLUDE_LONG_EVENTS_AT_MAX_SIZE = false; - 
static class IndelHistogram implements TableType { - private Integer[] colKeys; - private int limit; - private String[] rowKeys = {"EventLength"}; - private Integer[] indelHistogram; + public IndelLengthHistogram() { + initializeCounts(MAX_SIZE_FOR_HISTOGRAM); + } - public IndelHistogram(int limit) { - colKeys = initColKeys(limit); - indelHistogram = initHistogram(limit); - this.limit = limit; - } - - public Object[] getColumnKeys() { - return colKeys; - } - - public Object[] getRowKeys() { - return rowKeys; - } - - public Object getCell(int row, int col) { - return indelHistogram[col]; - } - - private Integer[] initColKeys(int size) { - Integer[] cK = new Integer[size*2+1]; - int index = 0; - for ( int i = -size; i <= size; i ++ ) { - cK[index] = i; - index++; - } - - return cK; - } - - private Integer[] initHistogram(int size) { - Integer[] hist = new Integer[size*2+1]; - for ( int i = 0; i < 2*size+1; i ++ ) { - hist[i] = 0; - } - - return hist; - } - - public String getName() { return "indelHistTable"; } - - public void update(int eLength) { - indelHistogram[len2index(eLength)]++; - } - - private int len2index(int len) { - if ( len > limit || len < -limit ) { - throw new ReviewedStingException("Indel length exceeds limit of "+limit+" please increase indel limit size"); - } - return len + limit; + private void initializeCounts(int size) { + for ( int i = -size; i <= size; i++ ) { + if ( i != 0 ) counts.put(i, 0); } } - public boolean enabled() { return true; } - - public String getName() { return "IndelLengthHistogram"; } - - public int getComparisonOrder() { return 1; } // need only the evals - - public String update1(VariantContext vc1, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - - if ( vc1.isIndel() && vc1.isPolymorphicInSamples() ) { - - if ( ! 
vc1.isBiallelic() ) { - //veWalker.getLogger().warn("[IndelLengthHistogram] Non-biallelic indel at "+ref.getLocus()+" ignored."); - return vc1.toString(); // biallelic sites are output + @Override + public void finalizeEvaluation() { + if ( asFrequencies ) { + results = new TreeMap(); + for ( final int len : counts.keySet() ) { + final double value = nIndels == 0 ? 0.0 : counts.get(len) / (1.0 * nIndels); + results.put(len, value); } + } else { + results = new TreeMap(results); + } + } - // only count simple insertions/deletions, not complex indels - if ( vc1.isSimpleInsertion() ) { - indelHistogram.update(vc1.getAlternateAllele(0).length()); - } else if ( vc1.isSimpleDeletion() ) { - indelHistogram.update(-vc1.getReference().length()); + @Override + public int getComparisonOrder() { + return 1; + } + + @Override + public void update1(final VariantContext eval, final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) { + if ( eval.isIndel() && ! eval.isComplexIndel() ) { + if ( ! ( getWalker().ignoreAC0Sites() && eval.isMonomorphicInSamples() )) { + // only if we are actually polymorphic in the subsetted samples should we count the allele + for ( Allele alt : eval.getAlternateAlleles() ) { + final int alleleSize = alt.length() - eval.getReference().length(); + if ( alleleSize == 0 ) throw new ReviewedStingException("Allele size not expected to be zero for indel: alt = " + alt + " ref = " + eval.getReference()); + updateLengthHistogram(eval.getReference(), alt); + } } } + } - return null; + /** + * Update the histogram with the implied length of the indel allele between ref and alt (alt.len - ref.len). + * + * If this size is outside of MAX_SIZE_FOR_HISTOGRAM, the size is capped to MAX_SIZE_FOR_HISTOGRAM, + * if INCLUDE_LONG_EVENTS_AT_MAX_SIZE is set. 
+ * + * @param ref + * @param alt + */ + public void updateLengthHistogram(final Allele ref, final Allele alt) { + int len = alt.length() - ref.length(); + if ( INCLUDE_LONG_EVENTS_AT_MAX_SIZE ) { + if ( len > MAX_SIZE_FOR_HISTOGRAM ) len = MAX_SIZE_FOR_HISTOGRAM; + if ( len < -MAX_SIZE_FOR_HISTOGRAM ) len = -MAX_SIZE_FOR_HISTOGRAM; + } + + if ( Math.abs(len) > MAX_SIZE_FOR_HISTOGRAM ) + return; + + nIndels++; + counts.put(len, counts.get(len) + 1); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelStatistics.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelStatistics.java deleted file mode 100755 index 87b453ae3..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelStatistics.java +++ /dev/null @@ -1,295 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.varianteval.evaluators; - -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.varianteval.VariantEvalWalker; -import org.broadinstitute.sting.gatk.walkers.varianteval.util.Analysis; -import org.broadinstitute.sting.gatk.walkers.varianteval.util.DataPoint; -import org.broadinstitute.sting.gatk.walkers.varianteval.util.TableType; -import org.broadinstitute.sting.utils.IndelUtils; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; - -import java.util.ArrayList; - -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit 
persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -@Analysis(name = "IndelStatistics", description = "Shows various indel metrics and statistics") -public class IndelStatistics extends VariantEvaluator { - @DataPoint(description = "Indel Statistics") - IndelStats indelStats = null; - - // @DataPoint(description = "Indel Classification") - IndelClasses indelClasses = null; - - int numSamples = 0; - - public void initialize(VariantEvalWalker walker) { - numSamples = walker.getNumSamples(); - } - - private static final int INDEL_SIZE_LIMIT = 100; - private static final int IND_HET = 0; - private static final int IND_INS = 1; - private static final int IND_DEL = 2; - private static final int IND_COMPLEX = 3; - private static final int IND_HET_INS = 4; - private static final int IND_HOM_INS = 5; - private static final int IND_HET_DEL = 6; - private static final int IND_HOM_DEL = 7; - private static final int IND_HOM_REF = 8; - private static final int IND_MIXED = 9; - private static final int IND_LONG = 10; - private static final int IND_AT_EXP = 11; - private static final int IND_CG_EXP = 12; - private static final int IND_FRAMESHIFT = 13; - private static final int NUM_SCALAR_COLUMNS = 14; - - static int len2Index(int ind) { - return ind+INDEL_SIZE_LIMIT+NUM_SCALAR_COLUMNS; - } - - static 
int index2len(int ind) { - return ind-INDEL_SIZE_LIMIT-NUM_SCALAR_COLUMNS; - } - - static class IndelStats implements TableType { - protected final static String[] COLUMN_KEYS; - - static { - COLUMN_KEYS= new String[NUM_SCALAR_COLUMNS+2*INDEL_SIZE_LIMIT+1]; - COLUMN_KEYS[0] = "heterozygosity"; - COLUMN_KEYS[1] = "insertions"; - COLUMN_KEYS[2] = "deletions"; - COLUMN_KEYS[3] = "complex"; - COLUMN_KEYS[4] = "het_insertions"; - COLUMN_KEYS[5] = "homozygous_insertions"; - COLUMN_KEYS[6] = "het_deletions"; - COLUMN_KEYS[7] = "homozygous_deletions"; - COLUMN_KEYS[8] = "homozygous_reference_sites"; - COLUMN_KEYS[9] = "complex_events"; - COLUMN_KEYS[10] = "long_indels"; - COLUMN_KEYS[11] = "AT_expansions"; - COLUMN_KEYS[12] = "CG_expansions"; - COLUMN_KEYS[13] = "frameshift_indels"; - - for (int k=NUM_SCALAR_COLUMNS; k < NUM_SCALAR_COLUMNS+ 2*INDEL_SIZE_LIMIT+1; k++) - COLUMN_KEYS[k] = "indel_size_len"+Integer.valueOf(index2len(k)); - } - - // map of sample to statistics - protected final int[] indelSummary; - - public IndelStats(final VariantContext vc) { - indelSummary = new int[COLUMN_KEYS.length]; - } - - /** - * - * @return one row per sample - */ - public Object[] getRowKeys() { - return new String[]{"all"}; - } - public Object getCell(int x, int y) { - return String.format("%d",indelSummary[y]); - } - - /** - * get the column keys - * @return a list of objects, in this case strings, that are the column names - */ - public Object[] getColumnKeys() { - return COLUMN_KEYS; - } - - public String getName() { - return "IndelStats"; - } - - public int getComparisonOrder() { - return 1; // we only need to see each eval track - } - - public String toString() { - return getName(); - } - - /* - * increment the specified value - */ - public void incrValue(VariantContext vc, ReferenceContext ref) { - int eventLength = 0; - boolean isInsertion = false, isDeletion = false; - - if ( vc.isSimpleInsertion() ) { - eventLength = vc.getAlternateAllele(0).length(); - 
indelSummary[IND_INS]++; - isInsertion = true; - } else if ( vc.isSimpleDeletion() ) { - indelSummary[IND_DEL]++; - eventLength = -vc.getReference().length(); - isDeletion = true; - } - else if (vc.isComplexIndel()) { - indelSummary[IND_COMPLEX]++; - } - else if (vc.isMixed()) - indelSummary[IND_MIXED]++; - - if (IndelUtils.isATExpansion(vc,ref)) - indelSummary[IND_AT_EXP]++; - if (IndelUtils.isCGExpansion(vc,ref)) - indelSummary[IND_CG_EXP]++; - - // make sure event doesn't overstep array boundaries - if (vc.isSimpleDeletion() || vc.isSimpleInsertion()) { - if (Math.abs(eventLength) < INDEL_SIZE_LIMIT) { - indelSummary[len2Index(eventLength)]++; - if (eventLength % 3 != 0) - indelSummary[IND_FRAMESHIFT]++; - } - else - indelSummary[IND_LONG]++; - } - - } - } - - static class IndelClasses implements TableType { - protected final static String[] columnNames = IndelUtils.getIndelClassificationNames(); - - - // map of sample to statistics - protected final int[] indelClassSummary; - - public IndelClasses(final VariantContext vc) { - indelClassSummary = new int[columnNames.length]; - } - - /** - * - * @return one row per sample - */ - public Object[] getRowKeys() { - return new String[]{"all"}; - } - public Object getCell(int x, int y) { - return String.format("%d",indelClassSummary[y]); - } - - /** - * get the column keys - * @return a list of objects, in this case strings, that are the column names - */ - public Object[] getColumnKeys() { - return columnNames; - } - - public String getName() { - return "IndelClasses"; - } - - public int getComparisonOrder() { - return 1; // we only need to see each eval track - } - - public String toString() { - return getName(); - } - - private void incrementSampleStat(VariantContext vc, int index) { - indelClassSummary[index]++; - } - /* - * increment the specified value - */ - public void incrValue(VariantContext vc, ReferenceContext ref) { - - - ArrayList indices = IndelUtils.findEventClassificationIndex(vc,ref); - 
//System.out.format("pos:%d \nREF: %s, ALT: %s\n",vc.getStart(), vc.getReference().getDisplayString(), - // vc.getAlternateAllele(0).getDisplayString()); - - byte[] refBases = ref.getBases(); - //System.out.format("ref bef:%s\n",new String(Arrays.copyOfRange(refBases,0,refBases.length/2+1) )); - //System.out.format("ref aft:%s\n",new String(Arrays.copyOfRange(refBases,refBases.length/2+1,refBases.length) )); - for (int index: indices) { - incrementSampleStat(vc, index); - // System.out.println(IndelUtils.getIndelClassificationName(index)); - } - } - - } - - //public IndelStatistics(VariantEvalWalker parent) { - //super(parent); - // don't do anything - //} - - public String getName() { - return "IndelStatistics"; - } - - public int getComparisonOrder() { - return 1; // we only need to see each eval track - } - - public boolean enabled() { - return true; - } - - public String toString() { - return getName(); - } - - public String update1(VariantContext eval, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - - if (eval != null && eval.isPolymorphicInSamples()) { - if ( indelStats == null ) { - indelStats = new IndelStats(eval); - } - if ( indelClasses == null ) { - indelClasses = new IndelClasses(eval); - } - - if ( eval.isIndel() || eval.isMixed() ) { - if (indelStats != null ) - indelStats.incrValue(eval, ref); - - if (indelClasses != null) - indelClasses.incrValue(eval, ref); - } - } - - return null; // This module doesn't capture any interesting sites, so return null - } - - public void finalizeEvaluation() { - int k=0; - } - -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelSummary.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelSummary.java new file mode 100644 index 000000000..dda7e8611 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelSummary.java @@ -0,0 +1,258 @@ +/* + * Copyright (c) 
2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.gatk.walkers.varianteval.evaluators; + +import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.varianteval.util.Analysis; +import org.broadinstitute.sting.gatk.walkers.varianteval.util.DataPoint; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.variantcontext.Allele; +import org.broadinstitute.sting.utils.variantcontext.Genotype; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; + +@Analysis(description = "Evaluation summary for indels") +public class IndelSummary extends VariantEvaluator implements StandardEval { + final protected static Logger logger = Logger.getLogger(IndelSummary.class); + + // + // counts of snps and indels + // + @DataPoint(description = "Number of SNPs", format = "%d") + public int n_SNPs = 0; + + @DataPoint(description = "Number of singleton SNPs", format = "%d") + public int n_singleton_SNPs = 0; + + @DataPoint(description = "Number of indels", format = "%d") + public int n_indels = 0; + + @DataPoint(description = "Number of singleton indels", format = "%d") + public int n_singleton_indels = 0; + + // + // gold standard + // + @DataPoint(description = "Number of Indels overlapping gold standard sites", format = "%d") + public int n_indels_matching_gold_standard = 0; + + @DataPoint(description = "Percent of indels overlapping gold standard sites") + public String gold_standard_matching_rate; + + // + // multi-allelics + // + // Number of Indels Sites (counts one for any number of alleles at site) + public int nIndelSites = 0; + + @DataPoint(description = "Number of sites with where the number of alleles is greater than 2") + public int 
n_multiallelic_indel_sites = 0; + + @DataPoint(description = "Percent of indel sites that are multi-allelic") + public String percent_of_sites_with_more_than_2_alleles; + + // + // snp : indel ratios + // + @DataPoint(description = "SNP to indel ratio") + public String SNP_to_indel_ratio; + + @DataPoint(description = "Singleton SNP to indel ratio") + public String SNP_to_indel_ratio_for_singletons; + + // + // novelty + // + @DataPoint(description = "Number of novel indels", format = "%d") + public int n_novel_indels = 0; + + @DataPoint(description = "Indel novelty rate") + public String indel_novelty_rate; + + // + // insertions to deletions + // + @DataPoint(description = "Number of insertion indels") + public int n_insertions = 0; + + @DataPoint(description = "Number of deletion indels") + public int n_deletions = 0; + + @DataPoint(description = "Insertion to deletion ratio") + public String insertion_to_deletion_ratio; + + @DataPoint(description = "Number of large (>10 bp) deletions") + public int n_large_deletions = 0; + + @DataPoint(description = "Number of large (>10 bp) insertions") + public int n_large_insertions = 0; + + @DataPoint(description = "Ratio of large (>10 bp) insertions to deletions") + public String insertion_to_deletion_ratio_for_large_indels; + + // + // Frameshifts + // + @DataPoint(description = "Number of indels in protein-coding regions labeled as frameshift") + public int n_coding_indels_frameshifting = 0; + + @DataPoint(description = "Number of indels in protein-coding regions not labeled as frameshift") + public int n_coding_indels_in_frame = 0; + + @DataPoint(description = "Frameshift percent") + public String frameshift_rate_for_coding_indels; + + // + // Het : hom ratios + // + @DataPoint(description = "Het to hom ratio for SNPs") + public String SNP_het_to_hom_ratio; + + @DataPoint(description = "Het to hom ratio for indels") + public String indel_het_to_hom_ratio; + + int nSNPHets = 0, nSNPHoms = 0, nIndelHets = 0, nIndelHoms = 
0; + + int[] insertionCountByLength = new int[]{0, 0, 0, 0}; // note that the first element isn't used + int[] deletionCountByLength = new int[]{0, 0, 0, 0}; // note that the first element isn't used + + // - Since 1 & 2 bp insertions and 1 & 2 bp deletions are equally likely to cause a + // downstream frameshift, if we make the simplifying assumptions that 3 bp ins + // and 3bp del (adding/subtracting 1 AA in general) are roughly comparably + // selected against, we should see a consistent 1+2 : 3 bp ratio for insertions + // as for deletions, and certainly would expect consistency between in/dels that + // multiple methods find and in/dels that are unique to one method (since deletions + // are more common and the artifacts differ, it is probably worth looking at the totals, + // overlaps and ratios for insertions and deletions separately in the methods + // comparison and in this case don't even need to make the simplifying in = del functional assumption + + @DataPoint(description = "ratio of 1 and 2 bp insertions to 3 bp insertions") + public String ratio_of_1_and_2_to_3_bp_insertions; + + @DataPoint(description = "ratio of 1 and 2 bp deletions to 3 bp deletions") + public String ratio_of_1_and_2_to_3_bp_deletions; + + public final static int LARGE_INDEL_SIZE_THRESHOLD = 10; + + @Override public int getComparisonOrder() { return 2; } + + public void update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + if ( eval == null || (getWalker().ignoreAC0Sites() && eval.isMonomorphicInSamples()) ) + return; + + // update counts + switch ( eval.getType() ) { + case SNP: + n_SNPs += eval.getNAlleles() - 1; // -1 for ref + if ( variantWasSingleton(eval) ) n_singleton_SNPs++; + + // collect information about het / hom ratio + for ( final Genotype g : eval.getGenotypes() ) { + if ( g.isHet() ) nSNPHets++; + if ( g.isHomVar() ) nSNPHoms++; + } + break; + case INDEL: + final VariantContext gold = 
getWalker().goldStandard == null ? null : tracker.getFirstValue(getWalker().goldStandard); + + nIndelSites++; + if ( ! eval.isBiallelic() ) n_multiallelic_indel_sites++; + + // collect information about het / hom ratio + for ( final Genotype g : eval.getGenotypes() ) { + if ( g.isHet() ) nIndelHets++; + if ( g.isHomVar() ) nIndelHoms++; + } + + for ( Allele alt : eval.getAlternateAlleles() ) { + n_indels++; // +1 for each alt allele + if ( variantWasSingleton(eval) ) n_singleton_indels++; + if ( comp == null ) n_novel_indels++; // TODO -- make this test allele specific? + if ( gold != null ) n_indels_matching_gold_standard++; + + // ins : del ratios + final int alleleSize = alt.length() - eval.getReference().length(); + if ( alleleSize == 0 ) throw new ReviewedStingException("Allele size not expected to be zero for indel: alt = " + alt + " ref = " + eval.getReference()); + if ( alleleSize > 0 ) n_insertions++; + if ( alleleSize < 0 ) n_deletions++; + + // requires snpEFF annotations + if ( eval.getAttributeAsString("SNPEFF_GENE_BIOTYPE", "missing").equals("protein_coding") ) { + final String effect = eval.getAttributeAsString("SNPEFF_EFFECT", "missing"); + if ( effect.equals("missing") ) + throw new ReviewedStingException("Saw SNPEFF_GENE_BIOTYPE but unexpected no SNPEFF_EFFECT at " + eval); + if ( effect.equals("FRAME_SHIFT") ) + n_coding_indels_frameshifting++; + else if ( effect.startsWith("CODON") ) + n_coding_indels_in_frame++; + else + ; // lots of protein coding effects that shouldn't be counted, such as INTRON + } + + if ( alleleSize > LARGE_INDEL_SIZE_THRESHOLD ) + n_large_insertions++; + else if ( alleleSize < -LARGE_INDEL_SIZE_THRESHOLD ) + n_large_deletions++; + + // update the baby histogram + final int[] countByLength = alleleSize < 0 ? 
deletionCountByLength : insertionCountByLength; + final int absSize = Math.abs(alleleSize); + if ( absSize < countByLength.length ) countByLength[absSize]++; + + } + + break; + default: + // TODO - MIXED, SYMBOLIC, and MNP records are skipped over + //throw new UserException.BadInput("Unexpected variant context type: " + eval); + break; + } + + return; + } + + public void finalizeEvaluation() { + percent_of_sites_with_more_than_2_alleles = Utils.formattedPercent(n_multiallelic_indel_sites, nIndelSites); + SNP_to_indel_ratio = Utils.formattedRatio(n_SNPs, n_indels); + SNP_to_indel_ratio_for_singletons = Utils.formattedRatio(n_singleton_SNPs, n_singleton_indels); + + gold_standard_matching_rate = Utils.formattedPercent(n_indels_matching_gold_standard, n_indels); + indel_novelty_rate = Utils.formattedNoveltyRate(n_indels - n_novel_indels, n_indels); + frameshift_rate_for_coding_indels = Utils.formattedPercent(n_coding_indels_frameshifting, n_coding_indels_in_frame + n_coding_indels_frameshifting); + + ratio_of_1_and_2_to_3_bp_deletions = Utils.formattedRatio(deletionCountByLength[1] + deletionCountByLength[2], deletionCountByLength[3]); + ratio_of_1_and_2_to_3_bp_insertions = Utils.formattedRatio(insertionCountByLength[1] + insertionCountByLength[2], insertionCountByLength[3]); + + SNP_het_to_hom_ratio = Utils.formattedRatio(nSNPHets, nSNPHoms); + indel_het_to_hom_ratio = Utils.formattedRatio(nIndelHets, nIndelHoms); + + insertion_to_deletion_ratio = Utils.formattedRatio(n_insertions, n_deletions); + insertion_to_deletion_ratio_for_large_indels = Utils.formattedRatio(n_large_insertions, n_large_deletions); + + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/MendelianViolationEvaluator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/MendelianViolationEvaluator.java index 363f5665f..ff3bf66f7 100755 --- 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/MendelianViolationEvaluator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/MendelianViolationEvaluator.java @@ -1,17 +1,15 @@ package org.broadinstitute.sting.gatk.walkers.varianteval.evaluators; -import org.broadinstitute.sting.gatk.samples.Sample; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.samples.Sample; import org.broadinstitute.sting.gatk.walkers.varianteval.VariantEvalWalker; import org.broadinstitute.sting.gatk.walkers.varianteval.util.Analysis; import org.broadinstitute.sting.gatk.walkers.varianteval.util.DataPoint; import org.broadinstitute.sting.utils.MendelianViolation; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import java.io.PrintStream; -import java.util.ArrayList; import java.util.Map; import java.util.Set; @@ -44,98 +42,74 @@ import java.util.Set; @Analysis(name = "Mendelian Violation Evaluator", description = "Mendelian Violation Evaluator") public class MendelianViolationEvaluator extends VariantEvaluator { - @DataPoint(description = "Number of variants found with at least one family having genotypes") - long nVariants; - @DataPoint(description = "Number of variants found with no family having genotypes -- these sites do not count in the nNoCall") - long nSkipped; - @DataPoint(description="Number of variants x families called (no missing genotype or lowqual)") - long nFamCalled; - @DataPoint(description="Number of variants x families called (no missing genotype or lowqual) that contain at least one var allele.") - long nVarFamCalled; - @DataPoint(description="Number of variants x families discarded as low quality") - long nLowQual; - 
@DataPoint(description="Number of variants x families discarded as no call") - long nNoCall; - @DataPoint(description="Number of loci with mendelian violations") - long nLociViolations; - @DataPoint(description = "Number of mendelian violations found") - long nViolations; + @DataPoint(description = "Number of variants found with at least one family having genotypes", format = "%d") + public long nVariants; + @DataPoint(description = "Number of variants found with no family having genotypes -- these sites do not count in the nNoCall", format = "%d") + public long nSkipped; + @DataPoint(description="Number of variants x families called (no missing genotype or lowqual)", format = "%d") + public long nFamCalled; + @DataPoint(description="Number of variants x families called (no missing genotype or lowqual) that contain at least one var allele.", format = "%d") + public long nVarFamCalled; + @DataPoint(description="Number of variants x families discarded as low quality", format = "%d") + public long nLowQual; + @DataPoint(description="Number of variants x families discarded as no call", format = "%d") + public long nNoCall; + @DataPoint(description="Number of loci with mendelian violations", format = "%d") + public long nLociViolations; + @DataPoint(description = "Number of mendelian violations found", format = "%d") + public long nViolations; + @DataPoint(description="Number of mendelian violations of the type HOM_REF/HOM_REF -> HOM_VAR", format = "%d") + public long mvRefRef_Var; + @DataPoint(description="Number of mendelian violations of the type HOM_REF/HOM_REF -> HET", format = "%d") + public long mvRefRef_Het; + @DataPoint(description="Number of mendelian violations of the type HOM_REF/HET -> HOM_VAR", format = "%d") + public long mvRefHet_Var; + @DataPoint(description="Number of mendelian violations of the type HOM_REF/HOM_VAR -> HOM_VAR", format = "%d") + public long mvRefVar_Var; + @DataPoint(description="Number of mendelian violations of the type 
HOM_REF/HOM_VAR -> HOM_REF", format = "%d") + public long mvRefVar_Ref; + @DataPoint(description="Number of mendelian violations of the type HOM_VAR/HET -> HOM_REF", format = "%d") + public long mvVarHet_Ref; + @DataPoint(description="Number of mendelian violations of the type HOM_VAR/HOM_VAR -> HOM_REF", format = "%d") + public long mvVarVar_Ref; + @DataPoint(description="Number of mendelian violations of the type HOM_VAR/HOM_VAR -> HET", format = "%d") + public long mvVarVar_Het; - /*@DataPoint(description = "number of child hom ref calls where the parent was hom variant") - long KidHomRef_ParentHomVar; - @DataPoint(description = "number of child het calls where the parent was hom ref") - long KidHet_ParentsHomRef; - @DataPoint(description = "number of child het calls where the parent was hom variant") - long KidHet_ParentsHomVar; - @DataPoint(description = "number of child hom variant calls where the parent was hom ref") - long KidHomVar_ParentHomRef; - */ - - @DataPoint(description="Number of mendelian violations of the type HOM_REF/HOM_REF -> HOM_VAR") - long mvRefRef_Var; - @DataPoint(description="Number of mendelian violations of the type HOM_REF/HOM_REF -> HET") - long mvRefRef_Het; - @DataPoint(description="Number of mendelian violations of the type HOM_REF/HET -> HOM_VAR") - long mvRefHet_Var; - @DataPoint(description="Number of mendelian violations of the type HOM_REF/HOM_VAR -> HOM_VAR") - long mvRefVar_Var; - @DataPoint(description="Number of mendelian violations of the type HOM_REF/HOM_VAR -> HOM_REF") - long mvRefVar_Ref; - @DataPoint(description="Number of mendelian violations of the type HOM_VAR/HET -> HOM_REF") - long mvVarHet_Ref; - @DataPoint(description="Number of mendelian violations of the type HOM_VAR/HOM_VAR -> HOM_REF") - long mvVarVar_Ref; - @DataPoint(description="Number of mendelian violations of the type HOM_VAR/HOM_VAR -> HET") - long mvVarVar_Het; - - - /*@DataPoint(description ="Number of inherited var alleles from het parents") - 
long nInheritedVar; - @DataPoint(description ="Number of inherited ref alleles from het parents") - long nInheritedRef;*/ - - @DataPoint(description="Number of HomRef/HomRef/HomRef trios") - long HomRefHomRef_HomRef; - @DataPoint(description="Number of Het/Het/Het trios") - long HetHet_Het; - @DataPoint(description="Number of Het/Het/HomRef trios") - long HetHet_HomRef; - @DataPoint(description="Number of Het/Het/HomVar trios") - long HetHet_HomVar; - @DataPoint(description="Number of HomVar/HomVar/HomVar trios") - long HomVarHomVar_HomVar; - @DataPoint(description="Number of HomRef/HomVar/Het trios") - long HomRefHomVAR_Het; - @DataPoint(description="Number of ref alleles inherited from het/het parents") - long HetHet_inheritedRef; - @DataPoint(description="Number of var alleles inherited from het/het parents") - long HetHet_inheritedVar; - @DataPoint(description="Number of ref alleles inherited from homRef/het parents") - long HomRefHet_inheritedRef; - @DataPoint(description="Number of var alleles inherited from homRef/het parents") - long HomRefHet_inheritedVar; - @DataPoint(description="Number of ref alleles inherited from homVar/het parents") - long HomVarHet_inheritedRef; - @DataPoint(description="Number of var alleles inherited from homVar/het parents") - long HomVarHet_inheritedVar; + @DataPoint(description="Number of HomRef/HomRef/HomRef trios", format = "%d") + public long HomRefHomRef_HomRef; + @DataPoint(description="Number of Het/Het/Het trios", format = "%d") + public long HetHet_Het; + @DataPoint(description="Number of Het/Het/HomRef trios", format = "%d") + public long HetHet_HomRef; + @DataPoint(description="Number of Het/Het/HomVar trios", format = "%d") + public long HetHet_HomVar; + @DataPoint(description="Number of HomVar/HomVar/HomVar trios", format = "%d") + public long HomVarHomVar_HomVar; + @DataPoint(description="Number of HomRef/HomVar/Het trios", format = "%d") + public long HomRefHomVAR_Het; + @DataPoint(description="Number of ref 
alleles inherited from het/het parents", format = "%d") + public long HetHet_inheritedRef; + @DataPoint(description="Number of var alleles inherited from het/het parents", format = "%d") + public long HetHet_inheritedVar; + @DataPoint(description="Number of ref alleles inherited from homRef/het parents", format = "%d") + public long HomRefHet_inheritedRef; + @DataPoint(description="Number of var alleles inherited from homRef/het parents", format = "%d") + public long HomRefHet_inheritedVar; + @DataPoint(description="Number of ref alleles inherited from homVar/het parents", format = "%d") + public long HomVarHet_inheritedRef; + @DataPoint(description="Number of var alleles inherited from homVar/het parents", format = "%d") + public long HomVarHet_inheritedVar; MendelianViolation mv; - PrintStream mvFile; Map> families; public void initialize(VariantEvalWalker walker) { - //Changed by Laurent Francioli - 2011-06-07 - //mv = new MendelianViolation(walker.getFamilyStructure(), walker.getMendelianViolationQualThreshold()); + super.initialize(walker); mv = new MendelianViolation(walker.getMendelianViolationQualThreshold(),false); families = walker.getSampleDB().getFamilies(); } - public boolean enabled() { - //return getVEWalker().FAMILY_STRUCTURE != null; - return true; - } - public String getName() { return "mendelian_violations"; } @@ -144,7 +118,7 @@ public class MendelianViolationEvaluator extends VariantEvaluator { return 1; // we only need to see each eval track } - public String update1(VariantContext vc, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + public void update1(VariantContext vc, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { if (vc.isBiallelic() && vc.hasGenotypes()) { // todo -- currently limited to biallelic loci if(mv.countViolations(families,vc)>0){ @@ -183,11 +157,6 @@ public class MendelianViolationEvaluator extends VariantEvaluator { else{ nSkipped++; } - - - return null; } - - 
return null; // we don't capture any interesting sites } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/MultiallelicSummary.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/MultiallelicSummary.java index 97aebc376..7efb1d823 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/MultiallelicSummary.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/MultiallelicSummary.java @@ -28,19 +28,16 @@ import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.varianteval.VariantEvalWalker; import org.broadinstitute.sting.gatk.walkers.varianteval.util.Analysis; import org.broadinstitute.sting.gatk.walkers.varianteval.util.DataPoint; -import org.broadinstitute.sting.gatk.walkers.varianteval.util.TableType; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; +import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.variantcontext.*; - -import java.util.*; +import org.broadinstitute.sting.utils.variantcontext.Allele; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; @Analysis(description = "Evaluation summary for multi-allelic variants") -public class MultiallelicSummary extends VariantEvaluator { // implements StandardEval { +public class MultiallelicSummary extends VariantEvaluator implements StandardEval { final protected static Logger logger = Logger.getLogger(MultiallelicSummary.class); public enum Type { @@ -48,119 +45,55 @@ public class MultiallelicSummary extends 
VariantEvaluator { // implements Standa } // basic counts on various rates found - @DataPoint(description = "Number of processed loci") + @DataPoint(description = "Number of processed loci", format = "%d") public long nProcessedLoci = 0; - @DataPoint(description = "Number of SNPs") + @DataPoint(description = "Number of SNPs", format = "%d") public int nSNPs = 0; - @DataPoint(description = "Number of multi-allelic SNPs") + @DataPoint(description = "Number of multi-allelic SNPs", format = "%d") public int nMultiSNPs = 0; @DataPoint(description = "% processed sites that are multi-allelic SNPs", format = "%.5f") public double processedMultiSnpRatio = 0; @DataPoint(description = "% SNP sites that are multi-allelic", format = "%.3f") public double variantMultiSnpRatio = 0; - @DataPoint(description = "Number of Indels") + @DataPoint(description = "Number of Indels", format = "%d") public int nIndels = 0; - @DataPoint(description = "Number of multi-allelic Indels") + @DataPoint(description = "Number of multi-allelic Indels", format = "%d") public int nMultiIndels = 0; @DataPoint(description = "% processed sites that are multi-allelic Indels", format = "%.5f") public double processedMultiIndelRatio = 0; @DataPoint(description = "% Indel sites that are multi-allelic", format = "%.3f") public double variantMultiIndelRatio = 0; - @DataPoint(description = "Number of Transitions") + @DataPoint(description = "Number of Transitions", format = "%d") public int nTi = 0; - @DataPoint(description = "Number of Transversions") + @DataPoint(description = "Number of Transversions", format = "%d") public int nTv = 0; @DataPoint(description = "Overall TiTv ratio", format = "%.2f") public double TiTvRatio = 0; - @DataPoint(description = "Multi-allelic SNPs partially known") + @DataPoint(description = "Multi-allelic SNPs partially known", format = "%d") public int knownSNPsPartial = 0; - @DataPoint(description = "Multi-allelic SNPs completely known") + @DataPoint(description = "Multi-allelic 
SNPs completely known", format = "%d") public int knownSNPsComplete = 0; @DataPoint(description = "Multi-allelic SNP Novelty Rate") public String SNPNoveltyRate = "NA"; //TODO -- implement me - //@DataPoint(description = "Multi-allelic Indels partially known") + //@DataPoint(description = "Multi-allelic Indels partially known", format = "%d") public int knownIndelsPartial = 0; - //@DataPoint(description = "Multi-allelic Indels completely known") + //@DataPoint(description = "Multi-allelic Indels completely known", format = "%d") public int knownIndelsComplete = 0; //@DataPoint(description = "Multi-allelic Indel Novelty Rate") public String indelNoveltyRate = "NA"; - @DataPoint(description="Histogram of allele frequencies for most common SNP alternate allele") - AFHistogram AFhistogramMaxSnp = new AFHistogram(); - @DataPoint(description="Histogram of allele frequencies for less common SNP alternate alleles") - AFHistogram AFhistogramMinSnp = new AFHistogram(); + @Override public int getComparisonOrder() { return 2; } - @DataPoint(description="Histogram of allele frequencies for most common Indel alternate allele") - AFHistogram AFhistogramMaxIndel = new AFHistogram(); - - @DataPoint(description="Histogram of allele frequencies for less common Indel alternate alleles") - AFHistogram AFhistogramMinIndel = new AFHistogram(); - - /* - * AF histogram table object - */ - static class AFHistogram implements TableType { - private Object[] rowKeys, colKeys = {"count"}; - private int[] AFhistogram; - - private static final double AFincrement = 0.01; - private static final int numBins = (int)(1.00 / AFincrement); - - public AFHistogram() { - rowKeys = initRowKeys(); - AFhistogram = new int[rowKeys.length]; - } - - public Object[] getColumnKeys() { - return colKeys; - } - - public Object[] getRowKeys() { - return rowKeys; - } - - public Object getCell(int row, int col) { - return AFhistogram[row]; - } - - private static Object[] initRowKeys() { - ArrayList keyList = new 
ArrayList(numBins + 1); - for ( double a = 0.00; a <= 1.01; a += AFincrement ) { - keyList.add(String.format("%.2f", a)); - } - return keyList.toArray(); - } - - public String getName() { return "AFHistTable"; } - - public void update(final double AF) { - final int bin = (int)(numBins * MathUtils.round(AF, 2)); - AFhistogram[bin]++; - } - } - - public void initialize(VariantEvalWalker walker) {} - - @Override public boolean enabled() { return true; } - - public int getComparisonOrder() { - return 2; - } - - public void update0(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - nProcessedLoci += context.getSkippedBases() + (ref == null ? 0 : 1); - } - - public String update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if ( eval == null || eval.isMonomorphicInSamples() ) - return null; + public void update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + if ( eval == null || (getWalker().ignoreAC0Sites() && eval.isMonomorphicInSamples()) ) + return; // update counts switch ( eval.getType() ) { @@ -170,7 +103,6 @@ public class MultiallelicSummary extends VariantEvaluator { // implements Standa nMultiSNPs++; calculatePairwiseTiTv(eval); calculateSNPPairwiseNovelty(eval, comp); - updateAFhistogram(eval, AFhistogramMaxSnp, AFhistogramMinSnp); } break; case INDEL: @@ -178,14 +110,13 @@ public class MultiallelicSummary extends VariantEvaluator { // implements Standa if ( !eval.isBiallelic() ) { nMultiIndels++; calculateIndelPairwiseNovelty(eval, comp); - updateAFhistogram(eval, AFhistogramMaxIndel, AFhistogramMinIndel); } break; default: throw new UserException.BadInput("Unexpected variant context type: " + eval); } - return null; // we don't capture any interesting sites + return; } private void calculatePairwiseTiTv(VariantContext vc) { @@ -214,33 +145,11 @@ public class MultiallelicSummary 
extends VariantEvaluator { // implements Standa } private void calculateIndelPairwiseNovelty(VariantContext eval, VariantContext comp) { - } - - private void updateAFhistogram(VariantContext vc, AFHistogram max, AFHistogram min) { - - final Object obj = vc.getAttribute(VCFConstants.ALLELE_FREQUENCY_KEY, null); - if ( obj == null || !(obj instanceof List) ) - return; - - List list = (List)obj; - ArrayList AFs = new ArrayList(list.size()); - for ( String str : list ) { - AFs.add(Double.valueOf(str)); - } - - Collections.sort(AFs); - max.update(AFs.get(AFs.size()-1)); - for ( int i = 0; i < AFs.size() - 1; i++ ) - min.update(AFs.get(i)); - } - - private final String noveltyRate(final int all, final int known) { - final int novel = all - known; - final double rate = (novel / (1.0 * all)); - return all == 0 ? "NA" : String.format("%.2f", rate); + // TODO -- implement me } public void finalizeEvaluation() { + nProcessedLoci = getWalker().getnProcessedLoci(); processedMultiSnpRatio = (double)nMultiSNPs / (double)nProcessedLoci; variantMultiSnpRatio = (double)nMultiSNPs / (double)nSNPs; processedMultiIndelRatio = (double)nMultiIndels / (double)nProcessedLoci; @@ -248,7 +157,7 @@ public class MultiallelicSummary extends VariantEvaluator { // implements Standa TiTvRatio = (double)nTi / (double)nTv; - SNPNoveltyRate = noveltyRate(nMultiSNPs, knownSNPsPartial + knownSNPsComplete); - indelNoveltyRate = noveltyRate(nMultiSNPs, knownIndelsPartial + knownIndelsComplete); + SNPNoveltyRate = Utils.formattedNoveltyRate(knownSNPsPartial + knownSNPsComplete, nMultiSNPs); + indelNoveltyRate = Utils.formattedNoveltyRate(knownIndelsPartial + knownIndelsComplete, nMultiSNPs); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/PhaseStats.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/PhaseStats.java deleted file mode 100755 index ab1f410f9..000000000 --- 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/PhaseStats.java +++ /dev/null @@ -1,54 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.varianteval.evaluators; - -/** - * Created by IntelliJ IDEA. User: kiran Date: Nov 29, 2010 Time: 3:25:59 PM To change this template use File | Settings - * | File Templates. - */ -class NewPhaseStats { - public int neitherPhased; - public int onlyCompPhased; - public int onlyEvalPhased; - public int phasesAgree; - public int phasesDisagree; - - public NewPhaseStats() { - this.neitherPhased = 0; - this.onlyCompPhased = 0; - this.onlyEvalPhased = 0; - this.phasesAgree = 0; - this.phasesDisagree = 0; - } - - public String toString() { - StringBuilder sb = new StringBuilder(); - sb.append("Neither phased: " + neitherPhased + "\tOnly Comp: " + onlyCompPhased + "\tOnly Eval: " + onlyEvalPhased + "\tSame phase: " + phasesAgree + "\tOpposite phase: " + phasesDisagree); - return sb.toString(); - } - - public static String[] getFieldNamesArray() { - return new String[]{"total", "neither", "only_comp", "only_eval", "both", "match", "switch", "switch_rate"}; - } - - public Object getField(int index) { - switch (index) { - case (0): - return (neitherPhased + onlyCompPhased + onlyEvalPhased + phasesAgree + phasesDisagree); - case (1): - return neitherPhased; - case (2): - return onlyCompPhased; - case (3): - return onlyEvalPhased; - case (4): - return (phasesAgree + phasesDisagree); - case (5): - return phasesAgree; - case (6): - return phasesDisagree; - case (7): - return ((phasesDisagree == 0) ? 
0 : ((double) phasesDisagree) / (phasesAgree + phasesDisagree)); - default: - return -1; - } - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/PrintMissingComp.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/PrintMissingComp.java index b209ee13d..a0cb662e0 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/PrintMissingComp.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/PrintMissingComp.java @@ -33,12 +33,8 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContext; @Analysis(name = "PrintMissingComp", description = "the overlap between eval and comp sites") public class PrintMissingComp extends VariantEvaluator { - @DataPoint(description = "number of eval sites outside of comp sites") - long nMissing = 0; - - //public PrintMissingComp(VariantEvalWalker parent) { - // super(parent); - //} + @DataPoint(description = "number of eval sites outside of comp sites", format = "%d") + public long nMissing = 0; public String getName() { return "PrintMissingComp"; @@ -48,20 +44,13 @@ public class PrintMissingComp extends VariantEvaluator { return 2; // we need to see each eval track and each comp track } - public boolean enabled() { - return true; - } - - - public String update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - boolean compIsGood = comp != null && comp.isNotFiltered() && comp.isSNP(); - boolean evalIsGood = eval != null && eval.isSNP(); + public void update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + final boolean compIsGood = comp != null && comp.isNotFiltered() && comp.isSNP(); + final boolean evalIsGood = eval != null && eval.isSNP(); if ( compIsGood & ! 
evalIsGood ) { nMissing++; - return "MissingFrom" + comp.getSource(); - } else { - return null; + super.getWalker().getLogger().info("MissingFrom" + eval.toString() + " is missing from " + comp.getSource()); } } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/SamplePreviousGenotypes.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/SamplePreviousGenotypes.java deleted file mode 100755 index 751f61a97..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/SamplePreviousGenotypes.java +++ /dev/null @@ -1,30 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.varianteval.evaluators; - -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.variantcontext.Genotype; - -import java.util.HashMap; - -/** - * Created by IntelliJ IDEA. User: kiran Date: Nov 29, 2010 Time: 3:25:59 PM To change this template use File | Settings - * | File Templates. 
- */ -class NewSamplePreviousGenotypes { - private HashMap sampleGenotypes = null; - - public NewSamplePreviousGenotypes() { - this.sampleGenotypes = new HashMap(); - } - - public CompEvalGenotypes get(String sample) { - return sampleGenotypes.get(sample); - } - - public void put(String sample, CompEvalGenotypes compEvalGts) { - sampleGenotypes.put(sample, compEvalGts); - } - - public void put(String sample, GenomeLoc locus, Genotype compGt, Genotype evalGt) { - sampleGenotypes.put(sample, new CompEvalGenotypes(locus, compGt, evalGt)); - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ThetaVariantEvaluator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ThetaVariantEvaluator.java index bb7843361..88bf3aef9 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ThetaVariantEvaluator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ThetaVariantEvaluator.java @@ -14,31 +14,27 @@ import java.util.concurrent.ConcurrentMap; @Analysis(description = "Computes different estimates of theta based on variant sites and genotypes") public class ThetaVariantEvaluator extends VariantEvaluator { - @DataPoint(description = "Average heterozygosity at variant sites; note that missing genotypes are ignored when computing this value") - double avgHet = 0.0; - @DataPoint(description = "Average pairwise differences at aligned sequences; averaged over both number of sequeneces and number of variant sites; note that missing genotypes are ignored when computing this value") - double avgAvgDiffs = 0.0; - @DataPoint(description = "Sum of heterozygosity over all variant sites; divide this by total target to get estimate of per base theta") - double totalHet = 0.0; - @DataPoint(description = "Sum of pairwise diffs over all variant sites; divide this by total target to get estimate of per base theta") - double totalAvgDiffs = 0.0; - 
@DataPoint(description = "Theta for entire region estimated based on number of segregating sites; divide ths by total target to get estimate of per base theta") - double thetaRegionNumSites = 0.0; + @DataPoint(description = "Average heterozygosity at variant sites; note that missing genotypes are ignored when computing this value", format = "%.8f") + public double avgHet = 0.0; + @DataPoint(description = "Average pairwise differences at aligned sequences; averaged over both number of sequeneces and number of variant sites; note that missing genotypes are ignored when computing this value", format = "%.8f") + public double avgAvgDiffs = 0.0; + @DataPoint(description = "Sum of heterozygosity over all variant sites; divide this by total target to get estimate of per base theta", format = "%.8f") + public double totalHet = 0.0; + @DataPoint(description = "Sum of pairwise diffs over all variant sites; divide this by total target to get estimate of per base theta", format = "%.8f") + public double totalAvgDiffs = 0.0; + @DataPoint(description = "Theta for entire region estimated based on number of segregating sites; divide ths by total target to get estimate of per base theta", format = "%.8f") + public double thetaRegionNumSites = 0.0; //helper variables double numSites = 0; - public boolean enabled() { - return true; - } - public int getComparisonOrder() { return 1; } - public String update1(VariantContext vc, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if (vc == null || !vc.isSNP() || !vc.hasGenotypes() || vc.isMonomorphicInSamples()) { - return null; //no interesting sites + public void update1(VariantContext vc, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + if (vc == null || !vc.isSNP() || (getWalker().ignoreAC0Sites() && vc.isMonomorphicInSamples())) { + return; } //this maps allele to a count @@ -107,8 +103,6 @@ public class ThetaVariantEvaluator extends VariantEvaluator { this.totalAvgDiffs += 
numDiffs / numPairwise; } } - - return null; } @Override diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/TiTvVariantEvaluator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/TiTvVariantEvaluator.java index 9de850d82..6c4fcd26d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/TiTvVariantEvaluator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/TiTvVariantEvaluator.java @@ -11,29 +11,24 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; @Analysis(description = "Ti/Tv Variant Evaluator") public class TiTvVariantEvaluator extends VariantEvaluator implements StandardEval { - - @DataPoint(description = "number of transition loci") - long nTi = 0; - @DataPoint(description = "number of transversion loci") - long nTv = 0; + @DataPoint(description = "number of transition loci", format = "%d") + public long nTi = 0; + @DataPoint(description = "number of transversion loci", format = "%d") + public long nTv = 0; @DataPoint(description = "the transition to transversion ratio", format = "%.2f") - double tiTvRatio = 0.0; - @DataPoint(description = "number of comp transition sites") - long nTiInComp = 0; - @DataPoint(description = "number of comp transversion sites") - long nTvInComp = 0; + public double tiTvRatio = 0.0; + @DataPoint(description = "number of comp transition sites", format = "%d") + public long nTiInComp = 0; + @DataPoint(description = "number of comp transversion sites", format = "%d") + public long nTvInComp = 0; @DataPoint(description = "the transition to transversion ratio for comp sites", format = "%.2f") - double TiTvRatioStandard = 0.0; - @DataPoint(description = "number of derived transition loci") - long nTiDerived = 0; - @DataPoint(description = "number of derived transversion loci") - long nTvDerived = 0; + public double TiTvRatioStandard = 0.0; + @DataPoint(description = 
"number of derived transition loci", format = "%d") + public long nTiDerived = 0; + @DataPoint(description = "number of derived transversion loci", format = "%d") + public long nTvDerived = 0; @DataPoint(description = "the derived transition to transversion ratio", format = "%.2f") - double tiTvDerivedRatio = 0.0; - - public boolean enabled() { - return true; - } + public double tiTvDerivedRatio = 0.0; public int getComparisonOrder() { return 2; // we only need to see each eval track @@ -62,11 +57,9 @@ public class TiTvVariantEvaluator extends VariantEvaluator implements StandardEv } } - public String update2(VariantContext vc1, VariantContext vc2, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + public void update2(VariantContext vc1, VariantContext vc2, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { if (vc1 != null) updateTiTv(vc1, false); if (vc2 != null) updateTiTv(vc2, true); - - return null; // we don't capture any interesting sites } @Override diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ValidationReport.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ValidationReport.java index 86d3467fb..bf457f5c0 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ValidationReport.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ValidationReport.java @@ -24,29 +24,29 @@ import java.util.Collection; @Analysis(description = "Assess site accuracy and sensitivity of callset against follow-up validation assay") public class ValidationReport extends VariantEvaluator implements StandardEval { // todo -- note this isn't strictly allele away. It's really focused on sites. 
A/T call at a validated A/G site is currently counted as a TP - @DataPoint(description = "nComp") int nComp = 0; - @DataPoint(description = "TP") int TP = 0; - @DataPoint(description = "FP") int FP = 0; - @DataPoint(description = "FN") int FN = 0; - @DataPoint(description = "TN") int TN = 0; + @DataPoint(description = "nComp", format = "%d") public int nComp = 0; + @DataPoint(description = "TP", format = "%d") public int TP = 0; + @DataPoint(description = "FP", format = "%d") public int FP = 0; + @DataPoint(description = "FN", format = "%d") public int FN = 0; + @DataPoint(description = "TN", format = "%d") public int TN = 0; - @DataPoint(description = "Sensitivity", format = "%.2f") double sensitivity = 0; - @DataPoint(description = "Specificity", format = "%.2f") double specificity = 0; - @DataPoint(description = "PPV", format = "%.2f") double PPV = 0; - @DataPoint(description = "FDR", format = "%.2f") double FDR = 0; + @DataPoint(description = "Sensitivity", format = "%.2f") public double sensitivity = 0; + @DataPoint(description = "Specificity", format = "%.2f") public double specificity = 0; + @DataPoint(description = "PPV", format = "%.2f") public double PPV = 0; + @DataPoint(description = "FDR", format = "%.2f") public double FDR = 0; - @DataPoint(description = "CompMonoEvalNoCall") int CompMonoEvalNoCall = 0; - @DataPoint(description = "CompMonoEvalFiltered") int CompMonoEvalFiltered = 0; - @DataPoint(description = "CompMonoEvalMono") int CompMonoEvalMono = 0; - @DataPoint(description = "CompMonoEvalPoly") int CompMonoEvalPoly = 0; + @DataPoint(description = "CompMonoEvalNoCall", format = "%d") public int CompMonoEvalNoCall = 0; + @DataPoint(description = "CompMonoEvalFiltered", format = "%d") public int CompMonoEvalFiltered = 0; + @DataPoint(description = "CompMonoEvalMono", format = "%d") public int CompMonoEvalMono = 0; + @DataPoint(description = "CompMonoEvalPoly", format = "%d") public int CompMonoEvalPoly = 0; - @DataPoint(description = 
"CompPolyEvalNoCall") int CompPolyEvalNoCall = 0; - @DataPoint(description = "CompPolyEvalFiltered") int CompPolyEvalFiltered = 0; - @DataPoint(description = "CompPolyEvalMono") int CompPolyEvalMono = 0; - @DataPoint(description = "CompPolyEvalPoly") int CompPolyEvalPoly = 0; + @DataPoint(description = "CompPolyEvalNoCall", format = "%d") public int CompPolyEvalNoCall = 0; + @DataPoint(description = "CompPolyEvalFiltered", format = "%d") public int CompPolyEvalFiltered = 0; + @DataPoint(description = "CompPolyEvalMono", format = "%d") public int CompPolyEvalMono = 0; + @DataPoint(description = "CompPolyEvalPoly", format = "%d") public int CompPolyEvalPoly = 0; - @DataPoint(description = "CompFiltered") int CompFiltered = 0; - @DataPoint(description = "Eval and comp have different alleles") int nDifferentAlleleSites = 0; + @DataPoint(description = "CompFiltered", format = "%d") public int CompFiltered = 0; + @DataPoint(description = "Eval and comp have different alleles", format = "%d") public int nDifferentAlleleSites = 0; private static final boolean TREAT_ALL_SITES_IN_EVAL_VCF_AS_CALLED = true; private static final boolean REQUIRE_IDENTICAL_ALLELES = false; @@ -57,7 +57,6 @@ public class ValidationReport extends VariantEvaluator implements StandardEval { final int[][] counts = new int[SiteStatus.values().length][SiteStatus.values().length]; @Override public int getComparisonOrder() { return 2; } - @Override public boolean enabled() { return true; } @Override public void finalizeEvaluation() { @@ -97,7 +96,7 @@ public class ValidationReport extends VariantEvaluator implements StandardEval { } @Override - public String update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + public void update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { if ( comp != null ) { // we only need to consider sites in comp if ( 
REQUIRE_IDENTICAL_ALLELES && (eval != null && haveDifferentAltAlleles(eval, comp))) nDifferentAlleleSites++; @@ -107,8 +106,6 @@ public class ValidationReport extends VariantEvaluator implements StandardEval { counts[compStatus.ordinal()][evalStatus.ordinal()]++; } } - - return null; // we don't capture any interesting sites } // diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantEvaluator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantEvaluator.java index 83a1c2f3b..df4c3e860 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantEvaluator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantEvaluator.java @@ -4,31 +4,36 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.varianteval.VariantEvalWalker; -import org.broadinstitute.sting.gatk.walkers.varianteval.util.NewEvaluationContext; -import org.broadinstitute.sting.gatk.walkers.varianteval.util.StateKey; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import java.util.Collection; +public abstract class VariantEvaluator implements Comparable { + private VariantEvalWalker walker; + private final String simpleName; -public abstract class VariantEvaluator { - public void initialize(VariantEvalWalker walker) {} + protected VariantEvaluator() { + this.simpleName = getClass().getSimpleName(); + } - public abstract boolean enabled(); + public void initialize(VariantEvalWalker walker) { + this.walker = walker; + } + + public VariantEvalWalker getWalker() { + return walker; + } // Should return the number of VariantContexts expected as inputs to update. 
Can be 1 or 2 public abstract int getComparisonOrder(); // called at all sites, regardless of eval context itself; useful for counting processed bases - public void update0(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + // No longer available. The processed bp is kept in VEW itself for performance reasons + // public void update0(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + public void update1(VariantContext eval, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { } - public String update1(VariantContext eval, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - return null; - } - - public String update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - return null; + public void update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { } public void finalizeEvaluation() {} @@ -45,8 +50,59 @@ public abstract class VariantEvaluator { return ((double)num) / (Math.max(denom, 1)); } - public boolean stateIsApplicable(StateKey stateKey) { - return true; + /** + * Returns true if the variant in vc was a singleton in the original input evaluation + * set, regardless of variant context subsetting that has occurred. 
+ * @param eval the VariantContext being assessed for this previous status as a singleton + * @return true if eval was originally a singleton site + */ + protected static boolean variantWasSingleton(final VariantContext eval) { + return eval.getAttributeAsBoolean(VariantEvalWalker.IS_SINGLETON_KEY, false); } + public final String getSimpleName() { + return simpleName; + } + + @Override + public int compareTo(final VariantEvaluator variantEvaluator) { + return getSimpleName().compareTo(variantEvaluator.getSimpleName()); + } + + /** + * Evaluation modules that override this function to indicate that they support + * combining the results of two independent collections of eval data into + * a single meaningful result. The purpose of this interface is to + * allow us to cut up the input data into many independent stratifications, and then + * at the end of the eval run decide which stratifications to combine. This is + * important in the case of AC, where you may have thousands of distinct AC + * values that chop up the number of variants to too small a number of variants, + * and you'd like to combine the AC values into ranges containing some percent + * of the data. + * + * For example, suppose you have an eval that + * counts variants in a variable nVariants. If you want to be able to combine + * multiple evaluations of this type, overload the combine function + * with a function that sets this.nVariants += other.nVariants. + * + * Add in the appropriate fields of the VariantEvaluator T + * (of the same type as this object) to the values of this object. + * + * The values in this and other are implicitly independent, so that + * the values can be added together. 
+ * + * @param other a VariantEvaluator of the same type of this object + */ + public void combine(final VariantEvaluator other) { + throw new ReviewedStingException(getSimpleName() + " doesn't support combining results, sorry"); + } + + /** + * Must be overloaded to return true for evaluation modules that support the combine operation + * + * @return + */ + public boolean supportsCombine() { + return false; + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantQualityScore.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantQualityScore.java index ce9e45c9b..347ca56b8 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantQualityScore.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantQualityScore.java @@ -30,7 +30,6 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.varianteval.util.Analysis; import org.broadinstitute.sting.gatk.walkers.varianteval.util.DataPoint; -import org.broadinstitute.sting.gatk.walkers.varianteval.util.TableType; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.VariantContext; @@ -44,215 +43,207 @@ import java.util.HashMap; * @since Apr 6, 2010 */ -@Analysis(name = "Variant Quality Score", description = "Shows various stats of sets of variants binned by variant quality score") -public class VariantQualityScore extends VariantEvaluator { +//@Analysis(name = "Variant Quality Score", description = "Shows various stats of sets of variants binned by variant quality score") +@Deprecated +public class VariantQualityScore { + // TODO - this should really be a stratification - // a mapping from quality score histogram bin to Ti/Tv 
ratio - @DataPoint(description = "the Ti/Tv ratio broken out by variant quality") - TiTvStats titvStats = null; - - @DataPoint(description = "average variant quality for each allele count") - AlleleCountStats alleleCountStats = null; - - static class TiTvStats implements TableType { - final static int NUM_BINS = 20; - final HashMap> qualByIsTransition = new HashMap>(); // A hashMap holds all the qualities until we are able to bin them appropriately - final long transitionByQuality[] = new long[NUM_BINS]; - final long transversionByQuality[] = new long[NUM_BINS]; - final double titvByQuality[] = new double[NUM_BINS]; // the final ti/tv sets that get reported out - - public Object[] getRowKeys() { - return new String[]{"sample"}; - } - - public Object[] getColumnKeys() { - final String columnKeys[] = new String[NUM_BINS]; - for( int iii = 0; iii < NUM_BINS; iii++ ) { - columnKeys[iii] = "titvBin" + iii; - } - return columnKeys; - } - - public String getName() { - return "TiTvStats"; - } - - public String getCell(int x, int y) { - return String.valueOf(titvByQuality[y]); - } - - public String toString() { - StringBuffer returnString = new StringBuffer(); - // output the ti/tv array - returnString.append("titvByQuality: "); - for( int iii = 0; iii < NUM_BINS; iii++ ) { - returnString.append(titvByQuality[iii]); - returnString.append(" "); - } - return returnString.toString(); - } - - public void incrValue( final double qual, final boolean isTransition ) { - final Integer qualKey = Math.round((float) qual); - final long numTransition = (isTransition ? 1L : 0L); - final long numTransversion = (isTransition ? 
0L : 1L); - if( qualByIsTransition.containsKey(qualKey) ) { - Pair transitionPair = qualByIsTransition.get(qualKey); - transitionPair.set(transitionPair.getFirst() + numTransition, transitionPair.getSecond() + numTransversion); - qualByIsTransition.put(qualKey, transitionPair); - } else { - qualByIsTransition.put(qualKey, new Pair(numTransition,numTransversion)); - } - } - - public void organizeTiTvTables() { - for( int iii = 0; iii < NUM_BINS; iii++ ) { - transitionByQuality[iii] = 0L; - transversionByQuality[iii] = 0L; - titvByQuality[iii] = 0.0; - } - - int maxQual = 0; - - // Calculate the maximum quality score in order to normalize and histogram - for( final Integer qual : qualByIsTransition.keySet() ) { - if( qual > maxQual ) { - maxQual = qual; - } - } - - final double binSize = ((double)maxQual) / ((double) (NUM_BINS-1)); - - for( final Integer qual : qualByIsTransition.keySet() ) { - final int index = (int)Math.floor( ((double) qual) / binSize ); - if( index >= 0 ) { // BUGBUG: why is there overflow here? 
- Pair transitionPair = qualByIsTransition.get(qual); - transitionByQuality[index] += transitionPair.getFirst(); - transversionByQuality[index] += transitionPair.getSecond(); - } - } - - for( int iii = 0; iii < NUM_BINS; iii++ ) { - if( transitionByQuality[iii] + transversionByQuality[iii] > 800L ) { // need to have a sufficient number of variants to get a useful Ti/Tv ratio - titvByQuality[iii] = ((double) transitionByQuality[iii]) / ((double) transversionByQuality[iii]); - } else { - titvByQuality[iii] = 0.0; - } - } - - } - } - - class AlleleCountStats implements TableType { - final HashMap> qualityListMap = new HashMap>(); - final HashMap qualityMap = new HashMap(); - - public Object[] getRowKeys() { - final int NUM_BINS = qualityListMap.keySet().size(); - final String rowKeys[] = new String[NUM_BINS]; - int iii = 0; - for( final Integer key : qualityListMap.keySet() ) { - rowKeys[iii] = "AC" + key; - iii++; - } - return rowKeys; - - } - - public Object[] getColumnKeys() { - return new String[]{"alleleCount","avgQual"}; - } - - public String getName() { - return "AlleleCountStats"; - } - - public String getCell(int x, int y) { - int iii = 0; - for( final Integer key : qualityListMap.keySet() ) { - if(iii == x) { - if(y == 0) { return String.valueOf(key); } - else { return String.valueOf(qualityMap.get(key)); } - } - iii++; - } - return null; - } - - public String toString() { - String returnString = ""; - // output the quality map - returnString += "AlleleCountStats: "; - //for( int iii = 0; iii < NUM_BINS; iii++ ) { - // returnString += titvByQuality[iii] + " "; - //} - return returnString; - } - - public void incrValue( final double qual, final int alleleCount ) { - ArrayList list = qualityListMap.get(alleleCount); - if(list==null) { list = new ArrayList(); } - list.add(qual); - qualityListMap.put(alleleCount, list); - } - - public void organizeAlleleCountTables() { - for( final Integer key : qualityListMap.keySet() ) { - final ArrayList list = 
qualityListMap.get(key); - double meanQual = 0.0; - final double numQuals = (double)list.size(); - for( Double qual : list ) { - meanQual += qual / numQuals; - } - qualityMap.put(key, meanQual); - } - } - } - - //public VariantQualityScore(VariantEvalWalker parent) { - //super(parent); - //} - - public String getName() { - return "VariantQualityScore"; - } - - public int getComparisonOrder() { - return 1; // we only need to see each eval track - } - - public boolean enabled() { - return true; - } - - public String toString() { - return getName(); - } - - public String update1(VariantContext eval, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - final String interesting = null; - - if( eval != null && eval.isSNP() && eval.isBiallelic() && eval.isPolymorphicInSamples() ) { //BUGBUG: only counting biallelic sites (revisit what to do with triallelic sites) - if( titvStats == null ) { titvStats = new TiTvStats(); } - titvStats.incrValue(eval.getPhredScaledQual(), VariantContextUtils.isTransition(eval)); - - if( alleleCountStats == null ) { alleleCountStats = new AlleleCountStats(); } - int alternateAlleleCount = 0; - for (final Allele a : eval.getAlternateAlleles()) { - alternateAlleleCount += eval.getCalledChrCount(a); - } - alleleCountStats.incrValue(eval.getPhredScaledQual(), alternateAlleleCount); - } - - return interesting; // This module doesn't capture any interesting sites, so return null - } - - public void finalizeEvaluation() { - if( titvStats != null ) { - titvStats.organizeTiTvTables(); - } - if( alleleCountStats != null ) { - alleleCountStats.organizeAlleleCountTables(); - } - } +// public class VariantQualityScore extends VariantEvaluator { +// +// // a mapping from quality score histogram bin to Ti/Tv ratio +// @DataPoint(description = "the Ti/Tv ratio broken out by variant quality") +// TiTvStats titvStats = null; +// +// @DataPoint(description = "average variant quality for each allele count") +// AlleleCountStats 
alleleCountStats = null; +// +// static class TiTvStats extends TableType { +// final static int NUM_BINS = 20; +// final HashMap> qualByIsTransition = new HashMap>(); // A hashMap holds all the qualities until we are able to bin them appropriately +// final long transitionByQuality[] = new long[NUM_BINS]; +// final long transversionByQuality[] = new long[NUM_BINS]; +// final double titvByQuality[] = new double[NUM_BINS]; // the final ti/tv sets that get reported out +// +// public Object[] getRowKeys() { +// return new String[]{"sample"}; +// } +// +// public Object[] getColumnKeys() { +// final String columnKeys[] = new String[NUM_BINS]; +// for( int iii = 0; iii < NUM_BINS; iii++ ) { +// columnKeys[iii] = "titvBin" + iii; +// } +// return columnKeys; +// } +// +// public String getCell(int x, int y) { +// return String.valueOf(titvByQuality[y]); +// } +// +// public String toString() { +// StringBuffer returnString = new StringBuffer(); +// // output the ti/tv array +// returnString.append("titvByQuality: "); +// for( int iii = 0; iii < NUM_BINS; iii++ ) { +// returnString.append(titvByQuality[iii]); +// returnString.append(" "); +// } +// return returnString.toString(); +// } +// +// public void incrValue( final double qual, final boolean isTransition ) { +// final Integer qualKey = Math.round((float) qual); +// final long numTransition = (isTransition ? 1L : 0L); +// final long numTransversion = (isTransition ? 
0L : 1L); +// if( qualByIsTransition.containsKey(qualKey) ) { +// Pair transitionPair = qualByIsTransition.get(qualKey); +// transitionPair.set(transitionPair.getFirst() + numTransition, transitionPair.getSecond() + numTransversion); +// qualByIsTransition.put(qualKey, transitionPair); +// } else { +// qualByIsTransition.put(qualKey, new Pair(numTransition,numTransversion)); +// } +// } +// +// public void organizeTiTvTables() { +// for( int iii = 0; iii < NUM_BINS; iii++ ) { +// transitionByQuality[iii] = 0L; +// transversionByQuality[iii] = 0L; +// titvByQuality[iii] = 0.0; +// } +// +// int maxQual = 0; +// +// // Calculate the maximum quality score in order to normalize and histogram +// for( final Integer qual : qualByIsTransition.keySet() ) { +// if( qual > maxQual ) { +// maxQual = qual; +// } +// } +// +// final double binSize = ((double)maxQual) / ((double) (NUM_BINS-1)); +// +// for( final Integer qual : qualByIsTransition.keySet() ) { +// final int index = (int)Math.floor( ((double) qual) / binSize ); +// if( index >= 0 ) { // BUGBUG: why is there overflow here? 
+// Pair transitionPair = qualByIsTransition.get(qual); +// transitionByQuality[index] += transitionPair.getFirst(); +// transversionByQuality[index] += transitionPair.getSecond(); +// } +// } +// +// for( int iii = 0; iii < NUM_BINS; iii++ ) { +// if( transitionByQuality[iii] + transversionByQuality[iii] > 800L ) { // need to have a sufficient number of variants to get a useful Ti/Tv ratio +// titvByQuality[iii] = ((double) transitionByQuality[iii]) / ((double) transversionByQuality[iii]); +// } else { +// titvByQuality[iii] = 0.0; +// } +// } +// +// } +// } +// +// class AlleleCountStats extends TableType { +// final HashMap> qualityListMap = new HashMap>(); +// final HashMap qualityMap = new HashMap(); +// +// public Object[] getRowKeys() { +// final int NUM_BINS = qualityListMap.keySet().size(); +// final String rowKeys[] = new String[NUM_BINS]; +// int iii = 0; +// for( final Integer key : qualityListMap.keySet() ) { +// rowKeys[iii] = "AC" + key; +// iii++; +// } +// return rowKeys; +// +// } +// +// public Object[] getColumnKeys() { +// return new String[]{"alleleCount","avgQual"}; +// } +// +// public String getCell(int x, int y) { +// int iii = 0; +// for( final Integer key : qualityListMap.keySet() ) { +// if(iii == x) { +// if(y == 0) { return String.valueOf(key); } +// else { return String.valueOf(qualityMap.get(key)); } +// } +// iii++; +// } +// return null; +// } +// +// public String toString() { +// String returnString = ""; +// // output the quality map +// returnString += "AlleleCountStats: "; +// //for( int iii = 0; iii < NUM_BINS; iii++ ) { +// // returnString += titvByQuality[iii] + " "; +// //} +// return returnString; +// } +// +// public void incrValue( final double qual, final int alleleCount ) { +// ArrayList list = qualityListMap.get(alleleCount); +// if(list==null) { list = new ArrayList(); } +// list.add(qual); +// qualityListMap.put(alleleCount, list); +// } +// +// public void organizeAlleleCountTables() { +// for( final Integer key 
: qualityListMap.keySet() ) { +// final ArrayList list = qualityListMap.get(key); +// double meanQual = 0.0; +// final double numQuals = (double)list.size(); +// for( Double qual : list ) { +// meanQual += qual / numQuals; +// } +// qualityMap.put(key, meanQual); +// } +// } +// } +// +// //public VariantQualityScore(VariantEvalWalker parent) { +// //super(parent); +// //} +// +// public String getName() { +// return "VariantQualityScore"; +// } +// +// public int getComparisonOrder() { +// return 1; // we only need to see each eval track +// } +// +// public String toString() { +// return getName(); +// } +// +// public String update1(VariantContext eval, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { +// final String interesting = null; +// +// if( eval != null && eval.isSNP() && eval.isBiallelic() && eval.isPolymorphicInSamples() ) { //BUGBUG: only counting biallelic sites (revisit what to do with triallelic sites) +// if( titvStats == null ) { titvStats = new TiTvStats(); } +// titvStats.incrValue(eval.getPhredScaledQual(), VariantContextUtils.isTransition(eval)); +// +// if( alleleCountStats == null ) { alleleCountStats = new AlleleCountStats(); } +// int alternateAlleleCount = 0; +// for (final Allele a : eval.getAlternateAlleles()) { +// alternateAlleleCount += eval.getCalledChrCount(a); +// } +// alleleCountStats.incrValue(eval.getPhredScaledQual(), alternateAlleleCount); +// } +// +// return interesting; // This module doesn't capture any interesting sites, so return null +// } +// +// public void finalizeEvaluation() { +// if( titvStats != null ) { +// titvStats.organizeTiTvTables(); +// } +// if( alleleCountStats != null ) { +// alleleCountStats.organizeAlleleCountTables(); +// } +// } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantSummary.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantSummary.java 
index 3c7c6f00c..8766bb14e 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantSummary.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantSummary.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011, The Broad Institute + * Copyright (c) 2012, The Broad Institute * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation @@ -33,6 +33,7 @@ import org.broadinstitute.sting.gatk.walkers.varianteval.VariantEvalWalker; import org.broadinstitute.sting.gatk.walkers.varianteval.util.Analysis; import org.broadinstitute.sting.gatk.walkers.varianteval.util.DataPoint; import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.interval.IntervalUtils; @@ -49,7 +50,6 @@ public class VariantSummary extends VariantEvaluator implements StandardEval { /** Indels with size greater than this value are tallied in the CNV column */ private final static int MAX_INDEL_LENGTH = 50; private final static double MIN_CNV_OVERLAP = 0.5; - private VariantEvalWalker walker; public enum Type { SNP, INDEL, CNV @@ -58,39 +58,39 @@ public class VariantSummary extends VariantEvaluator implements StandardEval { Map> knownCNVs = null; // basic counts on various rates found - @DataPoint(description = "Number of samples") + @DataPoint(description = "Number of samples", format = "%d") public long nSamples = 0; - @DataPoint(description = "Number of processed loci") + @DataPoint(description = "Number of processed loci", format = "%d") public long nProcessedLoci = 0; - @DataPoint(description = "Number of SNPs") + @DataPoint(description = "Number of SNPs", format = "%d") public long nSNPs = 0; @DataPoint(description = "Overall TiTv ratio", format = "%.2f") public 
double TiTvRatio = 0; - @DataPoint(description = "SNP Novelty Rate") + @DataPoint(description = "SNP Novelty Rate", format = "%s") public String SNPNoveltyRate = "NA"; - @DataPoint(description = "Mean number of SNPs per individual") + @DataPoint(description = "Mean number of SNPs per individual", format = "%d") public long nSNPsPerSample = 0; @DataPoint(description = "Mean TiTv ratio per individual", format = "%.2f") public double TiTvRatioPerSample = 0; @DataPoint(description = "Mean depth of coverage per sample at SNPs", format = "%.1f") public double SNPDPPerSample = 0; - @DataPoint(description = "Number of Indels") + @DataPoint(description = "Number of Indels", format = "%d") public long nIndels = 0; - @DataPoint(description = "Indel Novelty Rate") + @DataPoint(description = "Indel Novelty Rate", format = "%s") public String IndelNoveltyRate = "NA"; - @DataPoint(description = "Mean number of Indels per individual") + @DataPoint(description = "Mean number of Indels per individual", format = "%d") public long nIndelsPerSample = 0; @DataPoint(description = "Mean depth of coverage per sample at Indels", format = "%.1f") public double IndelDPPerSample = 0; - @DataPoint(description = "Number of SVs") + @DataPoint(description = "Number of SVs", format = "%d") public long nSVs = 0; - @DataPoint(description = "SV Novelty Rate") + @DataPoint(description = "SV Novelty Rate", format = "%s") public String SVNoveltyRate = "NA"; - @DataPoint(description = "Mean number of SVs per individual") + @DataPoint(description = "Mean number of SVs per individual", format = "%d") public long nSVsPerSample = 0; TypeSampleMap allVariantCounts, knownVariantCounts; @@ -126,7 +126,7 @@ public class VariantSummary extends VariantEvaluator implements StandardEval { long sum = 0; int n = 0; for ( final Map.Entry pair : get(type).entrySet() ) { - if ( pair.getKey() != ALL) { + if ( pair.getKey() != ALL) { // truly must be string == n++; sum += pair.getValue(); } @@ -138,7 +138,7 @@ public class 
VariantSummary extends VariantEvaluator implements StandardEval { double sum = 0; int n = 0; for ( final String sample : get(type).keySet() ) { - if ( (allP && sample == ALL) || (!allP && sample != ALL) ) { + if ( (allP && sample == ALL) || (!allP && sample != ALL) ) { // truly must be string == final long num = get(type).get(sample); final long denom = denoms.get(type).get(sample); sum += ratio(num, denom); @@ -152,7 +152,7 @@ public class VariantSummary extends VariantEvaluator implements StandardEval { public void initialize(VariantEvalWalker walker) { - this.walker = walker; + super.initialize(walker); nSamples = walker.getSampleNamesForEvaluation().size(); countsPerSample = new TypeSampleMap(walker.getSampleNamesForEvaluation()); @@ -170,17 +170,11 @@ public class VariantSummary extends VariantEvaluator implements StandardEval { } } - @Override public boolean enabled() { return true; } - public int getComparisonOrder() { return 2; // we only need to see each eval track } - public void update0(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - nProcessedLoci += context.getSkippedBases() + (ref == null ? 
0 : 1); - } - - private final Type getType(VariantContext vc) { + private Type getType(VariantContext vc) { switch (vc.getType()) { case SNP: return Type.SNP; @@ -196,9 +190,9 @@ public class VariantSummary extends VariantEvaluator implements StandardEval { } } - private final boolean overlapsKnownCNV(VariantContext cnv) { + private boolean overlapsKnownCNV(VariantContext cnv) { if ( knownCNVs != null ) { - final GenomeLoc loc = walker.getGenomeLocParser().createGenomeLoc(cnv, true); + final GenomeLoc loc = getWalker().getToolkit().getGenomeLocParser().createGenomeLoc(cnv, true); IntervalTree intervalTree = knownCNVs.get(loc.getContig()); final Iterator> nodeIt = intervalTree.overlappers(loc.getStart(), loc.getStop()); @@ -212,8 +206,9 @@ public class VariantSummary extends VariantEvaluator implements StandardEval { return false; } - public String update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if ( eval == null || eval.isMonomorphicInSamples() ) return null; + public void update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + if ( eval == null || (getWalker().ignoreAC0Sites() && eval.isMonomorphicInSamples()) ) + return; final Type type = getType(eval); @@ -248,19 +243,16 @@ public class VariantSummary extends VariantEvaluator implements StandardEval { depthPerSample.inc(type, g.getSampleName()); } } - - return null; // we don't capture any interesting sites } - private final String noveltyRate(Type type) { + private String noveltyRate(Type type) { final int all = allVariantCounts.all(type); final int known = knownVariantCounts.all(type); - final int novel = all - known; - final double rate = (novel / (1.0 * all)); - return all == 0 ? 
"NA" : String.format("%.2f", rate); + return Utils.formattedNoveltyRate(known, all); } public void finalizeEvaluation() { + nProcessedLoci = getWalker().getnProcessedLoci(); nSNPs = allVariantCounts.all(Type.SNP); nIndels = allVariantCounts.all(Type.INDEL); nSVs = allVariantCounts.all(Type.CNV); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/genotypePhasingEvaluator/GenotypePhasingEvaluator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/genotypePhasingEvaluator/GenotypePhasingEvaluator.java new file mode 100755 index 000000000..500ab8e65 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/genotypePhasingEvaluator/GenotypePhasingEvaluator.java @@ -0,0 +1,361 @@ +//package org.broadinstitute.sting.gatk.walkers.varianteval.evaluators.genotypePhasingEvaluator; +// +//import org.apache.log4j.Logger; +//import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +//import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +//import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +//import org.broadinstitute.sting.gatk.walkers.phasing.AllelePair; +//import org.broadinstitute.sting.gatk.walkers.phasing.ReadBackedPhasingWalker; +//import org.broadinstitute.sting.gatk.walkers.varianteval.VariantEvalWalker; +//import org.broadinstitute.sting.gatk.walkers.varianteval.evaluators.VariantEvaluator; +//import org.broadinstitute.sting.gatk.walkers.varianteval.util.Analysis; +//import org.broadinstitute.sting.gatk.walkers.varianteval.util.DataPoint; +//import org.broadinstitute.sting.gatk.walkers.varianteval.util.EvaluationContext; +//import org.broadinstitute.sting.utils.GenomeLoc; +//import org.broadinstitute.sting.utils.MathUtils; +//import org.broadinstitute.sting.utils.variantcontext.Genotype; +//import org.broadinstitute.sting.utils.variantcontext.GenotypesContext; +//import 
org.broadinstitute.sting.utils.variantcontext.VariantContext; +// +//import java.util.HashMap; +//import java.util.HashSet; +//import java.util.Set; +// +///* +// * Copyright (c) 2010 The Broad Institute +// * +// * Permission is hereby granted, free of charge, to any person +// * obtaining a copy of this software and associated documentation +// * files (the "Software"), to deal in the Software without +// * restriction, including without limitation the rights to use, +// * copy, modify, merge, publish, distribute, sublicense, and/or sell +// * copies of the Software, and to permit persons to whom the +// * Software is furnished to do so, subject to the following +// * conditions: +// * +// * The above copyright notice and this permission notice shall be +// * included in all copies or substantial portions of the Software. +// * +// * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +// * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+// */ +// +//@Analysis(name = "Genotype Phasing Evaluation", description = "Evaluates the phasing of genotypes in different tracks") +//public class GenotypePhasingEvaluator extends VariantEvaluator { +// protected final static Logger logger = Logger.getLogger(GenotypePhasingEvaluator.class); +// +// // a mapping from sample to stats +// @DataPoint(description = "the phasing statistics for each sample") +// public SamplePhasingStatistics samplePhasingStatistics = null; +// +// SamplePreviousGenotypes samplePrevGenotypes = null; +// +// double minPhaseQuality = 10.0; +// +// public void initialize(VariantEvalWalker walker) { +// super.initialize(walker); +// this.samplePhasingStatistics = new SamplePhasingStatistics(walker.getMinPhaseQuality()); +// this.samplePrevGenotypes = new SamplePreviousGenotypes(); +// } +// +// public String getName() { +// return "GenotypePhasingEvaluator"; +// } +// +// public int getComparisonOrder() { +// return 2; // we only need to see pairs of (comp, eval) +// } +// +// public String toString() { +// return getName() + ":
"; +// } +// +// public void update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { +// update2(eval,comp,tracker,ref,context,null); +// } +// +// public String update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context, EvaluationContext group) { +// //public String update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context, VariantEvalWalker.EvaluationContext group) { +// Reasons interesting = new Reasons(); +// if (ref == null) +// return interesting.toString(); +// GenomeLoc curLocus = ref.getLocus(); +// +// logger.debug("update2() locus: " + curLocus); +// logger.debug("comp = " + comp + " eval = " + eval); +// +// Set allSamples = new HashSet(); +// +// GenotypesContext compSampGenotypes = null; +// if (isRelevantToPhasing(comp)) { +// allSamples.addAll(comp.getSampleNames()); +// compSampGenotypes = comp.getGenotypes(); +// } +// +// GenotypesContext evalSampGenotypes = null; +// if (isRelevantToPhasing(eval)) { +// allSamples.addAll(eval.getSampleNames()); +// evalSampGenotypes = eval.getGenotypes(); +// } +// +// for (String samp : allSamples) { +// logger.debug("sample = " + samp); +// +// Genotype compSampGt = null; +// if (compSampGenotypes != null) +// compSampGt = compSampGenotypes.get(samp); +// +// Genotype evalSampGt = null; +// if (evalSampGenotypes != null) +// evalSampGt = evalSampGenotypes.get(samp); +// +// if (compSampGt == null || evalSampGt == null || compSampGt.isNoCall() || evalSampGt.isNoCall()) { // Since either comp or eval (or both) are missing the site, the best we can do is hope to preserve phase [if the non-missing one preserves phase] +// // Having an unphased site breaks the phasing for the sample [does NOT permit "transitive phasing"] - hence, must reset phasing knowledge for both comp and eval [put a null CompEvalGenotypes]: 
+// if (isNonNullButUnphased(compSampGt) || isNonNullButUnphased(evalSampGt)) +// samplePrevGenotypes.put(samp, null); +// } +// else { // Both comp and eval have a non-null Genotype at this site: +// AllelePair compAllelePair = new AllelePair(compSampGt); +// AllelePair evalAllelePair = new AllelePair(evalSampGt); +// +// boolean breakPhasing = false; +// if (compSampGt.isHet() != evalSampGt.isHet() || compSampGt.isHom() != evalSampGt.isHom()) +// breakPhasing = true; // since they are not both het or both hom +// else { // both are het, or both are hom: +// boolean topMatchesTopAndBottomMatchesBottom = (topMatchesTop(compAllelePair, evalAllelePair) && bottomMatchesBottom(compAllelePair, evalAllelePair)); +// boolean topMatchesBottomAndBottomMatchesTop = (topMatchesBottom(compAllelePair, evalAllelePair) && bottomMatchesTop(compAllelePair, evalAllelePair)); +// if (!topMatchesTopAndBottomMatchesBottom && !topMatchesBottomAndBottomMatchesTop) +// breakPhasing = true; // since the 2 VCFs have different diploid genotypes for this sample +// } +// +// if (breakPhasing) { +// samplePrevGenotypes.put(samp, null); // nothing to do for this site, AND must remove any history for the future +// } +// else if (compSampGt.isHet() && evalSampGt.isHet()) { +// /* comp and eval have the HET same Genotype at this site: +// [Note that if both are hom, then nothing is done here, but the het history IS preserved]. 
+// */ +// CompEvalGenotypes prevCompAndEval = samplePrevGenotypes.get(samp); +// if (prevCompAndEval != null && !prevCompAndEval.getLocus().onSameContig(curLocus)) // exclude curLocus if it is "phased" relative to a different chromosome +// prevCompAndEval = null; +// +// // Replace the previous hets with the current hets: +// samplePrevGenotypes.put(samp, curLocus, compSampGt, evalSampGt); +// +// if (prevCompAndEval != null) { +// GenomeLoc prevLocus = prevCompAndEval.getLocus(); +// logger.debug("Potentially phaseable het locus: " + curLocus + " [relative to previous het locus: " + prevLocus + "]"); +// PhaseStats ps = samplePhasingStatistics.ensureSampleStats(samp); +// +// boolean compSampIsPhased = genotypesArePhasedAboveThreshold(compSampGt); +// boolean evalSampIsPhased = genotypesArePhasedAboveThreshold(evalSampGt); +// if (compSampIsPhased || evalSampIsPhased) { +// if (!evalSampIsPhased) { +// ps.onlyCompPhased++; +// //interesting.addReason("ONLY_COMP", samp, group, prevLocus, ""); +// } +// else if (!compSampIsPhased) { +// ps.onlyEvalPhased++; +// //interesting.addReason("ONLY_EVAL", samp, group, prevLocus, ""); +// } +// else { // both comp and eval are phased: +// AllelePair prevCompAllelePair = new AllelePair(prevCompAndEval.getCompGenotpye()); +// AllelePair prevEvalAllelePair = new AllelePair(prevCompAndEval.getEvalGenotype()); +// +// // Sufficient to check only the top of comp, since we ensured that comp and eval have the same diploid genotypes for this sample: +// boolean topsMatch = (topMatchesTop(prevCompAllelePair, prevEvalAllelePair) && topMatchesTop(compAllelePair, evalAllelePair)); +// boolean topMatchesBottom = (topMatchesBottom(prevCompAllelePair, prevEvalAllelePair) && topMatchesBottom(compAllelePair, evalAllelePair)); +// +// if (topsMatch || topMatchesBottom) { +// ps.phasesAgree++; +// +// Double compPQ = getPQ(compSampGt); +// Double evalPQ = getPQ(evalSampGt); +// if (compPQ != null && evalPQ != null && 
MathUtils.compareDoubles(compPQ, evalPQ) != 0) { +// //interesting.addReason("PQ_CHANGE", samp, group, prevLocus, compPQ + " -> " + evalPQ); +// } +// } +// else { +// ps.phasesDisagree++; +// logger.debug("SWITCHED locus: " + curLocus); +// //interesting.addReason("SWITCH", samp, group, prevLocus, toString(prevCompAllelePair, compAllelePair) + " -> " + toString(prevEvalAllelePair, evalAllelePair)); +// } +// } +// } +// else { +// ps.neitherPhased++; +// } +// } +// } +// } +// } +// logger.debug("\n" + samplePhasingStatistics + "\n"); +// +// return interesting.toString(); +// } +// +// public static boolean isRelevantToPhasing(VariantContext vc) { +// return (vc != null && !vc.isFiltered()); +// } +// +// public boolean isNonNullButUnphased(Genotype gt) { +// return (gt != null && !gt.isNoCall() && !genotypesArePhasedAboveThreshold(gt)); +// } +// +// public boolean genotypesArePhasedAboveThreshold(Genotype gt) { +// if (gt.isHom()) // Can always consider a hom site to be phased to its predecessor, since its successor will only be phased to it if it's hom or "truly" phased +// return true; +// +// if (!gt.isPhased()) +// return false; +// +// Double pq = getPQ(gt); +// return (pq == null || pq >= minPhaseQuality); +// } +// +// public static Double getPQ(Genotype gt) { +// Double d = gt.getAttributeAsDouble(ReadBackedPhasingWalker.PQ_KEY, -1); +// return d == -1 ? 
null : d; +// } +// +// public static boolean topMatchesTop(AllelePair b1, AllelePair b2) { +// return b1.getTopAllele().equals(b2.getTopAllele()); +// } +// +// public static boolean topMatchesBottom(AllelePair b1, AllelePair b2) { +// return b1.getTopAllele().equals(b2.getBottomAllele()); +// } +// +// public static boolean bottomMatchesTop(AllelePair b1, AllelePair b2) { +// return topMatchesBottom(b2, b1); +// } +// +// public static boolean bottomMatchesBottom(AllelePair b1, AllelePair b2) { +// return b1.getBottomAllele().equals(b2.getBottomAllele()); +// } +// +// public String toString(AllelePair prev, AllelePair cur) { +// return prev.getTopAllele().getBaseString() + "+" + cur.getTopAllele().getBaseString() + "|" + prev.getBottomAllele().getBaseString() + "+" + cur.getBottomAllele().getBaseString(); +// } +// +// public void finalizeEvaluation() { +// } +// +// private static class Reasons { +// private StringBuilder sb; +// +// public Reasons() { +// sb = new StringBuilder(); +// } +// +//// public void addReason(String category, String sample, VariantEvalWalker.EvaluationContext evalGroup, GenomeLoc prevLoc, String reason) { +//// sb.append(category + "(" + sample + ", previous: " + prevLoc + " [" + evalGroup.compTrackName + ", " + evalGroup.evalTrackName + "]): " + reason + ";"); +//// } +// +// public String toString() { +// if (sb.length() == 0) +// return null; +// +// return "reasons=" + sb.toString(); +// } +// } +//} +// +//class CompEvalGenotypes { +// private GenomeLoc loc; +// private Genotype compGt; +// private Genotype evalGt; +// +// public CompEvalGenotypes(GenomeLoc loc, Genotype compGt, Genotype evalGt) { +// this.loc = loc; +// this.compGt = compGt; +// this.evalGt = evalGt; +// } +// +// public GenomeLoc getLocus() { +// return loc; +// } +// +// public Genotype getCompGenotpye() { +// return compGt; +// } +// public Genotype getEvalGenotype() { +// return evalGt; +// } +//} +// +//class SamplePreviousGenotypes { +// private HashMap 
sampleGenotypes = null; +// +// public SamplePreviousGenotypes() { +// this.sampleGenotypes = new HashMap(); +// } +// +// public CompEvalGenotypes get(String sample) { +// return sampleGenotypes.get(sample); +// } +// +// public void put(String sample, CompEvalGenotypes compEvalGts) { +// sampleGenotypes.put(sample, compEvalGts); +// } +// +// public void put(String sample, GenomeLoc locus, Genotype compGt, Genotype evalGt) { +// sampleGenotypes.put(sample, new CompEvalGenotypes(locus, compGt, evalGt)); +// } +//} +// +//class PhaseStats { +// public int neitherPhased; +// public int onlyCompPhased; +// public int onlyEvalPhased; +// public int phasesAgree; +// public int phasesDisagree; +// +// public PhaseStats() { +// this.neitherPhased = 0; +// this.onlyCompPhased = 0; +// this.onlyEvalPhased = 0; +// this.phasesAgree = 0; +// this.phasesDisagree = 0; +// } +// +// public String toString() { +// StringBuilder sb = new StringBuilder(); +// sb.append("Neither phased: " + neitherPhased + "\tOnly Comp: " + onlyCompPhased + "\tOnly Eval: " + onlyEvalPhased + "\tSame phase: " + phasesAgree + "\tOpposite phase: " + phasesDisagree); +// return sb.toString(); +// } +// +// public static String[] getFieldNamesArray() { +// return new String[]{"total", "neither", "only_comp", "only_eval", "both", "match", "switch", "switch_rate"}; +// } +// +// public Object getField(int index) { +// switch (index) { +// case (0): +// return (neitherPhased + onlyCompPhased + onlyEvalPhased + phasesAgree + phasesDisagree); +// case (1): +// return neitherPhased; +// case (2): +// return onlyCompPhased; +// case (3): +// return onlyEvalPhased; +// case (4): +// return (phasesAgree + phasesDisagree); +// case (5): +// return phasesAgree; +// case (6): +// return phasesDisagree; +// case (7): +// return ((phasesDisagree == 0) ? 
0 : ((double) phasesDisagree) / (phasesAgree + phasesDisagree)); +// default: +// return -1; +// } +// } +//} +// diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/genotypePhasingEvaluator/SamplePhasingStatistics.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/genotypePhasingEvaluator/SamplePhasingStatistics.java new file mode 100644 index 000000000..6b81ce14c --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/genotypePhasingEvaluator/SamplePhasingStatistics.java @@ -0,0 +1,89 @@ +///* +// * Copyright (c) 2012, The Broad Institute +// * +// * Permission is hereby granted, free of charge, to any person +// * obtaining a copy of this software and associated documentation +// * files (the "Software"), to deal in the Software without +// * restriction, including without limitation the rights to use, +// * copy, modify, merge, publish, distribute, sublicense, and/or sell +// * copies of the Software, and to permit persons to whom the +// * Software is furnished to do so, subject to the following +// * conditions: +// * +// * The above copyright notice and this permission notice shall be +// * included in all copies or substantial portions of the Software. +// * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// * OTHER DEALINGS IN THE SOFTWARE. 
+// */ +// +//package org.broadinstitute.sting.gatk.walkers.varianteval.evaluators.genotypePhasingEvaluator; +// +//import org.broadinstitute.sting.gatk.walkers.varianteval.util.TableType; +// +//import java.util.HashMap; +//import java.util.Map; +// +///** +// * a table of sample names to genotype phasing statistics +// */ +//class SamplePhasingStatistics extends TableType { +// private HashMap sampleStats = null; +// private double minPhaseQuality; +// +// public SamplePhasingStatistics(double minPhaseQuality) { +// this.sampleStats = new HashMap(); +// this.minPhaseQuality = minPhaseQuality; +// } +// +// public PhaseStats ensureSampleStats(String samp) { +// PhaseStats ps = sampleStats.get(samp); +// if (ps == null) { +// ps = new PhaseStats(); +// sampleStats.put(samp, ps); +// } +// return ps; +// } +// +// /** +// * @return one row per sample +// */ +// public String[] getRowKeys() { +// return sampleStats.keySet().toArray(new String[sampleStats.size()]); +// } +// +// /** +// * get the column keys +// * +// * @return a list of objects, in this case strings, that are the column names +// */ +// public String[] getColumnKeys() { +// return PhaseStats.getFieldNamesArray(); +// } +// +// public Object getCell(int x, int y) { +// String[] rowKeys = getRowKeys(); +// PhaseStats ps = sampleStats.get(rowKeys[x]); +// return ps.getField(y); +// } +// +// public String getName() { +// return "Sample Phasing Statistics (for PQ >= " + minPhaseQuality + ")"; +// } +// +// public String toString() { +// StringBuilder sb = new StringBuilder(); +// for (Map.Entry sampPhaseStatsEnt : sampleStats.entrySet()) { +// String sample = sampPhaseStatsEnt.getKey(); +// PhaseStats ps = sampPhaseStatsEnt.getValue(); +// +// sb.append(sample + "\t" + ps); +// } +// return sb.toString(); +// } +//} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java index 2f342e120..7a3b85567 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java @@ -3,12 +3,13 @@ package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications; import org.broadinstitute.sting.commandline.RodBinding; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.varianteval.evaluators.VariantEvaluator; +import org.broadinstitute.sting.gatk.walkers.varianteval.evaluators.VariantSummary; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import java.util.ArrayList; -import java.util.List; +import java.util.*; /** * Stratifies the eval RODs by the allele count of the alternate allele @@ -19,10 +20,8 @@ import java.util.List; public class AlleleCount extends VariantStratifier { @Override public void initialize() { - List> evals = getVariantEvalWalker().getEvals(); - // we can only work with a single eval VCF, and it must have genotypes - if ( evals.size() != 1 ) + if ( getVariantEvalWalker().getEvals().size() != 1 && !getVariantEvalWalker().mergeEvals ) throw new UserException.BadArgumentValue("AlleleCount", "AlleleCount stratification only works with a single eval vcf"); // There are 2 x n sample chromosomes for diploids @@ -32,28 +31,36 @@ public class AlleleCount extends VariantStratifier { // create an array containing each of the allele counts for( int ac = 0; ac <= nchrom; ac++ ) { - states.add(String.format("%d", ac)); + states.add(ac); } getVariantEvalWalker().getLogger().info("AlleleCount using " + nchrom + " 
chromosomes"); } - public List getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) { - ArrayList relevantStates = new ArrayList(1); - + public List getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) { if (eval != null) { - int AC = -1; + int AC = 0; // by default, the site is considered monomorphic + if ( eval.hasAttribute("AC") && eval.getAttribute("AC") instanceof Integer ) { AC = eval.getAttributeAsInt("AC", 0); } else if ( eval.isVariant() ) { for (Allele allele : eval.getAlternateAlleles()) AC = Math.max(AC, eval.getCalledChrCount(allele)); - } else - // by default, the site is considered monomorphic - AC = 0; - relevantStates.add(String.format("%d", AC)); - } + } - return relevantStates; + return Collections.singletonList((Object) AC); + } else { + return Collections.emptyList(); + } + } + + @Override + public Set> getIncompatibleEvaluators() { + return new HashSet>(Arrays.asList(VariantSummary.class)); + } + + @Override + public String getFormat() { + return "%d"; } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleFrequency.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleFrequency.java index cd2b8e475..817663026 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleFrequency.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleFrequency.java @@ -6,6 +6,7 @@ import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.ArrayList; +import java.util.Collections; import java.util.List; /** @@ -17,23 +18,20 @@ import java.util.List; public class AlleleFrequency extends 
VariantStratifier { @Override public void initialize() { - states = new ArrayList(); for( double a = 0.000; a <= 1.005; a += 0.005 ) { states.add(String.format("%.3f", a)); } } - public List getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) { - ArrayList relevantStates = new ArrayList(); - + public List getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) { if (eval != null) { try { - relevantStates.add(String.format("%.3f", (5.0 * MathUtils.round(eval.getAttributeAsDouble("AF", 0.0) / 5.0, 3)))); + return Collections.singletonList((Object)String.format("%.3f", (5.0 * MathUtils.round(eval.getAttributeAsDouble("AF", 0.0) / 5.0, 3)))); } catch (Exception e) { - return relevantStates; + return Collections.emptyList(); } } - return relevantStates; + return Collections.emptyList(); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/CompRod.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/CompRod.java index 1f31ebfa7..1274028d7 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/CompRod.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/CompRod.java @@ -6,6 +6,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.ArrayList; +import java.util.Collections; import java.util.List; @@ -15,16 +16,12 @@ import java.util.List; public class CompRod extends VariantStratifier implements RequiredStratification { @Override public void initialize() { - for ( RodBinding rod : getVariantEvalWalker().getComps() ) + for ( RodBinding rod : getVariantEvalWalker().getComps() ) { 
states.add(rod.getName()); + } } - - public List getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) { - ArrayList relevantStates = new ArrayList(); - - relevantStates.add(compName); - - return relevantStates; + public List getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) { + return Collections.singletonList((Object)compName); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Contig.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Contig.java index c45a73231..328bab1db 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Contig.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Contig.java @@ -5,6 +5,8 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; import java.util.List; /** @@ -17,14 +19,12 @@ public class Contig extends VariantStratifier { states.add("all"); } - public List getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) { - ArrayList relevantStates = new ArrayList(); - + @Override + public List getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) { if (eval != null) { - relevantStates.add("all"); - relevantStates.add(eval.getChr()); + return Arrays.asList((Object)"all", eval.getChr()); + } else { + return Collections.emptyList(); } - - return relevantStates; } } diff --git 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/CpG.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/CpG.java index 539cd21ef..7536b0237 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/CpG.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/CpG.java @@ -27,7 +27,8 @@ public class CpG extends VariantStratifier { states.add("non_CpG"); } - public List getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) { + @Override + public List getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) { boolean isCpG = false; if (ref != null && ref.getBases() != null) { String fwRefBases = new String(ref.getBases()); @@ -41,7 +42,7 @@ public class CpG extends VariantStratifier { } } - ArrayList relevantStates = new ArrayList(); + ArrayList relevantStates = new ArrayList(2); relevantStates.add("all"); relevantStates.add(isCpG ? 
"CpG" : "non_CpG"); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Degeneracy.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Degeneracy.java index 91c96e490..eab59864f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Degeneracy.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Degeneracy.java @@ -17,7 +17,6 @@ public class Degeneracy extends VariantStratifier { @Override public void initialize() { - states = new ArrayList(); states.add("1-fold"); states.add("2-fold"); states.add("3-fold"); @@ -79,8 +78,8 @@ public class Degeneracy extends VariantStratifier { } } - public List getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) { - ArrayList relevantStates = new ArrayList(); + public List getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) { + ArrayList relevantStates = new ArrayList(); relevantStates.add("all"); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/DynamicStratification.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/DynamicStratification.java new file mode 100644 index 000000000..21255f7b3 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/DynamicStratification.java @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, 
distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications; + +import org.broadinstitute.sting.utils.variantcontext.VariantContext; + +import java.util.List; +import java.util.Map; +import java.util.Set; + +/** + * Tag this stratification as dynamically determining the final strat based on the input data + * + * The paradigm here is simple. We upfront create a strat with N states that reflect the finest grained + * possible division of the data. The data is processed, and statistics collected for each of the N states. + * An update call is made to the stratification for evaluation VariantContext during each map call, + * allowing the strat to collect data about the usage of each state. A final call requests that + * the stratification map down the N states into M states (typically less than N, not necessarily + * a subset of N). This is provided by returning a map from each of M state -> N states and + * the VariantEval walker will combine all of the evaluations for N into a single value for + * each M. + * + * For example, suppose I have a dynamic strat called AC, adopting 7 possible values 0,1,2,3,4,5,6. 
This + * strat tracks the number of eval vcs for each state, with final counts 0=1, 1=100, 2=10, 3=5, 4=3, 5=2, 6=1. + * The stratification attempts to combine the strats down so that each state has approximately the same + * fraction of the data in each bin. Overall there are 1+100+10+5+3+2+1=122 observations and 7 bins so we really + * want ~ 17 observations in each bin. So we merge 3-6 with 5+3+2+1 = 11 and keep 2, 1, and 0 as distinct bins. We + * return a map from 0 -> 0, 1 -> 1, 2 -> 2, 3-6 -> {3,4,5,6}. + * + * TODO - some open implementation questions + * -- We should only create one stratifier overall. How do we track this? When we create the stratifiers + * perhaps we can look at them and create a tracker? + * -- How do we create a new stratifier based on the finalStratifications() given the framework? Conceptually + * this new thing is itself a stratifier, just like before, but its states are determined at the end. We'd + * then like to call not getRelevantStates but a different function that accepts an old state and returns + * the new state. 
Perhaps the process should look like: + * finalizeStratification -> new Stratifier whose states are the final ones + * getNewState(old state) -> new state (one of those in getFinalStratification) + * + * @author Mark DePristo + * @since 4/9/12 + */ +public interface DynamicStratification { + public void update(final VariantContext eval); + public VariantStratifier finalizeStratification(); + public Object getFinalState(final Object oldState); +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/EvalRod.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/EvalRod.java index b2b6d4165..6328d6a51 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/EvalRod.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/EvalRod.java @@ -15,7 +15,6 @@ import java.util.List; public class EvalRod extends VariantStratifier implements RequiredStratification { @Override public void initialize() { - states = new ArrayList(); for ( RodBinding rod : getVariantEvalWalker().getEvals() ) { states.add(rod.getName()); if ( getVariantEvalWalker().mergeEvals ) @@ -23,11 +22,7 @@ public class EvalRod extends VariantStratifier implements RequiredStratification } } - public List getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) { - ArrayList relevantStates = new ArrayList(); - - relevantStates.add(evalName); - - return relevantStates; + public List getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) { + return Arrays.asList((Object)evalName); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Filter.java 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Filter.java index aacfae993..278ced713 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Filter.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Filter.java @@ -18,8 +18,8 @@ public class Filter extends VariantStratifier { states.add("raw"); } - public List getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) { - ArrayList relevantStates = new ArrayList(); + public List getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) { + ArrayList relevantStates = new ArrayList(); relevantStates.add("raw"); if (eval != null) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/FunctionalClass.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/FunctionalClass.java index f5dcf527a..330451fff 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/FunctionalClass.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/FunctionalClass.java @@ -28,8 +28,8 @@ public class FunctionalClass extends VariantStratifier { } -public List getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) { - ArrayList relevantStates = new ArrayList(); + public List getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) { + ArrayList relevantStates = new ArrayList(); relevantStates.add("all"); @@ -52,8 +52,8 @@ public List 
getRelevantStates(ReferenceContext ref, RefMetaDataTracker t try { FunctionalType newType = FunctionalType.valueOf(newtypeStr); if ( type == null || - ( type == FunctionalType.silent && newType != FunctionalType.silent ) || - ( type == FunctionalType.missense && newType == FunctionalType.nonsense ) ) { + ( type == FunctionalType.silent && newType != FunctionalType.silent ) || + ( type == FunctionalType.missense && newType == FunctionalType.nonsense ) ) { type = newType; } } catch ( Exception e ) {} // don't error out if the type isn't supported @@ -71,7 +71,7 @@ public List getRelevantStates(ReferenceContext ref, RefMetaDataTracker t type = FunctionalType.missense; else if ( snpEffFunctionalClass == SnpEff.EffectFunctionalClass.SILENT ) type = FunctionalType.silent; - } + } catch ( Exception e ) {} // don't error out if the type isn't supported } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/IndelSize.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/IndelSize.java index 1b9513b9a..01b10c502 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/IndelSize.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/IndelSize.java @@ -2,10 +2,10 @@ package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.ArrayList; +import java.util.Collections; import java.util.List; /** @@ -16,17 +16,15 @@ import java.util.List; */ public class IndelSize extends VariantStratifier { static final int MAX_INDEL_SIZE = 100; + @Override public void initialize() { - states = new ArrayList(); for( int a=-MAX_INDEL_SIZE; a <=MAX_INDEL_SIZE; a++ ) { - 
states.add(String.format("%d", a)); + states.add(a); } } - public List getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) { - ArrayList relevantStates = new ArrayList(); - + public List getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) { if (eval != null && eval.isIndel() && eval.isBiallelic()) { try { int eventLength = 0; @@ -41,12 +39,12 @@ public class IndelSize extends VariantStratifier { else if (eventLength < -MAX_INDEL_SIZE) eventLength = -MAX_INDEL_SIZE; - relevantStates.add(String.format("%d",eventLength)); + return Collections.singletonList((Object)eventLength); } catch (Exception e) { - return relevantStates; + return Collections.emptyList(); } } - return relevantStates; + return Collections.emptyList(); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/IntervalStratification.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/IntervalStratification.java index d91422a7e..4fc381b3f 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/IntervalStratification.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/IntervalStratification.java @@ -26,13 +26,9 @@ package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications; import net.sf.picard.util.IntervalTree; import org.apache.log4j.Logger; -import org.broad.tribble.Feature; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.annotator.SnpEff; import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; -import 
org.broadinstitute.sting.utils.GenomeLocSortedSet; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.interval.IntervalUtils; import org.broadinstitute.sting.utils.variantcontext.VariantContext; @@ -56,6 +52,10 @@ public class IntervalStratification extends VariantStratifier { final protected static Logger logger = Logger.getLogger(IntervalStratification.class); Map> intervalTreeByContig = null; + final List OVERLAPPING = Arrays.asList((Object)"all", (Object)"overlaps.intervals"); + final List NOT_OVERLAPPING = Arrays.asList((Object)"all", (Object)"outside.intervals"); + + @Override public void initialize() { if ( getVariantEvalWalker().intervalsFile == null ) @@ -71,20 +71,21 @@ public class IntervalStratification extends VariantStratifier { logger.info(String.format("Creating IntervalStratification %s containing %d intervals covering %d bp", getVariantEvalWalker().intervalsFile.getSource(), locs.size(), IntervalUtils.intervalSize(locs))); - states = new ArrayList(Arrays.asList("all", "overlaps.intervals", "outside.intervals")); + states.addAll(Arrays.asList("all", "overlaps.intervals", "outside.intervals")); } - public List getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) { - final ArrayList relevantStates = new ArrayList(Arrays.asList("all")); - + public List getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) { if (eval != null) { - final GenomeLoc loc = getVariantEvalWalker().getGenomeLocParser().createGenomeLoc(eval, true); + final GenomeLoc loc = getVariantEvalWalker().getToolkit().getGenomeLocParser().createGenomeLoc(eval, true); IntervalTree intervalTree = intervalTreeByContig.get(loc.getContig()); IntervalTree.Node node = intervalTree.minOverlapper(loc.getStart(), loc.getStop()); 
//logger.info(String.format("Overlap %s found %s", loc, node)); - relevantStates.add( node != null ? "overlaps.intervals" : "outside.intervals"); + if ( node != null ) + return OVERLAPPING; + else + return NOT_OVERLAPPING; } - return relevantStates; + return Collections.emptyList(); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/JexlExpression.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/JexlExpression.java index c0cab4534..dc5438358 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/JexlExpression.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/JexlExpression.java @@ -29,8 +29,8 @@ public class JexlExpression extends VariantStratifier implements StandardStratif } } - public List getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) { - ArrayList relevantStates = new ArrayList(); + public List getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) { + ArrayList relevantStates = new ArrayList(); relevantStates.add("none"); for ( SortableJexlVCMatchExp jexlExpression : jexlExpressions ) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Novelty.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Novelty.java index 77d98d33b..693bdf198 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Novelty.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Novelty.java @@ -14,24 +14,26 @@ public class Novelty extends VariantStratifier implements StandardStratification // needs the variant contexts and known names private 
List> knowns; + private final static List KNOWN_STATES = Arrays.asList((Object)"all", (Object)"known"); + private final static List NOVEL_STATES = Arrays.asList((Object)"all", (Object)"novel"); @Override public void initialize() { - states = new ArrayList(Arrays.asList("all", "known", "novel")); + states.addAll(Arrays.asList("all", "known", "novel")); knowns = getVariantEvalWalker().getKnowns(); } - public List getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) { + public List getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) { if (tracker != null && eval != null) { final Collection knownComps = tracker.getValues(knowns, ref.getLocus()); for ( final VariantContext c : knownComps ) { // loop over sites, looking for something that matches the type eval if ( eval.getType() == c.getType() ) { - return Arrays.asList("all", "known"); + return KNOWN_STATES; } } - } - - return Arrays.asList("all", "novel"); + } + + return NOVEL_STATES; } -} +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/OneBPIndel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/OneBPIndel.java new file mode 100644 index 000000000..65633bc2b --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/OneBPIndel.java @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the 
Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications; + +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; + +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +/** + * Stratifies the eval RODs into sites where the indel is 1 bp in length and those where the event is 2+. + * all non indel events go into all bins, so that SNP counts can be used as contrasts in eval modules. 
+ */ +public class OneBPIndel extends VariantStratifier { + private final static List ALL = Arrays.asList((Object)"all", (Object)"one.bp", (Object)"two.plus.bp"); + private final static List ONE_BP = Arrays.asList((Object)"all", (Object)"one.bp"); + private final static List TWO_PLUS_BP = Arrays.asList((Object)"all", (Object)"two.plus.bp"); + + @Override + public void initialize() { + states.addAll(ALL); + } + + @Override + public List getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) { + if (eval != null && eval.isIndel()) { + for ( int l : eval.getIndelLengths() ) + if ( Math.abs(l) > 1 ) + return TWO_PLUS_BP; // someone is too long + return ONE_BP; // all lengths are one + } else + return ALL; + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Sample.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Sample.java index c697b5b7a..621f4337f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Sample.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Sample.java @@ -2,10 +2,11 @@ package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.varianteval.evaluators.VariantEvaluator; +import org.broadinstitute.sting.gatk.walkers.varianteval.evaluators.VariantSummary; import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import java.util.Arrays; -import java.util.List; +import java.util.*; /** * Stratifies the eval RODs by each sample in the eval ROD. 
@@ -20,7 +21,12 @@ public class Sample extends VariantStratifier { states.addAll(getVariantEvalWalker().getSampleNamesForStratification()); } - public List getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) { - return Arrays.asList(sampleName); + public List getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) { + return Collections.singletonList((Object) sampleName); + } + + @Override + public Set> getIncompatibleEvaluators() { + return new HashSet>(Arrays.asList(VariantSummary.class)); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/TandemRepeat.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/TandemRepeat.java new file mode 100644 index 000000000..834c02b83 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/TandemRepeat.java @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications; + +import org.broad.tribble.util.ParsingUtils; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; + +import java.util.Arrays; +import java.util.List; + +/** + * Stratifies the eval RODs into sites that are tandem repeats + */ +public class TandemRepeat extends VariantStratifier { + private final static List JUST_ALL = Arrays.asList((Object)"all"); + private final static List ALL = Arrays.asList((Object)"all", (Object)"is.repeat", (Object)"not.repeat"); + private final static List REPEAT = Arrays.asList((Object)"all", (Object)"is.repeat"); + private final static List NOT_REPEAT = Arrays.asList((Object)"all", (Object)"not.repeat"); + + @Override + public void initialize() { + states.addAll(ALL); + } + + @Override + public List getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) { + if ( eval == null || ! 
eval.isIndel() ) + return ALL; + else if ( VariantContextUtils.isTandemRepeat(eval, ref.getForwardBases()) ) { + print("REPEAT", eval, ref); + return REPEAT; + } else { + print("NOT A REPEAT", eval, ref); + return NOT_REPEAT; + } + } + + private final void print(String prefix, VariantContext eval, ReferenceContext ref) { +// String alleles = ParsingUtils.sortList(eval.getAlleles()).toString(); +// this.getVariantEvalWalker().getLogger().info(prefix + ": " + "pos=" + eval.getStart() + " alleles=" + alleles + " ref=" + new String(ref.getForwardBases())); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/VariantStratifier.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/VariantStratifier.java index 119a1b83f..07ba424a2 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/VariantStratifier.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/VariantStratifier.java @@ -3,25 +3,44 @@ package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.varianteval.VariantEvalWalker; +import org.broadinstitute.sting.gatk.walkers.varianteval.evaluators.VariantEvaluator; +import org.broadinstitute.sting.gatk.walkers.varianteval.stratifications.manager.Stratifier; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.ArrayList; -import java.util.Arrays; +import java.util.Collections; import java.util.List; +import java.util.Set; -public abstract class VariantStratifier implements Comparable { +public abstract class VariantStratifier implements Comparable, Stratifier { private VariantEvalWalker variantEvalWalker; final private String name; - protected ArrayList states = new ArrayList(); + final 
protected ArrayList states = new ArrayList(); protected VariantStratifier() { name = this.getClass().getSimpleName(); } + // ------------------------------------------------------------------------------------- + // + // to be overloaded + // + // ------------------------------------------------------------------------------------- + + public abstract void initialize(); + + public abstract List getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName); + + // ------------------------------------------------------------------------------------- + // + // final capabilities + // + // ------------------------------------------------------------------------------------- + /** * @return a reference to the parent VariantEvalWalker running this stratification */ - public VariantEvalWalker getVariantEvalWalker() { + public final VariantEvalWalker getVariantEvalWalker() { return variantEvalWalker; } @@ -29,25 +48,38 @@ public abstract class VariantStratifier implements Comparable * Should only be called by VariantEvalWalker itself * @param variantEvalWalker */ - public void setVariantEvalWalker(VariantEvalWalker variantEvalWalker) { + public final void setVariantEvalWalker(VariantEvalWalker variantEvalWalker) { this.variantEvalWalker = variantEvalWalker; } - public abstract void initialize(); - - public List getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) { - return null; + public final int compareTo(VariantStratifier o1) { + return this.getName().compareTo(o1.getName()); } - public int compareTo(VariantStratifier o1) { - return this.getName().compareTo(o1.getName()); + @Override + public String toString() { + return getName(); } public final String getName() { return name; } - - public ArrayList getAllStates() { + + public String getFormat() { return "%s"; 
} + + public final ArrayList getAllStates() { return states; } + + + /** + * The way for a stratifier to specify that it's incompatible with specific evaluations. For + * example, VariantSummary includes a per-sample metric, and so cannot be used safely with Sample + * or AlleleCount stratifications as this introduces an O(n^2) memory and cpu cost. + * + * @return the set of VariantEvaluators that cannot be active with this Stratification + */ + public Set> getIncompatibleEvaluators() { + return Collections.emptySet(); + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/VariantType.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/VariantType.java index 7d25498a5..a9be7c3c0 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/VariantType.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/VariantType.java @@ -38,12 +38,10 @@ import java.util.List; public class VariantType extends VariantStratifier { @Override public void initialize() { - for ( VariantContext.Type t : VariantContext.Type.values() ) { - states.add(t.toString()); - } + states.addAll(Arrays.asList(VariantContext.Type.values())); } - public List getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) { - return eval == null ? Collections.emptyList() : Arrays.asList(eval.getType().toString()); + public List getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) { + return eval == null ? 
Collections.emptyList() : Collections.singletonList((Object)eval.getType()); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/manager/StratNode.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/manager/StratNode.java new file mode 100644 index 000000000..2bcb20e8e --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/manager/StratNode.java @@ -0,0 +1,165 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications.manager; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Invariant; +import com.google.java.contract.Requires; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.util.*; + +/** + * Helper class representing a tree of stratification splits, where leaf nodes + * are given a unique integer key starting at 0 and incrementing up to the + * number of leaves in the tree. This allows you to use this tree to produce + * a key to map into an array index mapped data structure. + * + * Suppose I have two strats, each with two values: A = 1, 2 and B = 3, 4 + * + * This data structure creates a tree such as: + * + * root -> A -> 1 -> B -> 3 : 0 + * |- B -> 4 : 1 + * |- A -> 2 -> B -> 3 : 2 + * |- B -> 4 : 3 + * + * This code allows us to efficiently look up a state key (A=2, B=3) and map it + * to a specific key (an integer) that's unique over the tree + * + * Note the structure of this tree is that the keys are -1 for all internal nodes, and + * leafs are the only nodes with meaningful keys. So for a tree with 2N nodes N of these + * will be internal, with no keys, and meaningful maps from states -> subtrees.
The + * other N nodes are leafs, with meaningful keys, empty maps, and null stratification objects + * + * @author Mark DePristo + * @since 3/27/12 + */ +@Invariant({ + "(isLeaf() && stratifier == null && subnodes.isEmpty()) || (!isLeaf() && stratifier != null && !subnodes.isEmpty())"}) +class StratNode implements Iterable> { + int key = -1; + final T stratifier; + final Map> subnodes; // NOTE, because we don't iterate our best option is a HashMap + + protected StratNode() { + this.subnodes = Collections.emptyMap(); + this.stratifier = null; + } + + protected StratNode(final T stratifier, final Map> subnodes) { + this.stratifier = stratifier; + // important to reallocate an unmodififable hashmap with this specific size for space and safety + this.subnodes = Collections.unmodifiableMap(new HashMap>(subnodes)); + } + + @Requires("key >= 0") + public void setKey(final int key) { + if ( ! isLeaf() ) + throw new ReviewedStingException("Cannot set key of non-leaf node"); + this.key = key; + } + + @Requires({ + "states != null", + "offset >= 0", + "offset <= states.size()" + }) + public int find(final List states, int offset) { + if ( isLeaf() ) // we're here! + return key; + else { + final Object state = states.get(offset); + StratNode subnode = subnodes.get(state); + if ( subnode == null ) + return -1; + else + return subnode.find(states, offset+1); + } + } + + @Requires({ + "multipleStates != null", + "offset >= 0", + "offset <= multipleStates.size()", + "keys != null", + "offset == multipleStates.size() || multipleStates.get(offset) != null"}) + public void find(final List> multipleStates, final int offset, final HashSet keys) { + if ( isLeaf() ) // we're here! 
+ keys.add(key); + else { + for ( final Object state : multipleStates.get(offset) ) { + // loop over all of the states at this offset + final StratNode subnode = subnodes.get(state); + if ( subnode == null ) + throw new ReviewedStingException("Couldn't find state for " + state + " at node " + this); + else + subnode.find(multipleStates, offset+1, keys); + } + } + } + + @Ensures("result >= 0") + public int getKey() { + if ( ! isLeaf() ) + throw new ReviewedStingException("Cannot get key of non-leaf node"); + else + return key; + } + + protected Map> getSubnodes() { + return subnodes; + } + + @Ensures("result >= 0") + public int size() { + if ( isLeaf() ) + return 1; + else { + return subnodes.values().iterator().next().size() * subnodes.size(); + } + } + + public T getSetOfStates() { + return stratifier; + } + + /** + * @return true if this node is a leaf + */ + public boolean isLeaf() { + return stratifier == null; + } + + /** + * Returns an iterator over this node and all subnodes including internal and leaf nodes + * @return + */ + @Override + @Ensures("result != null") + public Iterator> iterator() { + return new StratNodeIterator(this); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/manager/StratNodeIterator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/manager/StratNodeIterator.java new file mode 100644 index 000000000..3aff4fe27 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/manager/StratNodeIterator.java @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of 
the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications.manager; + +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.util.*; + +/** + * Helper class for creating iterators over all nodes in the stratification tree + * + * @author Mark DePristo + * @since 3/27/12 + */ +class StratNodeIterator implements Iterator> { + Queue>> iterators = new LinkedList>>(); + Iterator> currentIterator; + + StratNodeIterator(final StratNode root) { + currentIterator = Collections.singleton(root).iterator(); + for ( final StratNode subNode : root.subnodes.values() ) + iterators.add(new StratNodeIterator(subNode)); + } + + @Override + public boolean hasNext() { + return currentIterator.hasNext() || ! iterators.isEmpty(); + } + + @Override + public StratNode next() { + if ( currentIterator.hasNext() ) + return currentIterator.next(); + else if ( ! 
iterators.isEmpty() ) { + currentIterator = iterators.poll(); + return next(); + } else { + throw new IllegalStateException("Next called on empty iterator"); + } + } + + @Override + public void remove() { + throw new ReviewedStingException("Cannot remove from StratNode iterator"); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/manager/StratificationManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/manager/StratificationManager.java new file mode 100644 index 000000000..5e8db8107 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/manager/StratificationManager.java @@ -0,0 +1,426 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications.manager; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import org.broadinstitute.sting.gatk.walkers.varianteval.util.EvaluationContext; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.util.*; + +/** + * Represents the full state space of all stratification combinations + * + * @author Mark DePristo + * @since 3/27/12 + */ +public class StratificationManager implements Map, V> { + private final StratNode root; + private final int size; + + private final ArrayList stratifiers; + + // values associated with each key + private final ArrayList valuesByKey; + private final ArrayList> stratifierValuesByKey; + private final ArrayList keyStrings; + + // ------------------------------------------------------------------------------------- + // + // creating the manager + // + // ------------------------------------------------------------------------------------- + + /** + * Create a new StratificationManager with nodes to store data for all combinations + * of the ordered list of strats + * + * @param strats ordered list of stratifications to representation + */ + @Requires("!strats.isEmpty()") + public StratificationManager(final List strats) { + this.stratifiers = new ArrayList(strats); + + // construct and store the full tree of strats + this.root = buildStratificationTree(new LinkedList(strats)); + // assign the linear key ordering to the leafs + assignKeys(root); + + // cache the size, and check for a bad state + this.size = root.size(); + if ( this.size == 0 ) + throw new ReviewedStingException("Size == 0 in StratificationManager"); + + // prepare the assocated data vectors mapping from key -> data + this.valuesByKey = new ArrayList(size()); + this.stratifierValuesByKey = new ArrayList>(size()); + 
this.keyStrings = new ArrayList(size()); + for ( int i = 0; i < size(); i++ ) { + this.valuesByKey.add(null); + this.stratifierValuesByKey.add(null); + this.keyStrings.add(null); + } + + assignStratifierValuesByKey(root); + } + + /** + * Recursive construction helper for main constructor that fills into the + * complete tree of StratNodes. This function returns the complete tree + * suitable for associating data with each combination of keys. Note + * that the tree is not fully complete as the keys are not yet set for + * each node (see assignKeys) + * + * @param strats + * @return + */ + private StratNode buildStratificationTree(final Queue strats) { + final K first = strats.poll(); + if ( first == null ) { + // we are at a leaf + return new StratNode(); + } else { + // we are in the middle of the tree + final Collection states = first.getAllStates(); + + if ( states.isEmpty() ) + throw new ReviewedStingException("State " + first + " is empty!"); + + final LinkedHashMap> subNodes = new LinkedHashMap>(states.size()); + for ( final Object state : states ) { + // have to copy because poll modifies the queue + final Queue copy = new LinkedList(strats); + subNodes.put(state, buildStratificationTree(copy)); + } + return new StratNode(first, subNodes); + } + } + + /** + * Set the key for each leaf from root, in order from 0 to N - 1 for N leaves in the tree + * @param root + */ + @Requires("root == this.root") + private void assignKeys(final StratNode root) { + int key = 0; + for ( final StratNode node : root ) { + if ( node.isLeaf() ) + node.setKey(key++); + } + } + + /** + * Entry point to recursive tool that fills in the list of state values corresponding + * to each key.
After this function is called you can map from key -> List of StateValues + * instead of walking the tree to find the key and reading the list of state values + * + * @param root + */ + private void assignStratifierValuesByKey(final StratNode root) { + assignStratifierValuesByKey(root, new LinkedList()); + + // do a last sanity check that no key has null value after assigning + for ( List stateValues : stratifierValuesByKey ) + if ( stateValues == null ) + throw new ReviewedStingException("Found a null state value set that's null"); + } + + private void assignStratifierValuesByKey(final StratNode node, final LinkedList states) { + if ( node.isLeaf() ) { // we're here! + if ( states.isEmpty() ) + throw new ReviewedStingException("Found a leaf node with an empty state values vector"); + stratifierValuesByKey.set(node.getKey(), Collections.unmodifiableList(new ArrayList(states))); + } else { + for ( Map.Entry> entry : node.getSubnodes().entrySet() ) { + final LinkedList newStates = new LinkedList(states); + newStates.addLast(entry.getKey()); + assignStratifierValuesByKey(entry.getValue(), newStates); + } + } + } + + // ------------------------------------------------------------------------------------- + // + // simple accessors + // + // ------------------------------------------------------------------------------------- + + /** + * How many states are held in this stratification manager? 
+ * @return + */ + @Ensures("result >= 0") + public int size() { + return size; + } + + @Ensures("result != null") + protected StratNode getRoot() { + return root; + } + + @Ensures("result != null") + public List getStratifiers() { + return stratifiers; + } + + // ------------------------------------------------------------------------------------- + // + // mapping from states -> keys + // + // ------------------------------------------------------------------------------------- + + @Requires("states != null") + @Ensures("result >= -1") + public int getKey(final List states) { + return root.find(states, 0); + } + + @Requires("allStates != null") + @Ensures("result != null") + public Set getKeys(final List> allStates) { + final HashSet keys = new HashSet(); + root.find(allStates, 0, keys); + return keys; + } + + public List getStatesForKey(final int key) { + final List states = new ArrayList(stratifiers.size()); + for ( int i = 0; i < stratifiers.size(); i++ ) { + final Object stratValue = stratifierValuesByKey.get(key).get(i); + states.add(stratValue); + } + return states; + } + + public List> getStratsAndStatesForKey(final int key) { + final List> states = new ArrayList>(stratifiers.size()); + for ( int i = 0; i < stratifiers.size(); i++ ) { + final K strat = stratifiers.get(i); + final Object stratValue = stratifierValuesByKey.get(key).get(i); + states.add(new Pair(strat, stratValue)); + } + return states; + } + + public String getStratsAndStatesStringForKey(final int key) { + if ( keyStrings.get(key) == null ) { + StringBuilder b = new StringBuilder(); + for ( int i = 0; i < stratifiers.size(); i++ ) { + final K strat = stratifiers.get(i); + final Object stratValue = stratifierValuesByKey.get(key).get(i); + b.append(strat.toString()).append(":").append(stratValue.toString()); + } + keyStrings.set(key, b.toString()); + } + + return keyStrings.get(key); + } + + // ------------------------------------------------------------------------------------- + // + // 
valuesByKey + // + // ------------------------------------------------------------------------------------- + + @Override + @Ensures("result != null") + public ArrayList values() { + return valuesByKey; + } + + public Collection values(List> states) { + // TODO -- SHOULD BE INLINE TO AVOID CREATING LIST OF KEYS JUST TO ITERATE OVER IT + Collection vals = new LinkedList(); + for ( int key : getKeys(states) ) + vals.add(get(key)); + return vals; + } + + @Requires("key >= 0 && key <= size()") + @Ensures("get(key) == value") + public void set(final int key, final V value) { + valuesByKey.set(key, value); + } + + @Requires("key >= 0 && key <= size()") + public V get(final int key) { + return valuesByKey.get(key); + } + + @Requires("getKey(states) != -1") + public V get(final List states) { + return get(getKey(states)); + } + + @Override + public V get(final Object o) { + return get((List)o); + } + + @Override + public boolean isEmpty() { + return false; + } + + public boolean containsKey(final List o) { + return getKey(o) != -1; + } + + @Override + public boolean containsKey(final Object o) { + return containsKey((List)o); + } + + @Override + public boolean containsValue(final Object o) { + throw new ReviewedStingException("containsValue() not implemented for StratificationManager"); + } + + @Override + public V put(final List objects, final V v) { + throw new ReviewedStingException("put() not implemented for StratificationManager"); + } + + @Override + public V remove(final Object o) { + throw new ReviewedStingException("remove() not implemented for StratificationManager"); + } + + @Override + public void putAll(final Map, ? 
extends V> map) { + throw new ReviewedStingException("clear() not implemented for StratificationManager"); + } + + @Override + public void clear() { + throw new ReviewedStingException("clear() not implemented for StratificationManager"); + } + + @Override + public Set> keySet() { + throw new ReviewedStingException("Not yet implemented"); + } + + @Override + public Set, V>> entrySet() { + throw new ReviewedStingException("Not yet implemented"); + } + + // ------------------------------------------------------------------------------------- + // + // utilities + // + // ------------------------------------------------------------------------------------- + + public static List> combineStates(final List first, final List second) { + final List> combined = new ArrayList>(first.size()); + for ( int i = 0; i < first.size(); i++ ) { + final Object firstI = first.get(i); + final Object secondI = second.get(i); + if ( firstI.equals(secondI) ) + combined.add(Collections.singletonList(firstI)); + else + combined.add(Arrays.asList(firstI, secondI)); + } + return combined; + } + + public interface Combiner { + /** take two values of type V and return a combined value of type V */ + public V combine(final V lhs, final V rhs); + } + + /** + * Remaps the stratifications from one stratification set to another, combining + * the values in V according to the combiner function. + * + * stratifierToReplace defines a set of states S1, while newStratifier defines + * a new set S2. remappedStates is a map from all of S1 into at least some of + * S2. This function creates a new, fully initialized manager where all of the + * data in this new manager is derived from the original data in this object + * combined according to the mapping remappedStates. When multiple + * elements of S1 can map to the same value in S2, these are sequentially + * combined by the function combiner. Suppose for example at states s1, s2, and + * s3 all map to N1. 
Eventually the value associated with state N1 would be + * + * value(N1) = combine(value(s1), combine(value(s2), value(s3)) + * + * in some order for s1, s2, and s3, which is not defined. Note that this function + * only supports combining one stratification at a time, but in principle a loop over + * stratifications and this function could do the multi-dimensional collapse. + * + * @param stratifierToReplace + * @param newStratifier + * @param combiner + * @param remappedStates + * @return + */ + public StratificationManager combineStrats(final K stratifierToReplace, + final K newStratifier, + final Combiner combiner, + final Map remappedStates) { + // make sure the mapping is reasonable + if ( ! newStratifier.getAllStates().containsAll(remappedStates.values()) ) + throw new ReviewedStingException("combineStrats: remapped states contains states not found in newStratifer state set"); + + if ( ! remappedStates.keySet().containsAll(stratifierToReplace.getAllStates()) ) + throw new ReviewedStingException("combineStrats: remapped states missing mapping for some states"); + + // the new strats are the old ones with the single replacement + final List newStrats = new ArrayList(getStratifiers()); + final int stratOffset = newStrats.indexOf(stratifierToReplace); + if ( stratOffset == -1 ) + throw new ReviewedStingException("Could not find strat to replace " + stratifierToReplace + " in existing strats " + newStrats); + newStrats.set(stratOffset, newStratifier); + + // create an empty but fully initialized new manager + final StratificationManager combined = new StratificationManager(newStrats); + + // for each key, get its state, update it according to the map, and update the combined manager + for ( int key = 0; key < size(); key++ ) { + // the new state is just the old one with the replacement + final List newStates = new ArrayList(getStatesForKey(key)); + final Object oldState = newStates.get(stratOffset); + final Object newState = remappedStates.get(oldState); + 
newStates.set(stratOffset, newState); + + // look up the new key given the new state + final int combinedKey = combined.getKey(newStates); + if ( combinedKey == -1 ) throw new ReviewedStingException("Couldn't find key for states: " + Utils.join(",", newStates)); + + // combine the old value with whatever new value is in combined already + final V combinedValue = combiner.combine(combined.get(combinedKey), get(key)); + + // update the value associated with combined key + combined.set(combinedKey, combinedValue); + } + + return combined; + } +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/manager/Stratifier.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/manager/Stratifier.java new file mode 100644 index 000000000..d77ef6eba --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/manager/Stratifier.java @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications.manager; + +import java.util.List; + +/** + * A basic interface for a class to be used with the StratificationManager system + * + * @author Mark DePristo + * @since 3/28/12 + */ +public interface Stratifier { + /** + * @return a list of all objects states that may be provided by this States provider + */ + public List getAllStates(); +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/Analysis.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/Analysis.java index 2b37ce210..7f66aad39 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/Analysis.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/Analysis.java @@ -7,4 +7,5 @@ import java.lang.annotation.RetentionPolicy; public @interface Analysis { String name() default ""; // its description, required String description(); // its description, required + boolean molten() default false; // if true we'll look for a @Molten map } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/AnalysisModuleScanner.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/AnalysisModuleScanner.java index db44e9e28..d4e9afd64 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/AnalysisModuleScanner.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/AnalysisModuleScanner.java @@ -27,6 +27,7 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import java.lang.annotation.Annotation; import java.lang.reflect.Field; +import 
java.util.HashMap; import java.util.LinkedHashMap; import java.util.Map; @@ -40,11 +41,15 @@ import java.util.Map; * the object, a Mashalling object can serialize or deserialize a analysis module. */ public class AnalysisModuleScanner { + final private static Map annotationCache = new HashMap(); // what we extracted from the class private Map datums = new LinkedHashMap(); // the data we've discovered private Analysis analysis; // the analysis annotation - + + private Field moltenField = null; + private Molten moltenAnnotation = null; + // private storage of the class type private final Class cls; @@ -83,12 +88,46 @@ public class AnalysisModuleScanner { private void scanFields() { // get the fields from the class, and extract for ( Class superCls = cls; superCls != null; superCls=superCls.getSuperclass() ) { - for (Field f : superCls.getDeclaredFields()) - for (Annotation annotation : f.getAnnotations()) { + for (Field f : superCls.getDeclaredFields()) { + for (Annotation annotation : getAnnotations(f)) { if (annotation.annotationType().equals(DataPoint.class)) datums.put(f,(DataPoint) annotation); + if ( annotation.annotationType().equals(Molten.class)) { + if ( hasMoltenField() ) + throw new ReviewedStingException("Analysis " + analysis.name() + " has multiple @Molten fields, which is forbidden"); + moltenField = f; + moltenAnnotation = (Molten)annotation; + } } + } } + + if ( hasMoltenField() ) { + if ( datums.size() > 0 ) + throw new ReviewedStingException("Analysis " + analysis.name() + " has an @Molten field as well as @DataPoint fields, which is forbidden"); + } + } + + public Field getMoltenField() { + return moltenField; + } + + public Molten getMoltenAnnotation() { + return moltenAnnotation; + } + + public boolean hasMoltenField() { + return getMoltenField() != null; + } + + private Annotation[] getAnnotations(final Field field) { + final String fieldName = field.toString(); + Annotation[] annotations = annotationCache.get(fieldName); + if ( annotations == 
null ) { + annotations = field.getAnnotations(); + annotationCache.put(fieldName, annotations); + } + return annotations; } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/EvaluationContext.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/EvaluationContext.java new file mode 100755 index 000000000..390682837 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/EvaluationContext.java @@ -0,0 +1,90 @@ +package org.broadinstitute.sting.gatk.walkers.varianteval.util; + +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.varianteval.VariantEvalWalker; +import org.broadinstitute.sting.gatk.walkers.varianteval.evaluators.VariantEvaluator; +import org.broadinstitute.sting.gatk.walkers.varianteval.stratifications.VariantStratifier; +import org.broadinstitute.sting.gatk.walkers.varianteval.stratifications.manager.StratificationManager; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.StingException; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; + +import java.util.*; + +public final class EvaluationContext { + // NOTE: must be hashset to avoid O(log n) cost of iteration in the very frequently called apply function + final VariantEvalWalker walker; + private final ArrayList evaluationInstances; + private final Set> evaluationClasses; + + public EvaluationContext(final VariantEvalWalker walker, final Set> evaluationClasses) { + this(walker, evaluationClasses, true); + } + + private EvaluationContext(final VariantEvalWalker walker, final Set> evaluationClasses, final boolean doInitialize) { + this.walker = walker; + this.evaluationClasses = evaluationClasses; + this.evaluationInstances = 
new ArrayList(evaluationClasses.size()); + + for ( final Class c : evaluationClasses ) { + try { + final VariantEvaluator eval = c.newInstance(); + if ( doInitialize ) eval.initialize(walker); + evaluationInstances.add(eval); + } catch (InstantiationException e) { + throw new ReviewedStingException("Unable to instantiate eval module '" + c.getSimpleName() + "'", e); + } catch (IllegalAccessException e) { + throw new ReviewedStingException("Illegal access error when trying to instantiate eval module '" + c.getSimpleName() + "'", e); + } + } + } + + /** + * Returns a sorted set of VariantEvaluators + * + * @return + */ + public final TreeSet getVariantEvaluators() { + return new TreeSet(evaluationInstances); + } + + public final void apply(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context, VariantContext comp, VariantContext eval) { + for ( final VariantEvaluator evaluation : evaluationInstances ) { + // the other updateN methods don't see a null context + if ( tracker == null ) + continue; + + // now call the single or paired update function + switch ( evaluation.getComparisonOrder() ) { + case 1: + if (eval != null) { + evaluation.update1(eval, tracker, ref, context); + } + break; + case 2: + evaluation.update2(eval, comp, tracker, ref, context); + break; + default: + throw new ReviewedStingException("BUG: Unexpected evaluation order " + evaluation); + } + } + } + + public void combine(final EvaluationContext rhs) { + for ( int i = 0; i < evaluationInstances.size(); i++ ) + evaluationInstances.get(i).combine(rhs.evaluationInstances.get(i)); + } + + public final static EvaluationContextCombiner COMBINER = new EvaluationContext.EvaluationContextCombiner(); + private static class EvaluationContextCombiner implements StratificationManager.Combiner { + @Override + public EvaluationContext combine(EvaluationContext lhs, final EvaluationContext rhs) { + if ( lhs == null ) + lhs = new EvaluationContext(rhs.walker, rhs.evaluationClasses, false); + 
lhs.combine(rhs); + return lhs; + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/Molten.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/Molten.java new file mode 100755 index 000000000..1a14bfffb --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/Molten.java @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.varianteval.util; + +import java.lang.annotation.Retention; +import java.lang.annotation.RetentionPolicy; + +/** + * Molten for @Analysis modules. + * + * If you are flagged as a molten analysis, then there must be one and + * only one annotation in that evaluation module: @Molten which + * must have type Map.
This data set will then + * be represented in the VE output as: + * + * variable value + * key1 value1 + * key2 value2 + * ... + * keyN valueN + * + * in the output table. The names of these two fields can be overridden via annotation values. + */ +@Retention(RetentionPolicy.RUNTIME) +public @interface Molten { + String description() default ""; // the description, optional + + /** + * The name to use for the molten variable field in the output table. + * @return + */ + String variableName() default "variable"; + String variableFormat() default ""; + + /** + * The name to use for the molten value field in the output table. + * @return + */ + String valueName() default "value"; + String valueFormat() default ""; +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/NewEvaluationContext.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/NewEvaluationContext.java deleted file mode 100755 index c34e44516..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/NewEvaluationContext.java +++ /dev/null @@ -1,86 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.varianteval.util; - -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.varianteval.VariantEvalWalker; -import org.broadinstitute.sting.gatk.walkers.varianteval.evaluators.VariantEvaluator; -import org.broadinstitute.sting.gatk.walkers.varianteval.stratifications.VariantStratifier; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.StingException; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; - -import java.util.HashMap; -import java.util.Set; -import java.util.TreeMap; - -public class NewEvaluationContext extends HashMap { - public TreeMap
evaluationInstances; - - public String toString() { - String value = ""; - - for ( VariantStratifier key : this.keySet() ) { - value += "\t" + key.getName() + ":" + this.get(key) + "\n"; - } - - return value; - } - - public void addEvaluationClassList(VariantEvalWalker walker, StateKey stateKey, Set> evaluationClasses) { - evaluationInstances = new TreeMap(); - - for ( Class c : evaluationClasses ) { - try { - VariantEvaluator eval = c.newInstance(); - eval.initialize(walker); - - if (eval.stateIsApplicable(stateKey)) { - evaluationInstances.put(c.getSimpleName(), eval); - } - } catch (InstantiationException e) { - throw new StingException("Unable to instantiate eval module '" + c.getSimpleName() + "'"); - } catch (IllegalAccessException e) { - throw new StingException("Illegal access error when trying to instantiate eval module '" + c.getSimpleName() + "'"); - } - } - } - - public TreeMap getEvaluationClassList() { - return evaluationInstances; - } - - public void apply(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context, VariantContext comp, VariantContext eval) { - for ( VariantEvaluator evaluation : evaluationInstances.values() ) { - // we always call update0 in case the evaluation tracks things like number of bases covered - - // the other updateN methods don't see a null context - if ( tracker == null ) - continue; - - // now call the single or paired update function - switch ( evaluation.getComparisonOrder() ) { - case 1: - if (eval != null) { - evaluation.update1(eval, tracker, ref, context); - } - - break; - case 2: - //if (eval != null) { - evaluation.update2(eval, comp, tracker, ref, context); - //} - - break; - default: - throw new ReviewedStingException("BUG: Unexpected evaluation order " + evaluation); - } - } - } - - public void update0(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - for ( VariantEvaluator evaluation : evaluationInstances.values() ) { - evaluation.update0(tracker, ref, context); 
- } - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/StateKey.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/StateKey.java deleted file mode 100755 index 96bd9a9b7..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/StateKey.java +++ /dev/null @@ -1,27 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.varianteval.util; - -import java.util.Map; -import java.util.TreeMap; - -public class StateKey extends TreeMap { -// public int hashCode() { -// int hashCode = 1; -// -// for (final Map.Entry pair : this.entrySet()) { -// hashCode *= pair.getKey().hashCode() + pair.getValue().hashCode(); -// } -// -// return hashCode; -// } - - public String toString() { - String value = ""; - - for ( final String key : this.keySet() ) { - //value += "\tstate " + key + ":" + this.get(key) + "\n"; - value += String.format("%s:%s;", key, this.get(key)); - } - - return value; - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/TableType.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/TableType.java deleted file mode 100644 index 7ffc3e2c8..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/TableType.java +++ /dev/null @@ -1,17 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.varianteval.util; - - -/** - * - * @author aaron - * - * Class TableType - * - * an interface for turning arbritary objects into tables - */ -public interface TableType { - public Object[] getRowKeys(); - public Object[] getColumnKeys(); - public Object getCell(int x, int y); - public String getName(); -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java index fdeb6919d..8a62bd032 100755 --- 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java @@ -1,3 +1,27 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + package org.broadinstitute.sting.gatk.walkers.varianteval.util; import org.apache.log4j.Logger; @@ -35,8 +59,8 @@ public class VariantEvalUtils { * List all of the available evaluation modules, then exit successfully */ public void listModulesAndExit() { - List> vsClasses = new PluginManager( VariantStratifier.class ).getPlugins(); - List> veClasses = new PluginManager( VariantEvaluator.class ).getPlugins(); + List> vsClasses = new PluginManager(VariantStratifier.class).getPlugins(); + List> veClasses = new PluginManager(VariantEvaluator.class).getPlugins(); logger.info("Available stratification modules:"); logger.info("(Standard modules are starred)"); @@ -58,12 +82,11 @@ public class VariantEvalUtils { /** * Initialize required, standard and user-specified stratification objects * - * @param variantEvalWalker the parent walker - * @param noStandardStrats don't use the standard stratifications - * @param modulesToUse the list of stratification modules to use + * @param noStandardStrats don't use the standard stratifications + * @param modulesToUse the list of stratification modules to use * @return set of stratifications to use */ - public TreeSet initializeStratificationObjects(VariantEvalWalker variantEvalWalker, boolean noStandardStrats, String[] modulesToUse) { + public List initializeStratificationObjects(boolean noStandardStrats, String[] modulesToUse) { TreeSet strats = new TreeSet(); Set stratsToUse = new HashSet(); @@ -115,7 +138,7 @@ public class VariantEvalUtils { } } - return strats; + return new ArrayList(strats); } /** @@ -157,108 +180,6 @@ public class VariantEvalUtils { return evals; } - /** - * Recursively initialize the evaluation contexts - * - * @param stratificationObjects the stratifications to use - * @param evaluationObjects the evaluations to use - * @param stratStack a stack of stratifications to apply - * @param ec evaluation context - * @return a map of all the evaluation contexts - */ - public HashMap 
initializeEvaluationContexts(Set stratificationObjects, Set> evaluationObjects, Stack stratStack, NewEvaluationContext ec) { - HashMap ecs = new HashMap(); - - if (stratStack == null) { - stratStack = new Stack(); - stratStack.addAll(stratificationObjects); - } - - if (!stratStack.isEmpty()) { - Stack newStratStack = new Stack(); - newStratStack.addAll(stratStack); - - VariantStratifier vs = newStratStack.pop(); - - for (String state : vs.getAllStates()) { - NewEvaluationContext nec = new NewEvaluationContext(); - if (ec != null) { - nec.putAll(ec); - } - nec.put(vs, state); - - ecs.putAll(initializeEvaluationContexts(stratificationObjects, evaluationObjects, newStratStack, nec)); - } - } else { - HashMap necs = new HashMap(); - - StateKey stateKey = new StateKey(); - for (VariantStratifier vs : ec.keySet()) { - String state = ec.get(vs); - - stateKey.put(vs.getName(), state); - } - - ec.addEvaluationClassList(variantEvalWalker, stateKey, evaluationObjects); - - necs.put(stateKey, ec); - - return necs; - } - - return ecs; - } - - /** - * Initialize the output report - * - * @param stratificationObjects the stratifications to use - * @param evaluationObjects the evaluations to use - * @return an initialized report object - */ - public GATKReport initializeGATKReport(Set stratificationObjects, Set> evaluationObjects) { - GATKReport report = new GATKReport(); - - for (Class ve : evaluationObjects) { - String tableName = ve.getSimpleName(); - String tableDesc = ve.getAnnotation(Analysis.class).description(); - - report.addTable(tableName, tableDesc); - - GATKReportTable table = report.getTable(tableName); - table.addPrimaryKey("entry", false); - table.addColumn(tableName, tableName); - - for (VariantStratifier vs : stratificationObjects) { - String columnName = vs.getName(); - - table.addColumn(columnName, "unknown"); - } - - try { - VariantEvaluator vei = ve.newInstance(); - vei.initialize(variantEvalWalker); - - AnalysisModuleScanner scanner = new 
AnalysisModuleScanner(vei); - Map datamap = scanner.getData(); - - for (Field field : datamap.keySet()) { - field.setAccessible(true); - - if (!(field.get(vei) instanceof TableType)) { - table.addColumn(field.getName(), 0.0, datamap.get(field).format()); - } - } - } catch (InstantiationException e) { - throw new StingException("InstantiationException: " + e); - } catch (IllegalAccessException e) { - throw new StingException("IllegalAccessException: " + e); - } - } - - return report; - } - /** * Subset a VariantContext to a single sample * @@ -285,7 +206,7 @@ public class VariantEvalUtils { final int newAlleleCount = vcsub.getHetCount() + 2 * vcsub.getHomVarCount(); if (originalAlleleCount == newAlleleCount && newAlleleCount == 1) { - builder.attribute("ISSINGLETON", true); + builder.attribute(VariantEvalWalker.IS_SINGLETON_KEY, true); } VariantContextUtils.calculateChromosomeCounts(builder, true); @@ -297,7 +218,6 @@ public class VariantEvalUtils { * Additional variant contexts per sample are automatically generated and added to the map unless the sample name * matches the ALL_SAMPLE_NAME constant. 
* - * * @param tracker the metadata tracker * @param ref the reference context * @param tracks the list of tracks to process @@ -306,57 +226,56 @@ public class VariantEvalUtils { * @param subsetBySample if false, do not separate the track into per-sample VCs * @param trackPerSample if false, don't stratify per sample (and don't cut up the VariantContext like we would need * to do this) - * * @return the mapping of track to VC list that should be populated */ public HashMap, HashMap>> - bindVariantContexts(RefMetaDataTracker tracker, - ReferenceContext ref, - List> tracks, - boolean byFilter, - boolean subsetBySample, - boolean trackPerSample, - boolean mergeTracks) { - if ( tracker == null ) + bindVariantContexts(RefMetaDataTracker tracker, + ReferenceContext ref, + List> tracks, + boolean byFilter, + boolean subsetBySample, + boolean trackPerSample, + boolean mergeTracks) { + if (tracker == null) return null; HashMap, HashMap>> bindings = new HashMap, HashMap>>(); RodBinding firstTrack = tracks.isEmpty() ? 
null : tracks.get(0); - for ( RodBinding track : tracks ) { + for (RodBinding track : tracks) { HashMap> mapping = new HashMap>(); - for ( VariantContext vc : tracker.getValues(track, ref.getLocus()) ) { + for (VariantContext vc : tracker.getValues(track, ref.getLocus())) { // First, filter the VariantContext to represent only the samples for evaluation VariantContext vcsub = vc; - if ( subsetBySample && vc.hasGenotypes() && vc.hasGenotypes(variantEvalWalker.getSampleNamesForEvaluation()) ) { + if (subsetBySample && vc.hasGenotypes() && vc.hasGenotypes(variantEvalWalker.getSampleNamesForEvaluation())) { vcsub = getSubsetOfVariantContext(vc, variantEvalWalker.getSampleNamesForEvaluation()); } - if ( (byFilter || !vcsub.isFiltered()) ) { + if ((byFilter || !vcsub.isFiltered())) { addMapping(mapping, VariantEvalWalker.getAllSampleName(), vcsub); } // Now, if stratifying, split the subsetted vc per sample and add each as a new context - if ( vc.hasGenotypes() && trackPerSample ) { - for ( String sampleName : variantEvalWalker.getSampleNamesForEvaluation() ) { + if (vc.hasGenotypes() && trackPerSample) { + for (String sampleName : variantEvalWalker.getSampleNamesForEvaluation()) { VariantContext samplevc = getSubsetOfVariantContext(vc, sampleName); - if ( byFilter || !samplevc.isFiltered() ) { + if (byFilter || !samplevc.isFiltered()) { addMapping(mapping, sampleName, samplevc); } } } } - if ( mergeTracks && bindings.containsKey(firstTrack) ) { + if (mergeTracks && bindings.containsKey(firstTrack)) { // go through each binding of sample -> value and add all of the bindings from this entry HashMap> firstMapping = bindings.get(firstTrack); - for ( Map.Entry> elt : mapping.entrySet() ) { + for (Map.Entry> elt : mapping.entrySet()) { Collection firstMappingSet = firstMapping.get(elt.getKey()); - if ( firstMappingSet != null ) { + if (firstMappingSet != null) { firstMappingSet.addAll(elt.getValue()); } else { firstMapping.put(elt.getKey(), elt.getValue()); @@ -371,54 +290,8 
@@ public class VariantEvalUtils { } private void addMapping(HashMap> mappings, String sample, VariantContext vc) { - if ( !mappings.containsKey(sample) ) + if (!mappings.containsKey(sample)) mappings.put(sample, new ArrayList(1)); mappings.get(sample).add(vc); } - - /** - * Recursively initialize the state keys used to look up the right evaluation context based on the state of the - * variant context - * - * @param stateMap the map of allowable states - * @param stateStack a stack of the states - * @param stateKey a state key object - * @param stateKeys all the state keys - * @return a list of state keys - */ - public ArrayList initializeStateKeys(HashMap> stateMap, Stack>> stateStack, StateKey stateKey, ArrayList stateKeys) { - if (stateStack == null) { - stateStack = new Stack>>(); - - for (VariantStratifier vs : stateMap.keySet()) { - HashMap> oneSetOfStates = new HashMap>(); - oneSetOfStates.put(vs, stateMap.get(vs)); - - stateStack.add(oneSetOfStates); - } - } - - if (!stateStack.isEmpty()) { - Stack>> newStateStack = new Stack>>(); - newStateStack.addAll(stateStack); - - HashMap> oneSetOfStates = newStateStack.pop(); - VariantStratifier vs = oneSetOfStates.keySet().iterator().next(); - - for (String state : oneSetOfStates.get(vs)) { - StateKey newStateKey = new StateKey(); - if (stateKey != null) { - newStateKey.putAll(stateKey); - } - - newStateKey.put(vs.getName(), state); - - initializeStateKeys(stateMap, newStateStack, newStateKey, stateKeys); - } - } else { - stateKeys.add(stateKey); - } - - return stateKeys; - } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java index 6b36f4e1b..5b1d69f14 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java +++ 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java @@ -35,16 +35,14 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.PartitionBy; import org.broadinstitute.sting.gatk.walkers.PartitionType; import org.broadinstitute.sting.gatk.walkers.RodWalker; +import org.broadinstitute.sting.gatk.walkers.TreeReducible; import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.codecs.vcf.*; -import org.broadinstitute.sting.utils.collections.NestedHashMap; import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.text.XReadLines; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder; import java.io.File; -import java.io.FileNotFoundException; import java.util.*; /** @@ -86,8 +84,8 @@ import java.util.*; * */ -@PartitionBy(PartitionType.NONE) -public class ApplyRecalibration extends RodWalker { +@PartitionBy(PartitionType.LOCUS) +public class ApplyRecalibration extends RodWalker implements TreeReducible { ///////////////////////////// // Inputs @@ -98,9 +96,9 @@ public class ApplyRecalibration extends RodWalker { @Input(fullName="input", shortName = "input", doc="The raw input variants to be recalibrated", required=true) public List> input; @Input(fullName="recal_file", shortName="recalFile", doc="The input recal file used by ApplyRecalibration", required=true) - private File RECAL_FILE; + protected RodBinding recal; @Input(fullName="tranches_file", shortName="tranchesFile", doc="The input tranches file describing where to cut the data", required=true) - private File TRANCHES_FILE; + protected File TRANCHES_FILE; ///////////////////////////// // Outputs @@ -112,7 +110,7 @@ public class ApplyRecalibration extends RodWalker { // Command Line Arguments ///////////////////////////// 
@Argument(fullName="ts_filter_level", shortName="ts_filter_level", doc="The truth sensitivity level at which to start filtering", required=false) - private double TS_FILTER_LEVEL = 99.0; + protected double TS_FILTER_LEVEL = 99.0; @Argument(fullName="ignore_filter", shortName="ignoreFilter", doc="If specified the variant recalibrator will use variants even if the specified filter name is marked in the input VCF file", required=false) private String[] IGNORE_INPUT_FILTERS = null; @Argument(fullName = "mode", shortName = "mode", doc = "Recalibration mode to employ: 1.) SNP for recalibrating only SNPs (emitting indels untouched in the output VCF); 2.) INDEL for indels; and 3.) BOTH for recalibrating both SNPs and indels simultaneously.", required = false) @@ -123,8 +121,6 @@ public class ApplyRecalibration extends RodWalker { ///////////////////////////// final private List tranches = new ArrayList(); final private Set inputNames = new HashSet(); - final private NestedHashMap lodMap = new NestedHashMap(); - final private NestedHashMap annotationMap = new NestedHashMap(); final private Set ignoreInputFilterSet = new TreeSet(); //--------------------------------------------------------------------------------------------------------------- @@ -174,20 +170,6 @@ public class ApplyRecalibration extends RodWalker { final VCFHeader vcfHeader = new VCFHeader(hInfo, samples); vcfWriter.writeHeader(vcfHeader); - - try { - logger.info("Reading in recalibration table..."); - for ( final String line : new XReadLines( RECAL_FILE ) ) { - final String[] vals = line.split(","); - lodMap.put( Double.parseDouble(vals[3]), vals[0], Integer.parseInt(vals[1]), Integer.parseInt(vals[2]) ); // value comes before the keys - annotationMap.put( vals[4], vals[0], Integer.parseInt(vals[1]), Integer.parseInt(vals[2]) ); // value comes before the keys - } - } catch ( FileNotFoundException e ) { - throw new UserException.CouldNotReadInputFile(RECAL_FILE, e); - } catch ( Exception e ) { - throw new 
UserException.MalformedFile(RECAL_FILE, "Could not parse LOD and annotation information in input recal file. File is somehow malformed."); - } - } //--------------------------------------------------------------------------------------------------------------- @@ -202,52 +184,75 @@ public class ApplyRecalibration extends RodWalker { return 1; } - for( VariantContext vc : tracker.getValues(input, context.getLocation()) ) { - if( vc != null ) { - if( VariantRecalibrator.checkRecalibrationMode( vc, MODE ) && (vc.isNotFiltered() || ignoreInputFilterSet.containsAll(vc.getFilters())) ) { - VariantContextBuilder builder = new VariantContextBuilder(vc); - String filterString = null; + final List VCs = tracker.getValues(input, context.getLocation()); + final List recals = tracker.getValues(recal, context.getLocation()); - final Double lod = (Double) lodMap.get( vc.getChr(), vc.getStart(), vc.getEnd() ); - final String worstAnnotation = (String) annotationMap.get( vc.getChr(), vc.getStart(), vc.getEnd() ); - if( lod == null ) { - throw new UserException("Encountered input variant which isn't found in the input recal file. Please make sure VariantRecalibrator and ApplyRecalibration were run on the same set of input variants. 
First seen at: " + vc ); - } + for( final VariantContext vc : VCs ) { - // Annotate the new record with its VQSLOD and the worst performing annotation - builder.attribute(VariantRecalibrator.VQS_LOD_KEY, String.format("%.4f", lod)); - builder.attribute(VariantRecalibrator.CULPRIT_KEY, worstAnnotation); + if( VariantRecalibrator.checkRecalibrationMode( vc, MODE ) && (vc.isNotFiltered() || ignoreInputFilterSet.containsAll(vc.getFilters())) ) { - for( int i = tranches.size() - 1; i >= 0; i-- ) { - final Tranche tranche = tranches.get(i); - if( lod >= tranche.minVQSLod ) { - if( i == tranches.size() - 1 ) { - filterString = VCFConstants.PASSES_FILTERS_v4; - } else { - filterString = tranche.name; - } - break; - } - } - - if( filterString == null ) { - filterString = tranches.get(0).name+"+"; - } - - if( !filterString.equals(VCFConstants.PASSES_FILTERS_v4) ) { - builder.filters(filterString); - } - - vcfWriter.add( builder.make() ); - } else { // valid VC but not compatible with this mode, so just emit the variant untouched - vcfWriter.add( vc ); + final VariantContext recalDatum = getMatchingRecalVC(vc, recals); + if( recalDatum == null ) { + throw new UserException("Encountered input variant which isn't found in the input recal file. Please make sure VariantRecalibrator and ApplyRecalibration were run on the same set of input variants. First seen at: " + vc ); } + + final String lodString = recalDatum.getAttributeAsString(VariantRecalibrator.VQS_LOD_KEY, null); + if( lodString == null ) { + throw new UserException("Encountered a malformed record in the input recal file. There is no lod for the record at: " + vc ); + } + final double lod; + try { + lod = Double.valueOf(lodString); + } catch (NumberFormatException e) { + throw new UserException("Encountered a malformed record in the input recal file. 
The lod is unreadable for the record at: " + vc ); + } + + VariantContextBuilder builder = new VariantContextBuilder(vc); + String filterString = null; + + // Annotate the new record with its VQSLOD and the worst performing annotation + builder.attribute(VariantRecalibrator.VQS_LOD_KEY, lodString); // use the String representation so that we don't lose precision on output + builder.attribute(VariantRecalibrator.CULPRIT_KEY, recalDatum.getAttribute(VariantRecalibrator.CULPRIT_KEY)); + + for( int i = tranches.size() - 1; i >= 0; i-- ) { + final Tranche tranche = tranches.get(i); + if( lod >= tranche.minVQSLod ) { + if( i == tranches.size() - 1 ) { + filterString = VCFConstants.PASSES_FILTERS_v4; + } else { + filterString = tranche.name; + } + break; + } + } + + if( filterString == null ) { + filterString = tranches.get(0).name+"+"; + } + + if( !filterString.equals(VCFConstants.PASSES_FILTERS_v4) ) { + builder.filters(filterString); + } + + vcfWriter.add( builder.make() ); + } else { // valid VC but not compatible with this mode, so just emit the variant untouched + vcfWriter.add( vc ); } } return 1; // This value isn't used for anything } + private static VariantContext getMatchingRecalVC(final VariantContext target, final List recalVCs) { + for( final VariantContext recalVC : recalVCs ) { + if ( target.getEnd() == recalVC.getEnd() ) { + return recalVC; + } + } + + return null; + } + //--------------------------------------------------------------------------------------------------------------- // // reduce @@ -262,6 +267,10 @@ public class ApplyRecalibration extends RodWalker { return 1; // This value isn't used for anything } + public Integer treeReduce( final Integer lhs, final Integer rhs ) { + return 1; // This value isn't used for anything + } + public void onTraversalDone( final Integer reduceSum ) { } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java index a957bfd85..3778cffb8 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java @@ -30,14 +30,16 @@ import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; +import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; +import org.broadinstitute.sting.utils.codecs.vcf.VCFWriter; import org.broadinstitute.sting.utils.collections.ExpandingArrayList; import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder; -import java.io.PrintStream; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; +import java.util.*; /** * Created by IntelliJ IDEA. 
@@ -239,14 +241,6 @@ public class VariantDataManager { value += -0.25 + 0.5 * GenomeAnalysisEngine.getRandomGenerator().nextDouble(); } - if (vc.isIndel() && annotationKey.equalsIgnoreCase("QD")) { - // normalize QD by event length for indel case - int eventLength = Math.abs(vc.getAlternateAllele(0).getBaseString().length() - vc.getReference().getBaseString().length()); // ignore multi-allelic complication here for now - if (eventLength > 0) { // sanity check - value /= (double)eventLength; - } - } - if( jitter && annotationKey.equalsIgnoreCase("HaplotypeScore") && MathUtils.compareDoubles(value, 0.0, 0.0001) == 0 ) { value = -0.2 + 0.4*GenomeAnalysisEngine.getRandomGenerator().nextDouble(); } if( jitter && annotationKey.equalsIgnoreCase("FS") && MathUtils.compareDoubles(value, 0.0, 0.001) == 0 ) { value = -0.2 + 0.4*GenomeAnalysisEngine.getRandomGenerator().nextDouble(); } } catch( Exception e ) { @@ -285,11 +279,28 @@ public class VariantDataManager { (TRUST_ALL_POLYMORPHIC || !trainVC.hasGenotypes() || trainVC.isPolymorphicInSamples()); } - public void writeOutRecalibrationTable( final PrintStream RECAL_FILE ) { + public void writeOutRecalibrationTable( final VCFWriter recalWriter ) { + // we need to sort in coordinate order in order to produce a valid VCF + Collections.sort( data, new Comparator() { + public int compare(VariantDatum vd1, VariantDatum vd2) { + return vd1.loc.compareTo(vd2.loc); + }} ); + + // create dummy alleles to be used + final List alleles = new ArrayList(2); + alleles.add(Allele.create("N", true)); + alleles.add(Allele.create("", false)); + + // to be used for the important INFO tags + final HashMap attributes = new HashMap(3); + for( final VariantDatum datum : data ) { - RECAL_FILE.println(String.format("%s,%d,%d,%.4f,%s", - datum.contig, datum.start, datum.stop, datum.lod, - (datum.worstAnnotation != -1 ? 
annotationKeys.get(datum.worstAnnotation) : "NULL"))); + attributes.put(VCFConstants.END_KEY, datum.loc.getStop()); + attributes.put(VariantRecalibrator.VQS_LOD_KEY, String.format("%.4f", datum.lod)); + attributes.put(VariantRecalibrator.CULPRIT_KEY, (datum.worstAnnotation != -1 ? annotationKeys.get(datum.worstAnnotation) : "NULL")); + + VariantContextBuilder builder = new VariantContextBuilder("VQSR", datum.loc.getContig(), datum.loc.getStart(), datum.loc.getStart(), alleles).attributes(attributes); + recalWriter.add(builder.make()); } } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDatum.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDatum.java index eb9e98fcb..32350f0fa 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDatum.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDatum.java @@ -25,6 +25,8 @@ package org.broadinstitute.sting.gatk.walkers.variantrecalibration; +import org.broadinstitute.sting.utils.GenomeLoc; + /** * Created by IntelliJ IDEA. 
* User: rpoplin @@ -46,9 +48,7 @@ public class VariantDatum implements Comparable { public double originalQual; public double prior; public int consensusCount; - public String contig; - public int start; - public int stop; + public GenomeLoc loc; public int worstAnnotation; public MultivariateGaussian assignment; // used in K-means implementation diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java index 3cdcf4982..f86908dbe 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java @@ -37,6 +37,8 @@ import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.R.RScriptExecutor; import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.codecs.vcf.StandardVCFWriter; +import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; import org.broadinstitute.sting.utils.collections.ExpandingArrayList; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.io.Resource; @@ -136,9 +138,11 @@ public class VariantRecalibrator extends RodWalkeremptySet() ); + recalWriter = new StandardVCFWriter(recalFile, getMasterSequenceDictionary(), false); + recalWriter.writeHeader(vcfHeader); } //--------------------------------------------------------------------------------------------------------------- @@ -246,9 +254,7 @@ public class VariantRecalibrator extends RodWalker { @Argument(fullName="minimumN", shortName="minN", doc="Combine variants and output site only if the variant is present in at least N input files.", required=false) public int minimumN = 1; + /** + * This option allows the suppression of the command line 
in the VCF header. This is most often useful when combining variants for dozens or hundreds of smaller VCFs. + */ + @Argument(fullName="suppressCommandLineHeader", shortName="suppressCommandLineHeader", doc="If true, do not output the header containing the command line used", required=false) + public boolean SUPPRESS_COMMAND_LINE_HEADER = false; + @Hidden @Argument(fullName="mergeInfoWithMaxAC", shortName="mergeInfoWithMaxAC", doc="If true, when VCF records overlap the info field is taken from the one with the max AC instead of only taking the fields which are identical across the overlapping records.", required=false) public boolean MERGE_INFO_WITH_MAX_AC = false; @@ -183,7 +189,9 @@ public class CombineVariants extends RodWalker { Set headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), logger); if ( SET_KEY != null ) headerLines.add(new VCFInfoHeaderLine(SET_KEY, 1, VCFHeaderLineType.String, "Source VCF for the merged record in CombineVariants")); - vcfWriter.writeHeader(new VCFHeader(headerLines, sitesOnlyVCF ? Collections.emptySet() : samples)); + VCFHeader vcfHeader = new VCFHeader(headerLines, sitesOnlyVCF ? 
Collections.emptySet() : samples); + vcfHeader.setWriteCommandLine(!SUPPRESS_COMMAND_LINE_HEADER); + vcfWriter.writeHeader(vcfHeader); if ( vcfWriter instanceof VCFWriterStub) { sitesOnlyVCF = ((VCFWriterStub)vcfWriter).doNotWriteGenotypes(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectHeaders.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectHeaders.java new file mode 100755 index 000000000..714fb938e --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectHeaders.java @@ -0,0 +1,250 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.gatk.walkers.variantutils; + +import org.apache.commons.io.FilenameUtils; +import org.broad.tribble.Feature; +import org.broadinstitute.sting.commandline.*; +import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.RodWalker; +import org.broadinstitute.sting.gatk.walkers.TreeReducible; +import org.broadinstitute.sting.utils.SampleUtils; +import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; +import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine; +import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils; +import org.broadinstitute.sting.utils.codecs.vcf.VCFWriter; +import org.broadinstitute.sting.utils.text.ListFileUtils; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; + +import java.io.File; +import java.util.*; + +/** + * Selects headers from a VCF source. + *

+ *

+ * Often, a VCF containing many headers will need to be subset in order to comply with certain formatting guidelines. + * SelectHeaders can be used for this purpose. Given a single VCF file, one or more headers can be extracted from the + * file (based on a complete header name or a pattern match). + *

+ *

Input

+ *

+ * A single VCF file. + *

+ *

+ *

Output

+ *

+ * A header selected VCF. + *

+ *

+ *

Examples

+ *
+ * Select only the FILTER, FORMAT, and INFO headers:
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T SelectHeaders \
+ *   --variant input.vcf \
+ *   -o output.vcf \
+ *   -hn FILTER \
+ *   -hn FORMAT \
+ *   -hn INFO
+ *
+ * Select only the FILTER, FORMAT, and INFO headers and add in the reference file name and interval names:
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T SelectHeaders \
+ *   --variant input.vcf \
+ *   -o output.vcf \
+ *   -hn FILTER \
+ *   -hn FORMAT \
+ *   -hn INFO \
+ *   -irn \
+ *   -iln
+ *
+ * Select only the FILTER, FORMAT, and INFO headers, plus any headers with SnpEff:
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T SelectHeaders \
+ *   --variant input.vcf \
+ *   -o output.vcf \
+ *   -hn FILTER \
+ *   -hn FORMAT \
+ *   -hn INFO \
+ *   -he '.*SnpEff.*'
+ * 
+ */ +@SuppressWarnings("unused") +public class SelectHeaders extends RodWalker implements TreeReducible { + @ArgumentCollection + protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); + + @Output(doc = "File to which variants should be written", required = true) + protected VCFWriter vcfWriter; + + @Argument(fullName = "header_name", shortName = "hn", doc = "Include header. Can be specified multiple times", required = false) + public Set headerNames; + + @Argument(fullName = "header_expression", shortName = "he", doc = "Regular expression to select many headers from the tracks provided. Can be specified multiple times", required = false) + public Set headerExpressions; + + /** + * Note that header exclusion takes precedence over inclusion, so that if a header is in both lists it will be excluded. + */ + @Argument(fullName = "exclude_header_name", shortName = "xl_hn", doc = "Exclude header. Can be specified multiple times", required = false) + public Set XLheaderNames; + + /** + * Note that reference inclusion takes precedence over other header matching. If set other reference lines may be excluded but the file name will still be added. + */ + @Argument(fullName = "include_reference_name", shortName = "irn", doc = "If set the reference file name minus the file extension will be added to the headers", required = false) + public boolean includeReference; + + /** + * Note that interval name inclusion takes precedence over other header matching. If set other interval lines may be excluded but the intervals will still be added. + */ + @Argument(fullName = "include_interval_names", shortName = "iln", doc = "If set the interval file name minus the file extension, or the command line intervals, will be added to the headers", required = false) + public boolean includeIntervals; + + /** + * Note that engine header inclusion takes precedence over other header matching. 
If set other engine lines may be excluded but the engine headers will still be added. + */ + @Hidden // TODO: Determine if others find this valuable and either remove @Hidden or remove -ieh. + @Argument(fullName = "include_engine_headers", shortName = "ieh", doc = "If set the headers normally output by the engine will be added to the headers", required = false) + public boolean includeEngineHeaders; + + private static final ListFileUtils.StringConverter headerKey = new ListFileUtils.StringConverter() { + @Override + public String convert(VCFHeaderLine value) { + return value.getKey(); + } + }; + + /** + * Set up the VCF writer, the header expressions and regexps + */ + @Override + public void initialize() { + // Get list of samples to include in the output + List rodNames = Arrays.asList(variantCollection.variants.getName()); + + Map vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), rodNames); + Set headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), logger); + + headerLines.add(new VCFHeaderLine(VCFHeader.SOURCE_KEY, "SelectHeaders")); + + // Select only the headers requested by name or expression. + headerLines = new LinkedHashSet(getSelectedHeaders(headerLines)); + + // Optionally add in the reference. + if (includeReference && getToolkit().getArguments().referenceFile != null) + headerLines.add(new VCFHeaderLine(VCFHeader.REFERENCE_KEY, FilenameUtils.getBaseName(getToolkit().getArguments().referenceFile.getName()))); + + // Optionally add in the intervals. 
+ if (includeIntervals && getToolkit().getArguments().intervals != null) { + for (IntervalBinding intervalBinding : getToolkit().getArguments().intervals) { + String source = intervalBinding.getSource(); + if (source == null) + continue; + File file = new File(source); + if (file.exists()) { + headerLines.add(new VCFHeaderLine(VCFHeader.INTERVALS_KEY, FilenameUtils.getBaseName(file.getName()))); + } else { + headerLines.add(new VCFHeaderLine(VCFHeader.INTERVALS_KEY, source)); + } + } + } + + TreeSet vcfSamples = new TreeSet(SampleUtils.getSampleList(vcfRods, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE)); + VCFHeader vcfHeader = new VCFHeader(headerLines, vcfSamples); + vcfHeader.setWriteEngineHeaders(includeEngineHeaders); + vcfWriter.writeHeader(vcfHeader); + } + + private Set getSelectedHeaders(Set headerLines) { + Set selectedHeaders = new TreeSet(); + if (headerNames == null && headerExpressions == null) { + // Include everything if nothing was explicitly included. + selectedHeaders.addAll(headerLines); + } else { + // Only include the selected headers. + if (headerNames != null) + selectedHeaders.addAll(ListFileUtils.includeMatching(headerLines, headerKey, headerNames, true)); + if (headerExpressions != null) + selectedHeaders.addAll(ListFileUtils.includeMatching(headerLines, headerKey, headerExpressions, false)); + } + + // Remove any excluded headers. 
+ if (XLheaderNames != null) + selectedHeaders = ListFileUtils.excludeMatching(selectedHeaders, headerKey, XLheaderNames, true); + return selectedHeaders; + } + + /** + * Pass through the VC record + * + * @param tracker the ROD tracker + * @param ref reference information + * @param context alignment info + * @return number of records processed + */ + @Override + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + int count = 0; + if (tracker != null) { + Collection vcs = tracker.getValues(variantCollection.variants, context.getLocation()); + if (vcs != null) { + for (VariantContext vc : vcs) { + vcfWriter.add(vc); + count++; + } + } + } + return count; + } + + @Override + public Integer reduceInit() { + return 0; + } + + @Override + public Integer reduce(Integer value, Integer sum) { + return value + sum; + } + + @Override + public Integer treeReduce(Integer lhs, Integer rhs) { + return lhs + rhs; + } + + @Override + public void onTraversalDone(Integer result) { + logger.info(result + " records processed."); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java index 204851e1f..42a40cde5 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java @@ -189,7 +189,7 @@ public class SelectVariants extends RodWalker implements TreeR * or the sample is called reference in this track. 
*/ @Input(fullName="discordance", shortName = "disc", doc="Output variants that were not called in this comparison track", required=false) - private RodBinding discordanceTrack; + protected RodBinding discordanceTrack; /** * A site is considered concordant if (1) we are not looking for specific samples and there is a variant called @@ -197,7 +197,7 @@ public class SelectVariants extends RodWalker implements TreeR * concordance track and they have the sample genotype call. */ @Input(fullName="concordance", shortName = "conc", doc="Output variants that were also called in this comparison track", required=false) - private RodBinding concordanceTrack; + protected RodBinding concordanceTrack; @Output(doc="File to which variants should be written",required=true) protected VCFWriter vcfWriter = null; @@ -230,10 +230,10 @@ public class SelectVariants extends RodWalker implements TreeR public ArrayList SELECT_EXPRESSIONS = new ArrayList(); @Argument(fullName="excludeNonVariants", shortName="env", doc="Don't include loci found to be non-variant after the subsetting procedure", required=false) - private boolean EXCLUDE_NON_VARIANTS = false; + protected boolean EXCLUDE_NON_VARIANTS = false; @Argument(fullName="excludeFiltered", shortName="ef", doc="Don't include filtered loci in the analysis", required=false) - private boolean EXCLUDE_FILTERED = false; + protected boolean EXCLUDE_FILTERED = false; /** @@ -257,23 +257,23 @@ public class SelectVariants extends RodWalker implements TreeR private Boolean MENDELIAN_VIOLATIONS = false; @Argument(fullName="mendelianViolationQualThreshold", shortName="mvq", doc="Minimum genotype QUAL score for each trio member required to accept a site as a violation", required=false) - private double MENDELIAN_VIOLATION_QUAL_THRESHOLD = 0; + protected double MENDELIAN_VIOLATION_QUAL_THRESHOLD = 0; /** * Variants are kept in memory to guarantee that exactly n variants will be chosen randomly, so make sure you supply the program with enough memory * 
given your input set. This option will NOT work well for large callsets; use --select_random_fraction for sets with a large numbers of variants. */ @Argument(fullName="select_random_number", shortName="number", doc="Selects a number of variants at random from the variant track", required=false) - private int numRandom = 0; + protected int numRandom = 0; /** * This routine is based on probability, so the final result is not guaranteed to carry the exact fraction. Can be used for large fractions. */ @Argument(fullName="select_random_fraction", shortName="fraction", doc="Selects a fraction (a number between 0 and 1) of the total variants at random from the variant track", required=false) - private double fractionRandom = 0; + protected double fractionRandom = 0; @Argument(fullName="remove_fraction_genotypes", shortName="fractionGenotypes", doc="Selects a fraction (a number between 0 and 1) of the total genotypes at random from the variant track and sets them to nocall", required=false) - private double fractionGenotypes = 0; + protected double fractionGenotypes = 0; /** * This argument select particular kinds of variants out of a list. If left empty, there is no type selection and all variant types are considered for other selection criteria. 
@@ -508,7 +508,7 @@ public class SelectVariants extends RodWalker implements TreeR if (!selectedTypes.contains(vc.getType())) continue; - VariantContext sub = subsetRecord(vc, samples); + VariantContext sub = subsetRecord(vc, samples, EXCLUDE_NON_VARIANTS); if ( (sub.isPolymorphicInSamples() || !EXCLUDE_NON_VARIANTS) && (!sub.isFiltered() || !EXCLUDE_FILTERED) ) { boolean failedJexlMatch = false; for ( VariantContextUtils.JexlVCMatchExp jexl : jexls ) { @@ -645,11 +645,15 @@ public class SelectVariants extends RodWalker implements TreeR * @param samples the samples to extract * @return the subsetted VariantContext */ - private VariantContext subsetRecord(VariantContext vc, Set samples) { + private VariantContext subsetRecord(final VariantContext vc, final Set samples, final boolean excludeNonVariants) { if ( samples == null || samples.isEmpty() ) return vc; - final VariantContext sub = vc.subContextFromSamples(samples, vc.getAlleles()); + final VariantContext sub; + if ( excludeNonVariants ) + sub = vc.subContextFromSamples(samples); // strip out the alternate alleles that aren't being used + else + sub = vc.subContextFromSamples(samples, vc.getAlleles()); VariantContextBuilder builder = new VariantContextBuilder(sub); GenotypesContext newGC = sub.getGenotypes(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToPed.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPed.java similarity index 51% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToPed.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPed.java index d8b01e91d..aaf3bb5cd 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToPed.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPed.java @@ -7,6 +7,7 @@ import 
org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.RodWalker; +import org.broadinstitute.sting.utils.R.RScriptExecutorException; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils; @@ -15,26 +16,26 @@ import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.text.XReadLines; import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder; +import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; -import java.io.File; -import java.io.FileNotFoundException; -import java.io.IOException; -import java.io.PrintStream; +import java.io.*; import java.util.*; /** * Yet another VCF to Ped converter. The world actually does need one that will * work efficiently on large VCFs (or at least give a progress bar). This - * produces a binary ped file in SNP-major mode. + * produces a binary ped file in individual major mode. */ -public class VariantsToPed extends RodWalker { +public class VariantsToBinaryPed extends RodWalker { @ArgumentCollection protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); @ArgumentCollection protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection(); - @Input(shortName="m",fullName = "metaData",required=true,doc="Sample metadata file. You may specify a .fam file (in which case it will be copied to the file you provide as fam output)") + @Input(shortName="m",fullName = "metaData",required=true,doc="Sample metadata file. 
You may specify a .fam file " + + "(in which case it will be copied to the file you provide as fam output).") File metaDataFile; @Output(shortName="bed",fullName = "bed",required=true,doc="output ped file") @@ -49,6 +50,9 @@ public class VariantsToPed extends RodWalker { @Argument(shortName="mgq",fullName="minGenotypeQuality",required=true,doc="If genotype quality is lower than this value, output NO_CALL") int minGenotypeQuality = 0; + @Argument(fullName="majorAlleleFirst",required=false,doc="Sets the major allele to be 'reference' for the bim file, rather than the ref allele") + boolean majorAlleleFirst = false; + private ValidateVariants vv = new ValidateVariants(); private static double APPROX_CM_PER_BP = 1000000.0/750000.0; @@ -58,31 +62,48 @@ public class VariantsToPed extends RodWalker { private static final byte HET = 0x2; private static final byte NO_CALL = 0x1; - // note that HET and NO_CALL are flippd from the documentation: that's because + private static final int BUFFER_SIZE = 1000; //4k genotypes per sample = Nmb for N*1000 samples + + // note that HET and NO_CALL are flipped from the documentation: that's because // plink actually reads these in backwards; and we want to use a shift operator // to put these in the appropriate location + private Map printMap = new HashMap(); + private Map tempFiles = new HashMap(); + private Map genotypeBuffer = new HashMap(); + private int genotypeCount = 0; + private int byteCount = 0; + private List famOrder = new ArrayList(); + public void initialize() { vv.variantCollection = variantCollection; vv.dbsnp = dbsnp; vv.DO_NOT_VALIDATE_FILTERED = true; vv.type = ValidateVariants.ValidationType.REF; + // create temporary output streams and buffers + // write magic bits into the ped file try { - outBed.write(new byte[] { (byte) 0x6c, (byte) 0x1b, 0x1 }); + outBed.write(new byte[] { (byte) 0x6c, (byte) 0x1b, 0x0}); + // ultimately, the bed will be in individual-major mode } catch (IOException e) { throw new 
ReviewedStingException("error writing to output file."); } // write to the fam file, the first six columns of the standard ped file // first, load data from the input meta data file Map> metaValues = new HashMap>(); + Set samplesToUse = new HashSet(); + logger.debug("Reading in metadata..."); try { if ( metaDataFile.getAbsolutePath().endsWith(".fam") ) { for ( String line : new XReadLines(metaDataFile) ) { + String[] famSplit = line.split("\\t"); + String sid = famSplit[1]; outFam.printf("%s%n",line); } } else { for ( String line : new XReadLines(metaDataFile) ) { + logger.debug(line); String[] split = line.split("\\t"); String sampleID = split[0]; String keyVals = split[1]; @@ -119,6 +140,15 @@ public class VariantsToPed extends RodWalker { String sex = mVals.containsKey("sex") ? mVals.get("sex") : "3"; String pheno = mVals.get("phenotype"); outFam.printf("%s\t%s\t%s\t%s\t%s\t%s%n",fid,sample,pid,mid,sex,pheno); + try { + File temp = File.createTempFile(sample, ".tmp"); + printMap.put(sample,new PrintStream(temp)); + tempFiles.put(sample,temp); + } catch (IOException e) { + throw new ReviewedStingException("Error creating temporary file",e); + } + genotypeBuffer.put(sample,new byte[BUFFER_SIZE]); + famOrder.add(sample); } } } @@ -138,32 +168,57 @@ public class VariantsToPed extends RodWalker { } VariantContext vc = tracker.getFirstValue(variantCollection.variants); + String refOut; + String altOut; + boolean altMajor; + if ( majorAlleleFirst ) { + // want to use the major allele as ref + HashMap ats = new HashMap(vc.getAttributes()); + if ( ! 
vc.hasAttribute("AF") ) { + VariantContextUtils.calculateChromosomeCounts(vc,ats,true); + } + if ( getAF(ats.get("AF")) > 0.5 ) { + refOut = vc.getAlternateAllele(0).getBaseString(); + altOut = vc.getReference().getBaseString(); + altMajor = true; + } else { + refOut = vc.getReference().getBaseString(); + altOut = vc.getAlternateAllele(0).getBaseString(); + altMajor = false; + } + } else { + refOut = vc.getReference().getBaseString(); + altOut = vc.getAlternateAllele(0).getBaseString(); + altMajor = false; + } // write an entry into the map file outBim.printf("%s\t%s\t%.2f\t%d\t%s\t%s%n",vc.getChr(),getID(vc),APPROX_CM_PER_BP*vc.getStart(),vc.getStart(), - vc.getReference().getBaseString(),vc.getAlternateAllele(0).getBaseString()); - // write an entry into the bed file - int buf = 0; - int idx = 0; - byte out = 0x0; - byte[] toWrite = new byte[1+(vc.getNSamples()/4)]; - for (Genotype g : vc.getGenotypes() ) { - out |= getEncoding(g,buf); - if ( buf == 3 ) { - toWrite[idx] = out; - buf = 0; - out = 0x0; - idx++; - } else { - buf++; + refOut,altOut); + // store genotypes per sample into the buffer + for ( Genotype g : vc.getGenotypes() ) { + String sample = g.getSampleName(); + byte[] samBuf = genotypeBuffer.get(sample); + byte enc = getEncoding(g,genotypeCount,altMajor); + samBuf[byteCount] |= enc; + } + genotypeCount++; + if ( genotypeCount % 4 == 0 ) { + byteCount++; + if ( byteCount >= BUFFER_SIZE ) { + // dump the buffer to the print streams + for ( String sample : printMap.keySet() ) { + OutputStream samOut = printMap.get(sample); + // print the buffer for this sample + try { + samOut.write(genotypeBuffer.get(sample)); + } catch ( IOException e ) { + throw new ReviewedStingException("Error writing to temporary bed file.",e); + } + // reset the buffer for this sample + genotypeBuffer.put(sample,new byte[BUFFER_SIZE]); + } } - } - if ( out != 0x0 ) { - toWrite[idx]=out; - } - try { - outBed.write(toWrite); - } catch (IOException e) { - throw new 
ReviewedStingException("Error writing to output file"); + genotypeCount = 0; } return 1; @@ -177,7 +232,61 @@ public class VariantsToPed extends RodWalker { return 0; } - private byte getEncoding(Genotype g, int offset) { + public void onTraversalDone(Integer numSites) { + logger.info(String.format("%d sites processed!",numSites)); + // push out the remaining genotypes and close stream + for ( String sample : printMap.keySet() ) { + try { + int lim = byteCount + (genotypeCount > 0 ? 1 : 0); + printMap.get(sample).write(genotypeBuffer.get(sample),0,lim); + } catch (IOException e) { + throw new ReviewedStingException("Error closing temporary file.",e); + } + + try { + printMap.get(sample).close(); + } catch (IOException e) { + throw new ReviewedStingException("Error closing temporary file.",e); + } + } + for ( String sample : famOrder ) { + logger.info("Merging genotypes for "+sample); + FileInputStream inStream; + try { + inStream = new FileInputStream(tempFiles.get(sample)); + } catch (IOException e) { + throw new ReviewedStingException("Error opening temp file for input.",e); + } + + + try { + int ttr = numSites/4 + (genotypeCount > 0 ? 1 : 0); + for ( ; ttr > BUFFER_SIZE ; ttr -= BUFFER_SIZE ) { + byte[] readGenotypes = new byte[BUFFER_SIZE]; + inStream.read(readGenotypes); + outBed.write(readGenotypes); + } + if ( ttr > 0 ) { + byte[] readGenotypes = new byte[ttr]; + inStream.read(readGenotypes); + outBed.write(readGenotypes); + } + } catch (IOException e) { + throw new ReviewedStingException("Error reading form temp file for input.",e); + } + } + + } + + private byte getEncoding(Genotype g, int offset, boolean altMajor) { + if ( ! 
altMajor ) { + return getStandardEncoding(g,offset); + } + + return getFlippedEncoding(g,offset); + } + + private byte getStandardEncoding(Genotype g, int offset) { byte b; if ( g.hasAttribute(VCFConstants.GENOTYPE_QUALITY_KEY) && ((Integer) g.getAttribute(VCFConstants.GENOTYPE_QUALITY_KEY)) < minGenotypeQuality ) { b = NO_CALL; @@ -194,11 +303,38 @@ public class VariantsToPed extends RodWalker { return (byte) (b << (2*offset)); } + private byte getFlippedEncoding(Genotype g, int offset) { + byte b; + if ( g.hasAttribute(VCFConstants.GENOTYPE_QUALITY_KEY) && ((Integer) g.getAttribute(VCFConstants.GENOTYPE_QUALITY_KEY)) < minGenotypeQuality ) { + b = NO_CALL; + } else if ( g.isHomRef() ) { + b = HOM_VAR; + } else if ( g.isHomVar() ) { + b = HOM_REF; + } else if ( g.isHet() ) { + b = HET; + } else { + b = NO_CALL; + } + + return (byte) (b << (2*offset)); + } + private static String getID(VariantContext v) { if ( v.hasID() ) { return v.getID(); } else { - return String.format("SNP-%s-%d",v.getChr(),v.getStart()); + return String.format("Var-%s-%d",v.getChr(),v.getStart()); + } + } + + private double getAF(Object o) { + if ( (o instanceof String) ) { + return Double.parseDouble((String) o); + } else if ( (o instanceof Double) ) { + return (Double) o; + } else { + throw new UserException("Allele frequency appears to be neither String nor Double. 
Please check the header of your VCF."); } } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java index 4c8e8df5c..46a3ba39c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java @@ -95,8 +95,13 @@ import java.util.*; * @since 2010 */ public class VariantsToTable extends RodWalker { - @ArgumentCollection - protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); + /** + * Variants from this VCF file are used by this tool as input. + * The file must at least contain the standard VCF header lines, but + * can be empty (i.e., no variants are contained in the file). + */ + @Input(fullName="variant", shortName = "V", doc="Input VCF file", required=true) + public List> variants; @Output(doc="File to which results should be written",required=true) protected PrintStream out; @@ -155,7 +160,7 @@ public class VariantsToTable extends RodWalker { if ( tracker == null ) // RodWalkers can make funky map calls return 0; - for ( VariantContext vc : tracker.getValues(variantCollection.variants, context.getLocation())) { + for ( VariantContext vc : tracker.getValues(variants, context.getLocation())) { if ( showFiltered || vc.isNotFiltered() ) { for ( final List record : extractFields(vc, fieldsToTake, ALLOW_MISSING_DATA, splitMultiAllelic) ) out.println(Utils.join("\t", record)); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java index f5928b723..05865b587 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java +++ 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java @@ -216,12 +216,12 @@ public class VariantsToVCF extends RodWalker { Set hInfo = new HashSet(); hInfo.addAll(VCFUtils.getHeaderFields(getToolkit(), Arrays.asList(variants.getName()))); //hInfo.add(new VCFHeaderLine("source", "VariantsToVCF")); - //hInfo.add(new VCFHeaderLine("reference", getToolkit().getArguments().referenceFile.getName())); + //hInfo.add(new VCFHeaderLine("reference", getToolkit().getArguments().referenceFile.getID())); allowedGenotypeFormatStrings.add(VCFConstants.GENOTYPE_KEY); for ( VCFHeaderLine field : hInfo ) { if ( field instanceof VCFFormatHeaderLine) { - allowedGenotypeFormatStrings.add(((VCFFormatHeaderLine)field).getName()); + allowedGenotypeFormatStrings.add(((VCFFormatHeaderLine)field).getID()); } } diff --git a/public/java/src/org/broadinstitute/sting/queue/extensions/gatk/GATKExtensionsGenerator.java b/public/java/src/org/broadinstitute/sting/queue/extensions/gatk/GATKExtensionsGenerator.java index a3f80af1c..dcdef5aab 100644 --- a/public/java/src/org/broadinstitute/sting/queue/extensions/gatk/GATKExtensionsGenerator.java +++ b/public/java/src/org/broadinstitute/sting/queue/extensions/gatk/GATKExtensionsGenerator.java @@ -194,6 +194,7 @@ public class GATKExtensionsGenerator extends CommandLineProgram { */ private static final List gatkPackages = Arrays.asList( "org.broadinstitute.sting.gatk", + "org.broadinstitute.sting.pipeline", "org.broadinstitute.sting.analyzecovariates", "org.broadinstitute.sting.gatk.datasources.reads.utilities"); @@ -251,7 +252,7 @@ public class GATKExtensionsGenerator extends CommandLineProgram { */ private void writeFilter(String className, List argumentFields, Set> dependents) throws IOException { String content = getContent(TRAIT_TEMPLATE, "org.broadinstitute.sting.queue.function.CommandLineFunction", - className, "", false, String.format(" + \" -read_filter %s\"", className), argumentFields, dependents); + className, 
"", false, String.format(" + \" --read_filter %s\"", className), argumentFields, dependents); writeFile(GATK_EXTENSIONS_PACKAGE_NAME + "." + className, content); } diff --git a/public/java/src/org/broadinstitute/sting/utils/BitSetUtils.java b/public/java/src/org/broadinstitute/sting/utils/BitSetUtils.java new file mode 100644 index 000000000..6d3493211 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/BitSetUtils.java @@ -0,0 +1,284 @@ +package org.broadinstitute.sting.utils; + +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.io.ByteArrayOutputStream; +import java.io.ObjectOutputStream; +import java.util.BitSet; + +/** + * Utilities for bitset conversion + * + * @author Mauricio Carneiro + * @since 3/5/12 + */ +public class BitSetUtils { + + static final private int MAX_DNA_CONTEXT = 31; // the maximum context size (number of bases) permitted in the "long bitset" implementation of the DNA <=> BitSet conversion. + static final private byte NBITS_LONG_REPRESENTATION = 64; // the number of bits used in the long version to represent the bit set (necessary for the two's complement representation of negative numbers) + static final private byte NBITS_SHORT_REPRESENTATION = 16; // the number of bits used in the short version to represent the bit set (necessary for the two's complement representation of negative numbers) + static final long[] combinationsPerLength = new long[MAX_DNA_CONTEXT + 1]; // keeps the memoized table with the number of combinations for each given DNA context length + + /** + * Creates an long out of a bitset + * + * @param bitSet the bitset + * @return a long from the bitset representation + */ + public static long longFrom(final BitSet bitSet) { + return longFrom(bitSet, NBITS_LONG_REPRESENTATION); + } + + /** + * Creates a short integer from a bitset + * + * @param bitSet the bitset + * @return a short from the bitset representation + */ + public static short shortFrom(final BitSet bitSet) { 
+ return (short) longFrom(bitSet, NBITS_SHORT_REPRESENTATION); + } + + /** + * Cretes an integer with any number of bits (up to 64 -- long precision) from a bitset + * + * @param bitSet the bitset + * @param nBits the number of bits to be used for this representation + * @return an integer with nBits from the bitset representation + */ + public static long longFrom(final BitSet bitSet, final int nBits) { + long number = 0; + for (int bitIndex = bitSet.nextSetBit(0); bitIndex >= 0 && bitIndex <= nBits; bitIndex = bitSet.nextSetBit(bitIndex + 1)) + number |= 1L << bitIndex; + + return number; + } + + /** + * Creates a BitSet representation of a given long + * + * @param number the number to turn into a bitset + * @return a bitset representation of the long + */ + public static BitSet bitSetFrom(long number) { + return bitSetFrom(number, NBITS_LONG_REPRESENTATION); + } + + /** + * Creates a BitSet representation of a given short + * + * @param number the number to turn into a bitset + * @return a bitset representation of the short + */ + public static BitSet bitSetFrom(short number) { + return bitSetFrom(number, NBITS_SHORT_REPRESENTATION); + } + + /** + * Creates a BitSet representation of an arbitrary integer (number of bits capped at 64 -- long precision) + * + * @param number the number to turn into a bitset + * @param nBits the number of bits to use as precision for this conversion + * @return a bitset representation of the integer + */ + public static BitSet bitSetFrom(long number, int nBits) { + BitSet bitSet = new BitSet(); + boolean isNegative = number < 0; + int bitIndex = 0; + while (number != 0) { + if (number % 2 != 0) + bitSet.set(bitIndex); + bitIndex++; + number /= 2; + } + if (isNegative) { + boolean foundFirstSetBit = false; + for (int i = bitSet.nextSetBit(0); i < nBits && i >= 0; i++) { + boolean bit = bitSet.get(i); + if (!foundFirstSetBit && bit) + foundFirstSetBit = true; // maintain all bits until the first 1 is found (inclusive) + else if 
(foundFirstSetBit) + bitSet.flip(i); // flip every other bit up to NBITS_REPRESENTATION + } + } + return bitSet; + } + + /** + * Converts a BitSet into the dna string representation. + * + * Warning: This conversion is limited to long precision, therefore the dna sequence cannot + * be longer than 31 bases. To increase this limit, use BigNumbers instead of long and create + * a bitSetFrom(BigNumber) method. + * + * We calculate the length of the resulting DNA sequence by looking at the sum(4^i) that exceeds the + * base_10 representation of the sequence. This is important for us to know how to bring the number + * to a quasi-canonical base_4 representation, and to fill in leading A's (since A's are represented + * as 0's and leading 0's are omitted). + * + * quasi-canonical because A is represented by a 0, therefore, + * instead of : 0, 1, 2, 3, 10, 11, 12, ... + * we have : 0, 1, 2, 3, 00, 01, 02, ... + * + * but we can correctly decode it because we know the final length. + * + * @param bitSet the bitset representation of the dna sequence + * @return the dna sequence represented by the bitset + */ + public static String dnaFrom(final BitSet bitSet) { + long number = longFrom(bitSet); // the base_10 representation of the bit set + if (number < 0) + throw new ReviewedStingException("dna conversion cannot handle negative numbers. 
Possible overflow?"); + + int length = contextLengthFor(number); // the length of the context (the number of combinations is memoized, so costs zero to separate this into two method calls) + number -= combinationsFor(length - 1); // subtract the the number of combinations of the preceding context from the number to get to the quasi-canonical representation + + String dna = ""; + while (number > 0) { // perform a simple base_10 to base_4 conversion (quasi-canonical) + byte base = (byte) (number % 4); + switch (base) { + case 0: + dna = "A" + dna; + break; + case 1: + dna = "C" + dna; + break; + case 2: + dna = "G" + dna; + break; + case 3: + dna = "T" + dna; + break; + } + number /= 4; + } + for (int j = dna.length(); j < length; j++) + dna = "A" + dna; // add leading A's as necessary (due to the "quasi" canonical status, see description above) + + return dna; + } + + /** + * Creates a BitSet representation of a given dna string. + * + * Warning: This conversion is limited to long precision, therefore the dna sequence cannot + * be longer than 31 bases. To increase this limit, use BigNumbers instead of long and create + * a bitSetFrom(BigNumber) method. + * + * The bit representation of a dna string is the simple: + * 0 A 4 AA 8 CA + * 1 C 5 AC ... + * 2 G 6 AG 1343 TTGGT + * 3 T 7 AT 1364 TTTTT + * + * To convert from dna to number, we convert the dna string to base10 and add all combinations that + * preceded the string (with smaller lengths). + * + * @param dna the dna sequence + * @return the bitset representing the dna sequence + */ + public static BitSet bitSetFrom(String dna) { + if (dna.length() > MAX_DNA_CONTEXT) + throw new ReviewedStingException(String.format("DNA Length cannot be bigger than %d. 
dna: %s (%d)", MAX_DNA_CONTEXT, dna, dna.length())); + + long baseTen = 0; // the number in base_10 that we are going to use to generate the bit set + long preContext = combinationsFor(dna.length() - 1); // the sum of all combinations that preceded the length of the dna string + for (int i = 0; i < dna.length(); i++) { + baseTen *= 4; + switch (dna.charAt(i)) { + case 'A': + baseTen += 0; + break; + case 'C': + baseTen += 1; + break; + case 'G': + baseTen += 2; + break; + case 'T': + baseTen += 3; + break; + } + } + return bitSetFrom(baseTen + preContext); // the number representing this DNA string is the base_10 representation plus all combinations that preceded this string length. + } + + /** + * Calculates the number of bits necessary to represent a given number of elements + * + * @param numberOfElements the number of elements to represent (must be positive) + * @return the number of bits necessary to represent this many elements + */ + public static int numberOfBitsToRepresent(long numberOfElements) { + if (numberOfElements < 0) + throw new ReviewedStingException("Number of elements must be positive: " + numberOfElements); + + if (numberOfElements == 1L) + return 1; // special case + + int n = 0; + numberOfElements--; + while (numberOfElements > 0) { + numberOfElements = numberOfElements >> 1; + n++; + } + return n; + } + + /** + * Calculates the length of the DNA context for a given base 10 number + * + * It is important to know the length given the base 10 number to calculate the number of combinations + * and to disambiguate the "quasi-canonical" state. + * + * This method also calculates the number of combinations as a by-product, but since it memoizes the + * results, a subsequent call to combinationsFor(length) is O(1). 
+ * + * @param number the base 10 representation of the bitset + * @return the length of the DNA context represented by this number + */ + private static int contextLengthFor(long number) { + int length = 1; // the calculated length of the DNA sequence given the base_10 representation of its BitSet. + long combinations = combinationsFor(length); // the next context (we advance it so we know which one was preceding it). + while (combinations <= number) { // find the length of the dna string (length) + length++; + combinations = combinationsFor(length); // calculate the next context + } + return length; + } + + /** + * The sum of all combinations of a context of a given length from length = 0 to length. + * + * Memoized implementation of sum(4^i) , where i=[0,length] + * + * @param length the length of the DNA context + * @return the sum of all combinations leading up to this context length. + */ + private static long combinationsFor(int length) { + if (length > MAX_DNA_CONTEXT) + throw new ReviewedStingException(String.format("Context cannot be longer than %d bases but requested %d.", MAX_DNA_CONTEXT, length)); + + // only calculate the number of combinations if the table hasn't already cached the value + if (length > 0 && combinationsPerLength[length] == 0) { + long combinations = 0L; + for (int i = 1; i <= length; i++) + combinations += (1L << 2 * i); // add all combinations with 4^i ( 4^i is the same as 2^(2*i) ) + combinationsPerLength[length] = combinations; + } + return combinationsPerLength[length]; + } + + + public static byte[] sizeOf(Object obj) throws java.io.IOException + { + ByteArrayOutputStream byteObject = new ByteArrayOutputStream(); + ObjectOutputStream objectOutputStream = new ObjectOutputStream(byteObject); + objectOutputStream.writeObject(obj); + objectOutputStream.flush(); + objectOutputStream.close(); + byteObject.close(); + + return byteObject.toByteArray(); + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java 
b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java index 085794bab..03c7d279b 100755 --- a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java @@ -24,24 +24,26 @@ package org.broadinstitute.sting.utils; +import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import net.sf.samtools.Cigar; -import net.sf.samtools.CigarElement; -import net.sf.samtools.CigarOperator; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.sam.ReadUtils; import org.broadinstitute.sting.utils.variantcontext.Allele; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import java.util.Arrays; -import java.util.Iterator; -import java.util.LinkedHashMap; -import java.util.List; +import java.util.*; public class Haplotype { protected final byte[] bases; protected final double[] quals; private GenomeLoc genomeLocation = null; - private boolean isReference = false; + private HashMap readLikelihoodsPerSample = null; + private HashMap eventMap = null; + private boolean isRef = false; + private Cigar cigar; + private int alignmentStartHapwrtRef; /** * Create a simple consensus sequence with provided bases and a uniform quality over all bases of qual @@ -69,15 +71,47 @@ public class Haplotype { this.genomeLocation = loc; } - public Haplotype(byte[] bases, GenomeLoc loc, boolean isRef) { - this(bases, loc); - this.isReference = isRef; - } - @Override public boolean equals( Object h ) { return h instanceof Haplotype && Arrays.equals(bases, ((Haplotype) h).bases); } + + @Override + public int hashCode() { + return Arrays.hashCode(bases); + } + + public void addReadLikelihoods( final String sample, final double[] readLikelihoods ) { + if( readLikelihoodsPerSample == null ) { + readLikelihoodsPerSample = new HashMap(); + } + 
readLikelihoodsPerSample.put(sample, readLikelihoods); + } + + @Ensures({"result != null"}) + public double[] getReadLikelihoods( final String sample ) { + return readLikelihoodsPerSample.get(sample); + } + + public Set getSampleKeySet() { + return readLikelihoodsPerSample.keySet(); + } + + public HashMap getEventMap() { + return eventMap; + } + + public void setEventMap( final HashMap eventMap ) { + this.eventMap = eventMap; + } + + public boolean isReference() { + return isRef; + } + + public void setIsReference( boolean isRef ) { + this.isRef = isRef; + } public double getQualitySum() { double s = 0; @@ -87,12 +121,9 @@ public class Haplotype { return s; } + @Override public String toString() { - String returnString = ""; - for(int iii = 0; iii < bases.length; iii++) { - returnString += (char) bases[iii]; - } - return returnString; + return new String(bases); } public double[] getQuals() { @@ -110,15 +141,27 @@ public class Haplotype { return genomeLocation.getStop(); } - public boolean isReference() { - return isReference; + public int getAlignmentStartHapwrtRef() { + return alignmentStartHapwrtRef; } - @Requires({"refInsertLocation >= 0", "hapStartInRefCoords >= 0"}) - public byte[] insertAllele( final Allele refAllele, final Allele altAllele, int refInsertLocation, final int hapStartInRefCoords, final Cigar haplotypeCigar ) { + public void setAlignmentStartHapwrtRef( final int alignmentStartHapwrtRef ) { + this.alignmentStartHapwrtRef = alignmentStartHapwrtRef; + } + + public Cigar getCigar() { + return cigar; + } + + public void setCigar( final Cigar cigar ) { + this.cigar = cigar; + } + + @Requires({"refInsertLocation >= 0"}) + public byte[] insertAllele( final Allele refAllele, final Allele altAllele, int refInsertLocation ) { if( refAllele.length() != altAllele.length() ) { refInsertLocation++; } - int haplotypeInsertLocation = getHaplotypeCoordinateForReferenceCoordinate(hapStartInRefCoords, haplotypeCigar, refInsertLocation); + int 
haplotypeInsertLocation = ReadUtils.getReadCoordinateForReferenceCoordinate(alignmentStartHapwrtRef, cigar, refInsertLocation, ReadUtils.ClippingTail.RIGHT_TAIL, true); if( haplotypeInsertLocation == -1 ) { // desired change falls inside deletion so don't bother creating a new haplotype return bases.clone(); } @@ -162,7 +205,6 @@ public class Haplotype { public static LinkedHashMap makeHaplotypeListFromAlleles(List alleleList, int startPos, ReferenceContext ref, final int haplotypeSize, final int numPrefBases) { - LinkedHashMap haplotypeMap = new LinkedHashMap(); Allele refAllele = null; @@ -195,104 +237,23 @@ public class Haplotype { // Create location for all haplotypes - int startLoc = ref.getWindow().getStart() + startIdxInReference; - int stopLoc = startLoc + haplotypeSize-1; + final int startLoc = ref.getWindow().getStart() + startIdxInReference; + final int stopLoc = startLoc + haplotypeSize-1; - GenomeLoc locus = ref.getGenomeLocParser().createGenomeLoc(ref.getLocus().getContig(),startLoc,stopLoc); + final GenomeLoc locus = ref.getGenomeLocParser().createGenomeLoc(ref.getLocus().getContig(),startLoc,stopLoc); - for (Allele a : alleleList) { + for (final Allele a : alleleList) { byte[] alleleBases = a.getBases(); // use string concatenation String haplotypeString = new String(basesBeforeVariant) + new String(alleleBases) + new String(basesAfterVariant); haplotypeString = haplotypeString.substring(0,haplotypeSize); - haplotypeMap.put(a,new Haplotype(haplotypeString.getBytes(), locus, a.isReference())); + haplotypeMap.put(a,new Haplotype(haplotypeString.getBytes(), locus)); } return haplotypeMap; } - - private static Integer getHaplotypeCoordinateForReferenceCoordinate( final int haplotypeStart, final Cigar haplotypeCigar, final int refCoord ) { - int readBases = 0; - int refBases = 0; - boolean fallsInsideDeletion = false; - - int goal = refCoord - haplotypeStart; // The goal is to move this many reference bases - boolean goalReached = refBases == goal; - - 
Iterator cigarElementIterator = haplotypeCigar.getCigarElements().iterator(); - while (!goalReached && cigarElementIterator.hasNext()) { - CigarElement cigarElement = cigarElementIterator.next(); - int shift = 0; - - if (cigarElement.getOperator().consumesReferenceBases() || cigarElement.getOperator() == CigarOperator.SOFT_CLIP) { - if (refBases + cigarElement.getLength() < goal) - shift = cigarElement.getLength(); - else - shift = goal - refBases; - - refBases += shift; - } - goalReached = refBases == goal; - - if (!goalReached && cigarElement.getOperator().consumesReadBases()) - readBases += cigarElement.getLength(); - - if (goalReached) { - // Is this base's reference position within this cigar element? Or did we use it all? - boolean endsWithinCigar = shift < cigarElement.getLength(); - - // If it isn't, we need to check the next one. There should *ALWAYS* be a next one - // since we checked if the goal coordinate is within the read length, so this is just a sanity check. - if (!endsWithinCigar && !cigarElementIterator.hasNext()) - return -1; - - CigarElement nextCigarElement; - - // if we end inside the current cigar element, we just have to check if it is a deletion - if (endsWithinCigar) - fallsInsideDeletion = cigarElement.getOperator() == CigarOperator.DELETION; - - // if we end outside the current cigar element, we need to check if the next element is an insertion or deletion. - else { - nextCigarElement = cigarElementIterator.next(); - - // if it's an insertion, we need to clip the whole insertion before looking at the next element - if (nextCigarElement.getOperator() == CigarOperator.INSERTION) { - readBases += nextCigarElement.getLength(); - if (!cigarElementIterator.hasNext()) - return -1; - - nextCigarElement = cigarElementIterator.next(); - } - - // if it's a deletion, we will pass the information on to be handled downstream. 
- fallsInsideDeletion = nextCigarElement.getOperator() == CigarOperator.DELETION; - } - - // If we reached our goal outside a deletion, add the shift - if (!fallsInsideDeletion && cigarElement.getOperator().consumesReadBases()) - readBases += shift; - - // If we reached our goal inside a deletion, but the deletion is the next cigar element then we need - // to add the shift of the current cigar element but go back to it's last element to return the last - // base before the deletion (see warning in function contracts) - else if (fallsInsideDeletion && !endsWithinCigar) - readBases += shift - 1; - - // If we reached our goal inside a deletion then we must backtrack to the last base before the deletion - else if (fallsInsideDeletion && endsWithinCigar) - readBases--; - } - } - - if (!goalReached) - return -1; - - return (fallsInsideDeletion ? -1 : readBases); - } - } diff --git a/public/java/src/org/broadinstitute/sting/utils/IndelUtils.java b/public/java/src/org/broadinstitute/sting/utils/IndelUtils.java index 74f147127..c6ca39f4b 100755 --- a/public/java/src/org/broadinstitute/sting/utils/IndelUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/IndelUtils.java @@ -224,10 +224,6 @@ public class IndelUtils { return inds; } - public static String[] getIndelClassificationNames() { - return COLUMN_KEYS; - } - public static String getIndelClassificationName(int k) { if (k >=0 && k < COLUMN_KEYS.length) return COLUMN_KEYS[k]; @@ -235,35 +231,6 @@ public class IndelUtils { throw new ReviewedStingException("Invalid index when trying to get indel classification name"); } - public static boolean isATExpansion(VariantContext vc, ReferenceContext ref) { - ArrayList inds = findEventClassificationIndex(vc, ref); - - boolean isIt = false; - for (int k : inds) { - if (k == IND_FOR_REPEAT_EXPANSION_A || k == IND_FOR_REPEAT_EXPANSION_T) { - isIt = true; - break; - } - } - - return isIt; - - } - public static boolean isCGExpansion(VariantContext vc, ReferenceContext ref) { 
- ArrayList inds = findEventClassificationIndex(vc, ref); - - boolean isIt = false; - for (int k : inds) { - if (k == IND_FOR_REPEAT_EXPANSION_C || k == IND_FOR_REPEAT_EXPANSION_G) { - isIt = true; - break; - } - } - - return isIt; - - } - public static boolean isInsideExtendedIndel(VariantContext vc, ReferenceContext ref) { return (vc.getStart() != ref.getLocus().getStart()); } diff --git a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java index a96cbffc5..e8b05b525 100644 --- a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java @@ -49,20 +49,25 @@ public class MathUtils { } public static final double[] log10Cache; + public static final double[] log10FactorialCache; private static final double[] jacobianLogTable; private static final double JACOBIAN_LOG_TABLE_STEP = 0.001; - private static final double MAX_JACOBIAN_TOLERANCE = 10.0; + private static final double JACOBIAN_LOG_TABLE_INV_STEP = 1.0 / 0.001; + private static final double MAX_JACOBIAN_TOLERANCE = 8.0; private static final int JACOBIAN_LOG_TABLE_SIZE = (int) (MAX_JACOBIAN_TOLERANCE / JACOBIAN_LOG_TABLE_STEP) + 1; private static final int MAXN = 11000; private static final int LOG10_CACHE_SIZE = 4 * MAXN; // we need to be able to go up to 2*(2N) when calculating some of the coefficients static { log10Cache = new double[LOG10_CACHE_SIZE]; + log10FactorialCache = new double[LOG10_CACHE_SIZE]; jacobianLogTable = new double[JACOBIAN_LOG_TABLE_SIZE]; log10Cache[0] = Double.NEGATIVE_INFINITY; - for (int k = 1; k < LOG10_CACHE_SIZE; k++) + for (int k = 1; k < LOG10_CACHE_SIZE; k++) { log10Cache[k] = Math.log10(k); + log10FactorialCache[k] = log10FactorialCache[k-1] + log10Cache[k]; + } for (int k = 0; k < JACOBIAN_LOG_TABLE_SIZE; k++) { jacobianLogTable[k] = Math.log10(1.0 + Math.pow(10.0, -((double) k) * JACOBIAN_LOG_TABLE_STEP)); @@ -74,7 +79,7 @@ public 
class MathUtils { // under/overflow checking, so this shouldn't be used in the general case (but is fine // if one is already make those checks before calling in to the rounding). public static int fastRound(double d) { - return (d > 0) ? (int) (d + 0.5d) : (int) (d - 0.5d); + return (d > 0.0) ? (int) (d + 0.5d) : (int) (d - 0.5d); } public static double approximateLog10SumLog10(final double[] vals) { @@ -85,8 +90,6 @@ public class MathUtils { final int maxElementIndex = MathUtils.maxElementIndex(vals, endIndex); double approxSum = vals[maxElementIndex]; - if (approxSum == Double.NEGATIVE_INFINITY) - return approxSum; for (int i = 0; i < endIndex; i++) { if (i == maxElementIndex || vals[i] == Double.NEGATIVE_INFINITY) @@ -95,7 +98,7 @@ public class MathUtils { final double diff = approxSum - vals[i]; if (diff < MathUtils.MAX_JACOBIAN_TOLERANCE) { // See notes from the 2-inout implementation below - final int ind = fastRound(diff / MathUtils.JACOBIAN_LOG_TABLE_STEP); // hard rounding + final int ind = fastRound(diff * MathUtils.JACOBIAN_LOG_TABLE_INV_STEP); // hard rounding approxSum += MathUtils.jacobianLogTable[ind]; } } @@ -103,6 +106,10 @@ public class MathUtils { return approxSum; } + public static double approximateLog10SumLog10(double a, double b, double c) { + return approximateLog10SumLog10(a, approximateLog10SumLog10(b, c)); + } + public static double approximateLog10SumLog10(double small, double big) { // make sure small is really the smaller value if (small > big) { @@ -124,15 +131,15 @@ public class MathUtils { // max(x,y) + log10(1+10^-abs(x-y)) // we compute the second term as a table lookup with integer quantization // we have pre-stored correction for 0,0.1,0.2,... 
10.0 - final int ind = fastRound(diff / MathUtils.JACOBIAN_LOG_TABLE_STEP); // hard rounding + final int ind = fastRound(diff * MathUtils.JACOBIAN_LOG_TABLE_INV_STEP); // hard rounding return big + MathUtils.jacobianLogTable[ind]; } - public static double sum(Collection numbers) { + public static double sum(Collection numbers) { return sum(numbers, false); } - public static double sum(Collection numbers, boolean ignoreNan) { + public static double sum(Collection numbers, boolean ignoreNan) { double sum = 0; for (Number n : numbers) { if (!ignoreNan || !Double.isNaN(n.doubleValue())) { @@ -152,8 +159,8 @@ public class MathUtils { return size; } - public static double average(Collection x) { - return (double) sum(x) / x.size(); + public static double average(Collection x) { + return sum(x) / x.size(); } public static double average(Collection numbers, boolean ignoreNan) { @@ -206,7 +213,7 @@ public class MathUtils { /** * Calculates the log10 cumulative sum of an array with log10 probabilities * - * @param log10p the array with log10 probabilites + * @param log10p the array with log10 probabilities * @param upTo index in the array to calculate the cumsum up to * @return the log10 of the cumulative sum */ @@ -234,7 +241,10 @@ public class MathUtils { public static double log10sumLog10(double[] log10p, int start, int finish) { double sum = 0.0; - double maxValue = Utils.findMaxEntry(log10p); + double maxValue = arrayMax(log10p, finish); + if(maxValue == Double.NEGATIVE_INFINITY) + return maxValue; + for (int i = start; i < finish; i++) { sum += Math.pow(10.0, log10p[i] - maxValue); } @@ -548,7 +558,7 @@ public class MathUtils { // for precision purposes, we need to add (or really subtract, since they're // all negative) the largest value; also, we need to convert to normal-space. 
- double maxValue = Utils.findMaxEntry(array); + double maxValue = arrayMax(array); // we may decide to just normalize in log space without converting to linear space if (keepInLogSpace) { @@ -592,12 +602,12 @@ public class MathUtils { } public static int maxElementIndex(final double[] array, final int endIndex) { - if (array == null) + if (array == null || array.length == 0) throw new IllegalArgumentException("Array cannot be null!"); - int maxI = -1; - for (int i = 0; i < endIndex; i++) { - if (maxI == -1 || array[i] > array[maxI]) + int maxI = 0; + for (int i = 1; i < endIndex; i++) { + if (array[i] > array[maxI]) maxI = i; } @@ -609,22 +619,26 @@ public class MathUtils { } public static int maxElementIndex(final int[] array, int endIndex) { - if (array == null) + if (array == null || array.length == 0) throw new IllegalArgumentException("Array cannot be null!"); - int maxI = -1; - for (int i = 0; i < endIndex; i++) { - if (maxI == -1 || array[i] > array[maxI]) + int maxI = 0; + for (int i = 1; i < endIndex; i++) { + if (array[i] > array[maxI]) maxI = i; } return maxI; } - public static double arrayMax(double[] array) { + public static double arrayMax(final double[] array) { return array[maxElementIndex(array)]; } + public static double arrayMax(final double[] array, final int endIndex) { + return array[maxElementIndex(array, endIndex)]; + } + public static double arrayMin(double[] array) { return array[minElementIndex(array)]; } @@ -638,12 +652,12 @@ public class MathUtils { } public static int minElementIndex(double[] array) { - if (array == null) + if (array == null || array.length == 0) throw new IllegalArgumentException("Array cannot be null!"); - int minI = -1; - for (int i = 0; i < array.length; i++) { - if (minI == -1 || array[i] < array[minI]) + int minI = 0; + for (int i = 1; i < array.length; i++) { + if (array[i] < array[minI]) minI = i; } @@ -651,12 +665,12 @@ public class MathUtils { } public static int minElementIndex(byte[] array) { - if (array 
== null) + if (array == null || array.length == 0) throw new IllegalArgumentException("Array cannot be null!"); - int minI = -1; - for (int i = 0; i < array.length; i++) { - if (minI == -1 || array[i] < array[minI]) + int minI = 0; + for (int i = 1; i < array.length; i++) { + if (array[i] < array[minI]) minI = i; } @@ -664,12 +678,12 @@ public class MathUtils { } public static int minElementIndex(int[] array) { - if (array == null) + if (array == null || array.length == 0) throw new IllegalArgumentException("Array cannot be null!"); - int minI = -1; - for (int i = 0; i < array.length; i++) { - if (minI == -1 || array[i] < array[minI]) + int minI = 0; + for (int i = 1; i < array.length; i++) { + if (array[i] < array[minI]) minI = i; } @@ -1048,6 +1062,28 @@ public class MathUtils { } + /** + * Given two log-probability vectors, compute log of vector product of them: + * in Matlab notation, return log10(10.*x'*10.^y) + * @param x vector 1 + * @param y vector 2 + * @return a double representing log (dotProd(10.^x,10.^y) + */ + public static double logDotProduct(double [] x, double[] y) { + if (x.length != y.length) + throw new ReviewedStingException("BUG: Vectors of different lengths"); + + double tmpVec[] = new double[x.length]; + + for (int k=0; k < tmpVec.length; k++ ) { + tmpVec[k] = x[k]+y[k]; + } + + return log10sumLog10(tmpVec); + + + + } public static Object getMedian(List list) { return orderStatisticSearch((int) Math.ceil(list.size() / 2), list); } @@ -1100,13 +1136,6 @@ public class MathUtils { return getQScoreOrderStatistic(reads, offsets, (int) Math.floor(reads.size() / 2.)); } - public static long sum(Collection x) { - long sum = 0; - for (int v : x) - sum += v; - return sum; - } - /** * A utility class that computes on the fly average and standard deviation for a stream of numbers. 
* The number of observations does not have to be known in advance, and can be also very big (so that @@ -1494,6 +1523,24 @@ public class MathUtils { return result; } + /** Same routine, unboxed types for efficiency + * + * @param x + * @param y + * @return Vector of same length as x and y so that z[k] = x[k]+y[k] + */ + public static double[] vectorSum(double[]x, double[] y) { + if (x.length != y.length) + throw new ReviewedStingException("BUG: Lengths of x and y must be the same"); + + double[] result = new double[x.length]; + for (int k=0; k Double[] scalarTimesVector(E a, E[] v1) { Double result[] = new Double[v1.length]; @@ -1534,124 +1581,4 @@ public class MathUtils { } - /** - * Creates an integer out of a bitset - * - * @param bitSet the bitset - * @return an integer with the bitset representation - */ - public static long intFrom(final BitSet bitSet) { - long number = 0; - for (int bitIndex = bitSet.nextSetBit(0); bitIndex >= 0; bitIndex = bitSet.nextSetBit(bitIndex+1)) - number |= 1L << bitIndex; - - return number; - } - - /** - * Creates a BitSet representation of a given integer - * - * @param number the number to turn into a bitset - * @return a bitset representation of the integer - */ - public static BitSet bitSetFrom(long number) { - BitSet bitSet = new BitSet(); - int bitIndex = 0; - while (number > 0) { - if (number%2 > 0) - bitSet.set(bitIndex); - bitIndex++; - number /= 2; - } - return bitSet; - } - - /** - * Converts a BitSet into the dna string representation. - * - * Warning: This conversion is limited to long precision, therefore the dna sequence cannot - * be longer than 31 bases. To increase this limit, use BigNumbers instead of long and create - * a bitSetFrom(BigNumber) method. - * - * We calculate the length of the resulting DNA sequence by looking at the sum(4^i) that exceeds the - * base_10 representation of the sequence. 
This is important for us to know how to bring the number - * to a quasi-canonical base_4 representation, and to fill in leading A's (since A's are represented - * as 0's and leading 0's are omitted). - * - * quasi-canonical because A is represented by a 0, therefore, - * instead of : 0, 1, 2, 3, 10, 11, 12, ... - * we have : 0, 1, 2, 3, 00, 01, 02, ... - * - * but we can correctly decode it because we know the final length. - * - * @param bitSet the bitset representation of the dna sequence - * @return the dna sequence represented by the bitset - */ - public static String dnaFrom(final BitSet bitSet) { - long number = intFrom(bitSet); // the base_10 representation of the bit set - long preContext = 0; // the number of combinations skipped to get to the quasi-canonical representation (we keep it to subtract later) - long nextContext = 4; // the next context (we advance it so we know which one was preceding it). - int i = 1; // the calculated length of the DNA sequence given the base_10 representation of its BitSet. - while (nextContext <= number) { // find the length of the dna string (i) - preContext = nextContext; // keep track of the number of combinations in the preceding context - nextContext += Math.pow(4, ++i);// calculate the next context - } - number -= preContext; // subtract the the number of combinations of the preceding context from the number to get to the quasi-canonical representation - - String dna = ""; - while (number > 0) { // perform a simple base_10 to base_4 conversion (quasi-canonical) - byte base = (byte) (number % 4); - switch (base) { - case 0 : dna = "A" + dna; break; - case 1 : dna = "C" + dna; break; - case 2 : dna = "G" + dna; break; - case 3 : dna = "T" + dna; break; - } - number /= 4; - } - for (int j = dna.length(); j < i; j++) - dna = "A" + dna; // add leading A's as necessary (due to the "quasi" canonical status, see description above) - - return dna; - } - - /** - * Creates a BitSet representation of a given dna string. 
- * - * Warning: This conversion is limited to long precision, therefore the dna sequence cannot - * be longer than 31 bases. To increase this limit, use BigNumbers instead of long and create - * a bitSetFrom(BigNumber) method. - * - * The bit representation of a dna string is the simple: - * 0 A 4 AA 8 CA - * 1 C 5 AC ... - * 2 G 6 AG 1343 TTGGT - * 3 T 7 AT 1364 TTTTT - * - * To convert from dna to number, we convert the dna string to base10 and add all combinations that - * preceded the string (with smaller lengths). - * - * @param dna the dna sequence - * @return the bitset representing the dna sequence - */ - public static BitSet bitSetFrom(String dna) { - if (dna.length() > 31) - throw new ReviewedStingException(String.format("DNA Length cannot be bigger than 31. dna: %s (%d)", dna, dna.length())); - - long baseTen = 0; // the number in base_10 that we are going to use to generate the bit set - long preContext = 0; // the sum of all combinations that preceded the length of the dna string - for (int i=0; i0) - preContext += Math.pow(4, i); // each length will have 4^i combinations (e.g 1 = 4, 2 = 16, 3 = 64, ...) - } - - return bitSetFrom(baseTen+preContext); // the number representing this DNA string is the base_10 representation plus all combinations that preceded this string length. 
- } } diff --git a/public/java/src/org/broadinstitute/sting/utils/PairHMM.java b/public/java/src/org/broadinstitute/sting/utils/PairHMM.java new file mode 100644 index 000000000..9fcb97a4d --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/PairHMM.java @@ -0,0 +1,259 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; + +import java.util.*; + +/** + * Util class for performing the pair HMM for local alignment. Figure 4.3 in Durbin 1998 book. 
+ * User: rpoplin + * Date: 3/1/12 + */ + +public class PairHMM { + private static final int MAX_CACHED_QUAL = (int)Byte.MAX_VALUE; + private static final byte DEFAULT_GOP = (byte) 45; + private static final byte DEFAULT_GCP = (byte) 10; + private static final double BANDING_TOLERANCE = 22.0; + private static final int BANDING_CLUSTER_WINDOW = 12; + private final boolean noBanded; + + public PairHMM() { + noBanded = false; + } + + public PairHMM( final boolean noBanded ) { + this.noBanded = noBanded; + } + + + public static void initializeArrays(final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray, + final int X_METRIC_LENGTH) { + + for( int iii=0; iii < X_METRIC_LENGTH; iii++ ) { + Arrays.fill(matchMetricArray[iii], Double.NEGATIVE_INFINITY); + Arrays.fill(XMetricArray[iii], Double.NEGATIVE_INFINITY); + Arrays.fill(YMetricArray[iii], Double.NEGATIVE_INFINITY); + } + + // the initial condition + matchMetricArray[1][1] = 0.0; // Math.log10(1.0); + + } + + @Requires({"readBases.length == readQuals.length","readBases.length == insertionGOP.length","readBases.length == deletionGOP.length","readBases.length == overallGCP.length"}) + @Ensures({"!Double.isInfinite(result)", "!Double.isNaN(result)"}) // Result should be a proper log10 probability + public double computeReadLikelihoodGivenHaplotype( final byte[] haplotypeBases, final byte[] readBases, final byte[] readQuals, + final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP ) { + + // M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment + final int X_METRIC_LENGTH = readBases.length + 2; + final int Y_METRIC_LENGTH = haplotypeBases.length + 2; + + // initial arrays to hold the probabilities of being in the match, insertion and deletion cases + final double[][] matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + final double[][] 
XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + final double[][] YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + + initializeArrays(matchMetricArray, XMetricArray, YMetricArray, X_METRIC_LENGTH); + + return computeReadLikelihoodGivenHaplotype(haplotypeBases, readBases, readQuals, insertionGOP, deletionGOP, overallGCP, 0, matchMetricArray, XMetricArray, YMetricArray); + } + + @Requires({"readBases.length == readQuals.length","readBases.length == insertionGOP.length","readBases.length == deletionGOP.length","readBases.length == overallGCP.length"}) + @Ensures({"!Double.isInfinite(result)", "!Double.isNaN(result)"}) // Result should be a proper log10 probability + public double computeReadLikelihoodGivenHaplotype( final byte[] haplotypeBases, final byte[] readBases, final byte[] readQuals, + final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP, final int hapStartIndex, + final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) { + + // M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment + final int X_METRIC_LENGTH = readBases.length + 2; + final int Y_METRIC_LENGTH = haplotypeBases.length + 2; + + // ensure that all the qual scores have valid values + for( int iii = 0; iii < readQuals.length; iii++ ) { + readQuals[iii] = ( readQuals[iii] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : (readQuals[iii] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : readQuals[iii]) ); + } + + if( false ) { + final ArrayList workQueue = new ArrayList(); // holds a queue of starting work location (indices along the diagonal). 
Will be sorted each step + final ArrayList workToBeAdded = new ArrayList(); + final ArrayList calculatedValues = new ArrayList(); + final int numDiags = X_METRIC_LENGTH + Y_METRIC_LENGTH - 1; + workQueue.add( 1 ); // Always start a new thread at the baseline because of partially repeating sequences that match better in the latter half of the haplotype + + for(int diag = 3; diag < numDiags; diag++) { // diag = 3 is the (1,2) element of the metric arrays. (1,1) is the initial condition and is purposefully skipped over + //Collections.sort(workQueue); // no need to sort because elements are guaranteed to be in ascending order + int el = 1; + for( int work : workQueue ) { + // choose the appropriate diagonal baseline location + int iii = 0; + int jjj = diag; + if( diag > Y_METRIC_LENGTH ) { + iii = diag - Y_METRIC_LENGTH; + jjj = Y_METRIC_LENGTH; + } + // move to the starting work location along the diagonal + iii += work; + jjj -= work; + while( iii >= X_METRIC_LENGTH || jjj <= 0 ) { + iii--; + jjj++; + work--; + } + if( !detectClusteredStartLocations(workToBeAdded, work ) ) { + workToBeAdded.add(work); // keep this thread going once it has started + } + + if( work >= el - 3 ) { + // step along the diagonal in the forward direction, updating the match matrices and looking for a drop off from the maximum observed value + double maxElement = Double.NEGATIVE_INFINITY; + for( el = work; el < numDiags + 1; el++ ) { + updateCell(iii, jjj, haplotypeBases, readBases, readQuals, + insertionGOP, deletionGOP, overallGCP, matchMetricArray, XMetricArray, YMetricArray); + final double bestMetric = MathUtils.max(matchMetricArray[iii][jjj], XMetricArray[iii][jjj], YMetricArray[iii][jjj]); + calculatedValues.add(bestMetric); + if( bestMetric > maxElement ) { + maxElement = bestMetric; + } else if( maxElement - bestMetric > BANDING_TOLERANCE ) { + break; + } + if( ++iii >= X_METRIC_LENGTH ) { // don't walk off the edge of the matrix + break; + } + if( --jjj <= 0 ) { // don't walk off 
the edge of the matrix + break; + } + } + + // find a local maximum to start a new band in the work queue + double localMaxElement = Double.NEGATIVE_INFINITY; + int localMaxElementIndex = 0; + for(int kkk = calculatedValues.size()-1; kkk >= 1; kkk--) { + final double bestMetric = calculatedValues.get(kkk); + if( bestMetric > localMaxElement ) { + localMaxElement = bestMetric; + localMaxElementIndex = kkk; + } else if( localMaxElement - bestMetric > BANDING_TOLERANCE * 0.5 ) { // find a local maximum + if( !detectClusteredStartLocations(workToBeAdded, work + localMaxElementIndex ) ) { + workToBeAdded.add( work + localMaxElementIndex ); + } + break; + } + } + calculatedValues.clear(); + + // reset iii and jjj to the appropriate diagonal baseline location + iii = 0; + jjj = diag; + if( diag > Y_METRIC_LENGTH ) { + iii = diag - Y_METRIC_LENGTH; + jjj = Y_METRIC_LENGTH; + } + // move to the starting work location along the diagonal + iii += work-1; + jjj -= work-1; + + // step along the diagonal in the reverse direction, updating the match matrices and looking for a drop off from the maximum observed value + for( int traceBack = work - 1; traceBack > 0 && iii > 0 && jjj < Y_METRIC_LENGTH; traceBack--,iii--,jjj++ ) { + updateCell(iii, jjj, haplotypeBases, readBases, readQuals, + insertionGOP, deletionGOP, overallGCP, matchMetricArray, XMetricArray, YMetricArray); + final double bestMetric = MathUtils.max(matchMetricArray[iii][jjj], XMetricArray[iii][jjj], YMetricArray[iii][jjj]); + if( bestMetric > maxElement ) { + maxElement = bestMetric; + } else if( maxElement - bestMetric > BANDING_TOLERANCE ) { + break; + } + } + } + } + workQueue.clear(); + workQueue.addAll(workToBeAdded); + workToBeAdded.clear(); + } + } else { + // simple rectangular version of update loop, slow + for( int iii = 1; iii < X_METRIC_LENGTH; iii++ ) { + for( int jjj = hapStartIndex + 1; jjj < Y_METRIC_LENGTH; jjj++ ) { + if( (iii == 1 && jjj == 1) ) { continue; } + updateCell(iii, jjj, 
haplotypeBases, readBases, readQuals, insertionGOP, deletionGOP, overallGCP, + matchMetricArray, XMetricArray, YMetricArray); + } + } + } + + // final probability is the log10 sum of the last element in all three state arrays + final int endI = X_METRIC_LENGTH - 1; + final int endJ = Y_METRIC_LENGTH - 1; + return MathUtils.approximateLog10SumLog10(matchMetricArray[endI][endJ], XMetricArray[endI][endJ], YMetricArray[endI][endJ]); + } + + private void updateCell( final int indI, final int indJ, final byte[] haplotypeBases, final byte[] readBases, + final byte[] readQuals, final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP, + final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) { + + // the read and haplotype indices are offset by one because the state arrays have an extra column to hold the initial conditions + final int im1 = indI - 1; + final int jm1 = indJ - 1; + + // update the match array + double pBaseReadLog10 = 0.0; // Math.log10(1.0); + if( im1 > 0 && jm1 > 0 ) { // the emission probability is applied when leaving the state + final byte x = readBases[im1-1]; + final byte y = haplotypeBases[jm1-1]; + final byte qual = readQuals[im1-1]; + pBaseReadLog10 = ( x == y || x == (byte) 'N' || y == (byte) 'N' ? QualityUtils.qualToProbLog10(qual) : QualityUtils.qualToErrorProbLog10(qual) ); + } + final int qualIndexGOP = ( im1 == 0 ? DEFAULT_GOP + DEFAULT_GOP : ( insertionGOP[im1-1] + deletionGOP[im1-1] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : insertionGOP[im1-1] + deletionGOP[im1-1]) ); + final double d0 = QualityUtils.qualToProbLog10((byte)qualIndexGOP); + final double e0 = ( im1 == 0 ? 
QualityUtils.qualToProbLog10(DEFAULT_GCP) : QualityUtils.qualToProbLog10(overallGCP[im1-1]) ); + matchMetricArray[indI][indJ] = pBaseReadLog10 + MathUtils.approximateLog10SumLog10(matchMetricArray[indI-1][indJ-1] + d0, XMetricArray[indI-1][indJ-1] + e0, YMetricArray[indI-1][indJ-1] + e0); + + // update the X (insertion) array + final double d1 = ( im1 == 0 ? QualityUtils.qualToErrorProbLog10(DEFAULT_GOP) : QualityUtils.qualToErrorProbLog10(insertionGOP[im1-1]) ); + final double e1 = ( im1 == 0 ? QualityUtils.qualToErrorProbLog10(DEFAULT_GCP) : QualityUtils.qualToErrorProbLog10(overallGCP[im1-1]) ); + final double qBaseReadLog10 = 0.0; // Math.log10(1.0) -- we don't have an estimate for this emission probability so assume q=1.0 + XMetricArray[indI][indJ] = qBaseReadLog10 + MathUtils.approximateLog10SumLog10(matchMetricArray[indI-1][indJ] + d1, XMetricArray[indI-1][indJ] + e1); + + // update the Y (deletion) array, with penalty of zero on the left and right flanks to allow for a local alignment within the haplotype + final double d2 = ( im1 == 0 || im1 == readBases.length ? 0.0 : QualityUtils.qualToErrorProbLog10(deletionGOP[im1-1]) ); + final double e2 = ( im1 == 0 || im1 == readBases.length ? 
0.0 : QualityUtils.qualToErrorProbLog10(overallGCP[im1-1]) ); + final double qBaseRefLog10 = 0.0; // Math.log10(1.0) -- we don't have an estimate for this emission probability so assume q=1.0 + YMetricArray[indI][indJ] = qBaseRefLog10 + MathUtils.approximateLog10SumLog10(matchMetricArray[indI][indJ-1] + d2, YMetricArray[indI][indJ-1] + e2); + } + + // private function used by the banded approach to ensure the proposed bands are sufficiently distinct from each other + private boolean detectClusteredStartLocations( final ArrayList list, int loc ) { + for(int x : list) { + if( Math.abs(x-loc) <= BANDING_CLUSTER_WINDOW ) { + return true; + } + } + return false; + } +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java b/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java index 7756ac71b..4acc0e2c3 100755 --- a/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java @@ -9,6 +9,7 @@ import net.sf.samtools.SAMUtils; * @author Kiran Garimella */ public class QualityUtils { + public final static byte MAX_RECALIBRATED_Q_SCORE = 93; public final static byte MAX_QUAL_SCORE = SAMUtils.MAX_PHRED_SCORE; public final static double ERROR_RATE_OF_MAX_QUAL_SCORE = qualToErrorProbRaw(MAX_QUAL_SCORE); @@ -22,6 +23,16 @@ public class QualityUtils { for (int i = 0; i < 256; i++) qualToErrorProbCache[i] = qualToErrorProbRaw(i); } + private static double qualToErrorProbLog10Cache[] = new double[256]; + static { + for (int i = 0; i < 256; i++) qualToErrorProbLog10Cache[i] = qualToErrorProbLog10Raw(i); + } + + private static double qualToProbLog10Cache[] = new double[256]; + static { + for (int i = 0; i < 256; i++) qualToProbLog10Cache[i] = qualToProbLog10Raw(i); + } + /** * Private constructor. No instantiating this class! */ @@ -31,7 +42,7 @@ public class QualityUtils { * Convert a quality score to a probability. 
This is the Phred-style * conversion, *not* the Illumina-style conversion (though asymptotically, they're the same). * - * @param qual a quality score (0-40) + * @param qual a quality score (0-255) * @return a probability (0.0-1.0) */ static public double qualToProb(byte qual) { @@ -42,6 +53,14 @@ public class QualityUtils { return 1.0 - Math.pow(10.0, qual/(-10.0)); } + static private double qualToProbLog10Raw(int qual) { + return Math.log10(1.0 - qualToErrorProbRaw(qual)); + } + + static public double qualToProbLog10(byte qual) { + return qualToProbLog10Cache[(int)qual & 0xff]; // Map: 127 -> 127; -128 -> 128; -1 -> 255; etc. + } + /** * Convert a quality score to a probability of error. This is the Phred-style * conversion, *not* the Illumina-style conversion (though asymptotically, they're the same). @@ -57,14 +76,14 @@ public class QualityUtils { return qualToErrorProbCache[(int)qual & 0xff]; // Map: 127 -> 127; -128 -> 128; -1 -> 255; etc. } - static public double[] qualArrayToLog10ErrorProb(byte[] quals) { - double[] returnArray = new double[quals.length]; - for( int iii = 0; iii < quals.length; iii++ ) { - returnArray[iii] = ((double) quals[iii])/-10.0; - } - return returnArray; + static private double qualToErrorProbLog10Raw(int qual) { + return ((double) qual)/-10.0; } - + + static public double qualToErrorProbLog10(byte qual) { + return qualToErrorProbLog10Cache[(int)qual & 0xff]; // Map: 127 -> 127; -128 -> 128; -1 -> 255; etc. + } + /** * Convert a probability to a quality score. Note, this is capped at Q40. 
* @@ -85,9 +104,8 @@ public class QualityUtils { */ static public byte probToQual(double prob, double eps) { double lp = Math.round(-10.0*Math.log10(1.0 - prob + eps)); - byte b = boundQual((int)lp); //System.out.printf("LP is %f, byte is %d%n", lp, b); - return b; + return boundQual((int)lp); } static public double phredScaleCorrectRate(double trueRate) { @@ -98,10 +116,6 @@ public class QualityUtils { return Math.abs(-10.0*Math.log10(errorRate)); } - static public double lodToPhredScaleErrorRate(double lod) { - return phredScaleErrorRate(1.0 / (Math.pow(10.0, lod) + 1.0)); - } - /** * Return a quality score, capped at max qual. * @@ -115,12 +129,11 @@ public class QualityUtils { /** * Returns an integer quality score bounded by 1 - maxQual. * - * @param qual - * @param maxQual - * @return + * @param qual the quality score + * @param maxQual the maximum quality + * @return the integer between 1 and maxQual. */ static public byte boundQual(int qual, byte maxQual) { - //return (byte) Math.min(qual, maxQual); return (byte) Math.max(Math.min(qual, maxQual), 1); } } diff --git a/public/java/src/org/broadinstitute/sting/utils/R/RUtils.java b/public/java/src/org/broadinstitute/sting/utils/R/RUtils.java new file mode 100644 index 000000000..b52eed5cf --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/R/RUtils.java @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions 
of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.R; + +import org.apache.commons.lang.StringUtils; + +import java.text.SimpleDateFormat; +import java.util.Collection; +import java.util.Date; + +public class RUtils { + /** + * Converts a collection of values to an R compatible list. A null list will return NA, + * otherwise the values will be escaped with single quotes and combined with c(). + * @param list Collection of values + * @return The R representation of the list + */ + public static String toStringList(Collection list) { + if (list == null) + return "NA"; + if (list.size() == 0) + return "c()"; + return "c('" + StringUtils.join(list, "','") + "')"; + } + + /** + * Converts a collection of values to an R compatible list. A null list will return NA, + * otherwise the values will be combined with c(). + * @param list Collection of values + * @return The R representation of the list + */ + public static String toNumberList(Collection list) { + return list == null ? "NA": "c(" + StringUtils.join(list, ",") + ")"; + } + + /** + * Converts a collection of values to an R compatible list. A null list will return NA, + * otherwise the date will be escaped with single quotes and combined with c(). 
+ * @param list Collection of values + * @return The R representation of the list + */ + public static String toDateList(Collection list) { + return toDateList(list, "''yyyy-MM-dd''"); + } + + /** + * Converts a collection of values to an R compatible list formatted by pattern. + * @param list Collection of values + * @param pattern format pattern string for each date + * @return The R representation of the list + */ + public static String toDateList(Collection list, String pattern) { + + if (list == null) + return "NA"; + SimpleDateFormat format = new SimpleDateFormat(pattern); + StringBuilder sb = new StringBuilder(); + sb.append("c("); + boolean first = true; + for (Date date : list) { + if (!first) sb.append(","); + sb.append(format.format(date)); + first = false; + } + sb.append(")"); + return sb.toString(); + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/SampleUtils.java b/public/java/src/org/broadinstitute/sting/utils/SampleUtils.java index 68b220aab..360a855fa 100755 --- a/public/java/src/org/broadinstitute/sting/utils/SampleUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/SampleUtils.java @@ -31,14 +31,13 @@ import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils; import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.text.ListFileUtils; import org.broadinstitute.sting.utils.text.XReadLines; import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; import java.io.File; import java.io.FileNotFoundException; import java.util.*; -import java.util.regex.Matcher; -import java.util.regex.Pattern; /** @@ -74,10 +73,10 @@ public class SampleUtils { * Same as @link getSAMFileSamples but gets all of the samples * in the SAM files loaded by the engine * - * @param engine - * @return + * @param engine engine + * @return samples */ - public final static Set 
getSAMFileSamples(GenomeAnalysisEngine engine) { + public static Set getSAMFileSamples(GenomeAnalysisEngine engine) { return SampleUtils.getSAMFileSamples(engine.getSAMFileHeader()); } @@ -209,89 +208,24 @@ public class SampleUtils { * we try to read a file named E from disk, and if possible all lines from that file are expanded * into unique sample names. * - * @param sampleArgs - * @return + * @param sampleArgs args + * @return samples */ public static Set getSamplesFromCommandLineInput(Collection sampleArgs) { if (sampleArgs != null) { - // Let's first go through the list and see if we were given any files. We'll add every entry in the file to our - // sample list set, and treat the entries as if they had been specified on the command line. - Set samplesFromFiles = new HashSet(); - for (String SAMPLE_EXPRESSION : sampleArgs) { - File sampleFile = new File(SAMPLE_EXPRESSION); - - try { - XReadLines reader = new XReadLines(sampleFile); - - List lines = reader.readLines(); - for (String line : lines) { - samplesFromFiles.add(line.trim()); - } - } catch (FileNotFoundException e) { - samplesFromFiles.add(SAMPLE_EXPRESSION); // not a file, so must be a sample - } - } - - return samplesFromFiles; + return ListFileUtils.unpackSet(sampleArgs); } return new HashSet(); } public static Set getSamplesFromCommandLineInput(Collection vcfSamples, Collection sampleExpressions) { - Set samples = new HashSet(); - - if (sampleExpressions != null) { - // Let's first go through the list and see if we were given any files. We'll add every entry in the file to our - // sample list set, and treat the entries as if they had been specified on the command line. 
- Set samplesFromFiles = new HashSet(); - for (String sampleExpression : sampleExpressions) { - File sampleFile = new File(sampleExpression); - - try { - XReadLines reader = new XReadLines(sampleFile); - - List lines = reader.readLines(); - for (String line : lines) { - samplesFromFiles.add(line); - } - } catch (FileNotFoundException e) { - // ignore exception - } - } - - sampleExpressions.addAll(samplesFromFiles); - - // Let's now assume that the values in sampleExpressions are literal sample names and not regular - // expressions. Extract those samples specifically so we don't make the mistake of selecting more - // than what the user really wants. - Set possibleSampleRegexs = new HashSet(); - for (String sampleExpression : sampleExpressions) { - if (!(new File(sampleExpression).exists())) { - if (vcfSamples.contains(sampleExpression)) { - samples.add(sampleExpression); - } else { - possibleSampleRegexs.add(sampleExpression); - } - } - } - - // Now, check the expressions that weren't used in the previous step, and use them as if they're regular expressions - for (String sampleRegex : possibleSampleRegexs) { - Pattern p = Pattern.compile(sampleRegex); - - for (String vcfSample : vcfSamples) { - Matcher m = p.matcher(vcfSample); - if (m.find()) { - samples.add(vcfSample); - } - } - } + Set samples = ListFileUtils.unpackSet(vcfSamples); + if (sampleExpressions == null) { + return samples; } else { - samples.addAll(vcfSamples); + return ListFileUtils.includeMatching(samples, sampleExpressions, false); } - - return samples; } /** @@ -304,16 +238,7 @@ public class SampleUtils { // Now, check the expressions that weren't used in the previous step, and use them as if they're regular expressions Set samples = new HashSet(); if (sampleExpressions != null) { - for (String expression : sampleExpressions) { - Pattern p = Pattern.compile(expression); - - for (String originalSample : originalSamples) { - Matcher m = p.matcher(originalSample); - if (m.find()) { - 
samples.add(originalSample); - } - } - } + samples.addAll(ListFileUtils.includeMatching(originalSamples, sampleExpressions, false)); } return samples; } diff --git a/public/java/src/org/broadinstitute/sting/utils/Utils.java b/public/java/src/org/broadinstitute/sting/utils/Utils.java index 10bc050da..7b627fba2 100755 --- a/public/java/src/org/broadinstitute/sting/utils/Utils.java +++ b/public/java/src/org/broadinstitute/sting/utils/Utils.java @@ -25,9 +25,15 @@ package org.broadinstitute.sting.utils; +import com.google.java.contract.Requires; +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMProgramRecord; import net.sf.samtools.util.StringUtil; import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.text.TextFormattingUtils; import java.net.InetAddress; import java.util.*; @@ -284,32 +290,6 @@ public class Utils { return m; } - - // returns the maximum value in the array - public static double findMaxEntry(double[] array) { - return findIndexAndMaxEntry(array).first; - } - - // returns the index of the maximum value in the array - public static int findIndexOfMaxEntry(double[] array) { - return findIndexAndMaxEntry(array).second; - } - - // returns the the maximum value and its index in the array - private static Pair findIndexAndMaxEntry(double[] array) { - if ( array.length == 0 ) - return new Pair(0.0, -1); - int index = 0; - double max = array[0]; - for (int i = 1; i < array.length; i++) { - if ( array[i] > max ) { - max = array[i]; - index = i; - } - } - return new Pair(max, index); - } - /** * Splits expressions in command args by spaces and returns the array of expressions. * Expressions may use single or double quotes to group any individual expression, but not both. 
@@ -668,4 +648,120 @@ public class Utils { array[i] = value; } + public static void setupWriter(StingSAMFileWriter writer, GenomeAnalysisEngine toolkit, boolean preSorted, boolean KEEP_ALL_PG_RECORDS, Object walker, String PROGRAM_RECORD_NAME) { + final SAMProgramRecord programRecord = createProgramRecord(toolkit, walker, PROGRAM_RECORD_NAME); + + SAMFileHeader header = toolkit.getSAMFileHeader(); + List oldRecords = header.getProgramRecords(); + List newRecords = new ArrayList(oldRecords.size()+1); + for ( SAMProgramRecord record : oldRecords ) + if ( !record.getId().startsWith(PROGRAM_RECORD_NAME) || KEEP_ALL_PG_RECORDS ) + newRecords.add(record); + + newRecords.add(programRecord); + header.setProgramRecords(newRecords); + + writer.writeHeader(header); + writer.setPresorted(preSorted); + } + + public static SAMProgramRecord createProgramRecord(GenomeAnalysisEngine toolkit, Object walker, String PROGRAM_RECORD_NAME) { + final SAMProgramRecord programRecord = new SAMProgramRecord(PROGRAM_RECORD_NAME); + final ResourceBundle headerInfo = TextFormattingUtils.loadResourceBundle("StingText"); + try { + final String version = headerInfo.getString("org.broadinstitute.sting.gatk.version"); + programRecord.setProgramVersion(version); + } catch (MissingResourceException e) { + // couldn't care less if the resource is missing... + } + programRecord.setCommandLine(toolkit.createApproximateCommandLineArgumentString(toolkit, walker)); + return programRecord; + } + + public static Collection makeCollection(Iterable iter) { + Collection list = new ArrayList(); + for (E item : iter) { + list.add(item); + } + return list; + } + + /** + * Returns the number of combinations represented by this collection + * of collection of options. 
+ * + * For example, if this is [[A, B], [C, D], [E, F, G]] returns 2 * 2 * 3 = 12 + * + * @param options + * @param + * @return + */ + @Requires("options != null") + public static int nCombinations(final Collection[] options) { + int nStates = 1; + for ( Collection states : options ) { + nStates *= states.size(); + } + return nStates; + } + + @Requires("options != null") + public static int nCombinations(final List> options) { + if ( options.isEmpty() ) + return 0; + else { + int nStates = 1; + for ( Collection states : options ) { + nStates *= states.size(); + } + return nStates; + } + } + + /** + * Convenience function that formats the novelty rate as a %.2f string + * + * @param known number of variants from all that are known + * @param all number of all variants + * @return a String novelty rate, or NA if all == 0 + */ + public static String formattedNoveltyRate(final int known, final int all) { + return formattedPercent(all - known, all); + } + + /** + * Convenience function that formats x as a percentage of total as a %.2f string + * + * @param x number of objects part of total that meet some criteria + * @param total count of all objects, including x + * @return a String percent rate, or NA if total == 0 + */ + public static String formattedPercent(final long x, final long total) { + return total == 0 ? "NA" : String.format("%.2f", (100.0*x) / total); + } + + /** + * Convenience function that formats a ratio as a %.2f string + * + * @param num number of observations in the numerator + * @param denom number of observations in the denominator + * @return a String formatted ratio, or NA if denom == 0 + */ + public static String formattedRatio(final long num, final long denom) { + return denom == 0 ? 
"NA" : String.format("%.2f", num / (1.0 * denom)); + } + + /** + * Create a constant map that maps each value in values to itself + * @param values + * @param + * @return + */ + public static Map makeIdentityFunctionMap(Collection values) { + Map map = new HashMap(values.size()); + for ( final T value : values ) + map.put(value, value); + return Collections.unmodifiableMap(map); + } + } diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java index 6279e0061..764be2ac7 100644 --- a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java +++ b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java @@ -4,6 +4,7 @@ import net.sf.picard.reference.IndexedFastaSequenceFile; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.HasGenomeLocation; +import org.broadinstitute.sting.utils.clipping.ReadClipper; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.util.ArrayList; @@ -14,7 +15,7 @@ import java.util.ArrayList; * Date: 1/4/12 */ -public class ActiveRegion implements HasGenomeLocation { +public class ActiveRegion implements HasGenomeLocation, Comparable { private final ArrayList reads = new ArrayList(); private final GenomeLoc activeRegionLoc; @@ -33,24 +34,50 @@ public class ActiveRegion implements HasGenomeLocation { fullExtentReferenceLoc = extendedLoc; } + @Override + public String toString() { + return "ActiveRegion " + activeRegionLoc.toString(); + } + // add each read to the bin and extend the reference genome activeRegionLoc if needed public void add( final GATKSAMRecord read ) { fullExtentReferenceLoc = fullExtentReferenceLoc.union( genomeLocParser.createGenomeLoc( read ) ); reads.add( read ); } + + public void hardClipToActiveRegion() { + final ArrayList clippedReads = 
ReadClipper.hardClipToRegion( reads, activeRegionLoc.getStart(), activeRegionLoc.getStop() ); + reads.clear(); + reads.addAll(clippedReads); + } public ArrayList getReads() { return reads; } - public byte[] getReference( final IndexedFastaSequenceFile referenceReader ) { - return getReference( referenceReader, 0 ); + public byte[] getActiveRegionReference( final IndexedFastaSequenceFile referenceReader ) { + return getActiveRegionReference(referenceReader, 0); } - public byte[] getReference( final IndexedFastaSequenceFile referenceReader, final int padding ) { + public byte[] getActiveRegionReference( final IndexedFastaSequenceFile referenceReader, final int padding ) { + return referenceReader.getSubsequenceAt( activeRegionLoc.getContig(), + Math.max(1, activeRegionLoc.getStart() - padding), + Math.min(referenceReader.getSequenceDictionary().getSequence(activeRegionLoc.getContig()).getSequenceLength(), activeRegionLoc.getStop() + padding) ).getBases(); + } + + public byte[] getFullReference( final IndexedFastaSequenceFile referenceReader ) { + return getFullReference(referenceReader, 0); + } + + public byte[] getFullReference( final IndexedFastaSequenceFile referenceReader, final int padding ) { return referenceReader.getSubsequenceAt( fullExtentReferenceLoc.getContig(), Math.max(1, fullExtentReferenceLoc.getStart() - padding), Math.min(referenceReader.getSequenceDictionary().getSequence(fullExtentReferenceLoc.getContig()).getSequenceLength(), fullExtentReferenceLoc.getStop() + padding) ).getBases(); } + @Override + public int compareTo( final ActiveRegion other ) { + return this.getLocation().compareTo(other.getLocation()); + } + @Override public GenomeLoc getLocation() { return activeRegionLoc; } public GenomeLoc getExtendedLoc() { return extendedLoc; } @@ -61,4 +88,13 @@ public class ActiveRegion implements HasGenomeLocation { public void clearReads() { reads.clear(); } public void remove( final GATKSAMRecord read ) { reads.remove( read ); } public void 
removeAll( final ArrayList readsToRemove ) { reads.removeAll( readsToRemove ); } + + public boolean equalExceptReads(final ActiveRegion other) { + if ( ! activeRegionLoc.equals(other.activeRegionLoc)) return false; + if ( isActive != other.isActive ) return false; + if ( genomeLocParser != other.genomeLocParser ) return false; + if ( extension != other.extension ) return false; + if ( ! extendedLoc.equals(other.extendedLoc) ) return false; + return true; + } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java new file mode 100644 index 000000000..70593bbed --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java @@ -0,0 +1,170 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.utils.activeregion; + +import org.apache.commons.lang.ArrayUtils; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +/** + * Class holding information about per-base activity scores for the + * active region traversal + * + * @author Mark DePristo + * @since Date created + */ +public class ActivityProfile { + final GenomeLocParser parser; + final boolean presetRegions; + GenomeLoc regionStartLoc = null; + final List isActiveList; + private GenomeLoc lastLoc = null; + private static final int FILTER_SIZE = 65; + private static final Double[] GaussianKernel; + + static { + GaussianKernel = new Double[2*FILTER_SIZE + 1]; + for( int iii = 0; iii < 2*FILTER_SIZE + 1; iii++ ) { + GaussianKernel[iii] = MathUtils.NormalDistribution(FILTER_SIZE, 40.0, iii); + } + } + + // todo -- add upfront the start and stop of the intervals + // todo -- check that no regions are unexpectedly missing + // todo -- add unit tests + // TODO -- own preset regions + public ActivityProfile(final GenomeLocParser parser, final boolean presetRegions) { + this(parser, presetRegions, new ArrayList(), null); + } + + protected ActivityProfile(final GenomeLocParser parser, final boolean presetRegions, final List isActiveList, final GenomeLoc regionStartLoc) { + this.parser = parser; + this.presetRegions = presetRegions; + this.isActiveList = isActiveList; + this.regionStartLoc = regionStartLoc; + } + + public void add(final GenomeLoc loc, final double score) { + if ( loc.size() != 1 ) + throw new ReviewedStingException("Bad add call to ActivityProfile: loc " + loc + " size != 1" ); + if ( lastLoc != null && loc.getStart() != lastLoc.getStop() + 1 ) + throw new 
ReviewedStingException("Bad add call to ActivityProfile: lastLoc added " + lastLoc + " and next is " + loc); + isActiveList.add(score); + if( regionStartLoc == null ) { + regionStartLoc = loc; + } + } + + public int size() { + return isActiveList.size(); + } + + /** + * Band pass this ActivityProfile, producing a new profile that's band pass filtered + * @return a new ActivityProfile that's the band-pass filtered version of this profile + */ + public ActivityProfile bandPassFilter() { + final Double[] activeProbArray = isActiveList.toArray(new Double[isActiveList.size()]); + final Double[] filteredProbArray = new Double[activeProbArray.length]; + if( !presetRegions ) { + for( int iii = 0; iii < activeProbArray.length; iii++ ) { + final Double[] kernel = (Double[]) ArrayUtils.subarray(GaussianKernel, Math.max(FILTER_SIZE-iii, 0), Math.min(GaussianKernel.length,FILTER_SIZE + activeProbArray.length - iii)); + final Double[] activeProbSubArray = (Double[]) ArrayUtils.subarray(activeProbArray, Math.max(0,iii - FILTER_SIZE), Math.min(activeProbArray.length,iii + FILTER_SIZE + 1)); + filteredProbArray[iii] = MathUtils.dotProduct(activeProbSubArray, kernel); + } + } + return new ActivityProfile(parser, presetRegions, Arrays.asList(filteredProbArray), regionStartLoc); + } + + /** + * Partition this profile into active regions + * @param activeRegionExtension + * @return + */ + public List createActiveRegions( final int activeRegionExtension, final int maxRegionSize ) { + final double ACTIVE_PROB_THRESHOLD = 0.002; // TODO: needs to be set-able by the walker author + final ArrayList returnList = new ArrayList(); + + if( isActiveList.size() == 0 ) { + // no elements in the active list, just return an empty one + return Collections.emptyList(); + } else if( isActiveList.size() == 1 ) { + // there's a single element, it's either active or inactive + boolean isActive = isActiveList.get(0) > ACTIVE_PROB_THRESHOLD; + returnList.addAll(createActiveRegion(isActive, 0, 0, 
activeRegionExtension, maxRegionSize)); + } else { + // there are 2+ elements, divide these up into regions + boolean isActive = isActiveList.get(0) > ACTIVE_PROB_THRESHOLD; + int curStart = 0; + for(int iii = 1; iii < isActiveList.size(); iii++ ) { + final boolean thisStatus = isActiveList.get(iii) > ACTIVE_PROB_THRESHOLD; + if( isActive != thisStatus ) { + returnList.addAll(createActiveRegion(isActive, curStart, iii - 1, activeRegionExtension, maxRegionSize)); + isActive = thisStatus; + curStart = iii; + } + } + returnList.addAll(createActiveRegion(isActive, curStart, isActiveList.size() - 1, activeRegionExtension, maxRegionSize)); // close out the current active region + } + return returnList; + } + + /** + * Helper routine to create an active region based on our current start and end offsets + * @param isActive should the region be active? + * @param curStart offset (0-based) from the start of this region + * @param curEnd offset (0-based) from the start of this region + * @param activeRegionExtension + * @return a fully initialized ActiveRegion with the above properties + */ + private final List createActiveRegion(final boolean isActive, final int curStart, final int curEnd, final int activeRegionExtension, final int maxRegionSize) { + return createActiveRegion(isActive, curStart, curEnd, activeRegionExtension, maxRegionSize, new ArrayList()); + } + private final List createActiveRegion(final boolean isActive, final int curStart, final int curEnd, final int activeRegionExtension, final int maxRegionSize, final List returnList) { + if( !isActive || curEnd - curStart < maxRegionSize ) { + final GenomeLoc loc = parser.createGenomeLoc(regionStartLoc.getContig(), regionStartLoc.getStart() + curStart, regionStartLoc.getStart() + curEnd); + returnList.add(new ActiveRegion(loc, isActive, parser, activeRegionExtension)); + return returnList; + } + // find the best place to break up the large active region + Double minProb = Double.MAX_VALUE; + int cutPoint = -1; + for( 
int iii = curStart + 50; iii < curEnd - 50; iii++ ) { // BUGBUG: assumes maxRegionSize >> 50 + if( isActiveList.get(iii) < minProb ) { minProb = isActiveList.get(iii); cutPoint = iii; } + } + final List leftList = createActiveRegion(isActive, curStart, cutPoint, activeRegionExtension, maxRegionSize, new ArrayList()); + final List rightList = createActiveRegion(isActive, cutPoint+1, curEnd, activeRegionExtension, maxRegionSize, new ArrayList()); + returnList.addAll( leftList ); + returnList.addAll( rightList ); + return returnList; + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java b/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java index 62a67a1f2..2e3978ddb 100644 --- a/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java +++ b/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java @@ -4,7 +4,7 @@ import com.google.java.contract.Requires; import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; -import org.broadinstitute.sting.gatk.walkers.bqsr.RecalDataManager; +import org.broadinstitute.sting.gatk.walkers.bqsr.EventType; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -320,8 +320,8 @@ public class ClippingOp { byte[] newBaseDeletionQuals = new byte[newLength]; System.arraycopy(read.getBaseInsertionQualities(), copyStart, newBaseInsertionQuals, 0, newLength); System.arraycopy(read.getBaseDeletionQualities(), copyStart, newBaseDeletionQuals, 0, newLength); - hardClippedRead.setBaseQualities(newBaseInsertionQuals, RecalDataManager.BaseRecalibrationType.BASE_INSERTION); - hardClippedRead.setBaseQualities(newBaseDeletionQuals, RecalDataManager.BaseRecalibrationType.BASE_DELETION); + hardClippedRead.setBaseQualities(newBaseInsertionQuals, EventType.BASE_INSERTION); + 
hardClippedRead.setBaseQualities(newBaseDeletionQuals, EventType.BASE_DELETION); } return hardClippedRead; diff --git a/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java b/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java index 7a664bd61..9e7ee9dac 100644 --- a/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java +++ b/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java @@ -231,15 +231,16 @@ public class ReadClipper { /** - * Hard clips any contiguous tail (left, right or both) with base quality lower than lowQual. + * Clips any contiguous tail (left, right or both) with base quality lower than lowQual using the desired algorithm. * * This function will look for low quality tails and hard clip them away. A low quality tail * ends when a base has base quality greater than lowQual. * + * @param algorithm the algorithm to use (HardClip, SoftClip, Write N's,...) * @param lowQual every base quality lower than or equal to this in the tail of the read will be hard clipped * @return a new read without low quality tails */ - private GATKSAMRecord hardClipLowQualEnds(byte lowQual) { + private GATKSAMRecord clipLowQualEnds(ClippingRepresentation algorithm, byte lowQual) { if (read.isEmpty()) return read; @@ -254,7 +255,6 @@ public class ReadClipper { // if the entire read should be clipped, then return an empty read. 
if (leftClipIndex > rightClipIndex) return GATKSAMRecord.emptyRead(read); -// return (new GATKSAMRecord(read.getHeader())); if (rightClipIndex < read.getReadLength() - 1) { this.addOp(new ClippingOp(rightClipIndex + 1, read.getReadLength() - 1)); @@ -262,11 +262,18 @@ public class ReadClipper { if (leftClipIndex > 0 ) { this.addOp(new ClippingOp(0, leftClipIndex - 1)); } - return this.clipRead(ClippingRepresentation.HARDCLIP_BASES); + return this.clipRead(algorithm); + } + + private GATKSAMRecord hardClipLowQualEnds(byte lowQual) { + return this.clipLowQualEnds(ClippingRepresentation.HARDCLIP_BASES, lowQual); } public static GATKSAMRecord hardClipLowQualEnds(GATKSAMRecord read, byte lowQual) { return (new ReadClipper(read)).hardClipLowQualEnds(lowQual); } + public static GATKSAMRecord clipLowQualEnds(GATKSAMRecord read, byte lowQual, ClippingRepresentation algorithm) { + return (new ReadClipper(read)).clipLowQualEnds(algorithm, lowQual); + } /** @@ -312,6 +319,42 @@ public class ReadClipper { } + /** + * Hard clip the read to the variable region (from refStart to refStop) + * + * @param read the read to be clipped + * @param refStart the beginning of the variant region (inclusive) + * @param refStop the end of the variant region (inclusive) + * @return the read hard clipped to the variant region + */ + public static GATKSAMRecord hardClipToRegion( final GATKSAMRecord read, final int refStart, final int refStop ) { + final int start = read.getAlignmentStart(); + final int stop = read.getAlignmentEnd(); + + // check if the read is contained in region + if (start <= refStop && stop >= refStart) { + if (start < refStart && stop > refStop) + return hardClipBothEndsByReferenceCoordinates(read, refStart - 1, refStop + 1); + else if (start < refStart) + return hardClipByReferenceCoordinatesLeftTail(read, refStart - 1); + else if (stop > refStop) + return hardClipByReferenceCoordinatesRightTail(read, refStop + 1); + return read; + } else + return 
GATKSAMRecord.emptyRead(read); + + } + public static ArrayList hardClipToRegion( final ArrayList reads, final int refStart, final int refStop ) { + final ArrayList returnList = new ArrayList( reads.size() ); + for( final GATKSAMRecord read : reads ) { + final GATKSAMRecord clippedRead = hardClipToRegion( read, refStart, refStop ); + if( !clippedRead.isEmpty() ) { + returnList.add( clippedRead ); + } + } + return returnList; + } + /** * Checks if a read contains adaptor sequences. If it does, hard clips them out. * diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqCodec.java index efcd3ecf0..cb392f29c 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqCodec.java @@ -73,6 +73,8 @@ public class RefSeqCodec implements ReferenceDependentFeatureCodec> filterHash = new HashMap>(); @@ -154,18 +155,24 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec { throw new UserException.MalformedVCFHeader("The FORMAT field was provided but there is no genotype/sample data"); } else { - if ( str.startsWith("##INFO=") ) { - VCFInfoHeaderLine info = new VCFInfoHeaderLine(str.substring(7),version); + if ( str.startsWith(VCFConstants.INFO_HEADER_START) ) { + final VCFInfoHeaderLine info = new VCFInfoHeaderLine(str.substring(7),version); metaData.add(info); - infoFields.put(info.getName(), info.getType()); - } else if ( str.startsWith("##FILTER=") ) { - VCFFilterHeaderLine filter = new VCFFilterHeaderLine(str.substring(9),version); + infoFields.put(info.getID(), info.getType()); + } else if ( str.startsWith(VCFConstants.FILTER_HEADER_START) ) { + final VCFFilterHeaderLine filter = new VCFFilterHeaderLine(str.substring(9), version); metaData.add(filter); - filterFields.add(filter.getName()); - } else if ( str.startsWith("##FORMAT=") ) { - 
VCFFormatHeaderLine format = new VCFFormatHeaderLine(str.substring(9),version); + filterFields.add(filter.getID()); + } else if ( str.startsWith(VCFConstants.FORMAT_HEADER_START) ) { + final VCFFormatHeaderLine format = new VCFFormatHeaderLine(str.substring(9), version); metaData.add(format); - formatFields.put(format.getName(), format.getType()); + formatFields.put(format.getID(), format.getType()); + } else if ( str.startsWith(VCFConstants.CONTIG_HEADER_START) ) { + final VCFSimpleHeaderLine contig = new VCFSimpleHeaderLine(str.substring(9), version, VCFConstants.CONTIG_HEADER_START.substring(2), null); + metaData.add(contig); + } else if ( str.startsWith(VCFConstants.ALT_HEADER_START) ) { + final VCFSimpleHeaderLine alt = new VCFSimpleHeaderLine(str.substring(6), version, VCFConstants.ALT_HEADER_START.substring(2), Arrays.asList("ID", "Description")); + metaData.add(alt); } else { int equals = str.indexOf("="); if ( equals != -1 ) @@ -192,8 +199,7 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec { // our header cannot be null, we need the genotype sample names and counts if (header == null) throw new ReviewedStingException("VCF Header cannot be null when decoding a record"); - final String[] locParts = new String[6]; - int nParts = ParsingUtils.split(line, locParts, VCFConstants.FIELD_SEPARATOR_CHAR, true); + final int nParts = ParsingUtils.split(line, locParts, VCFConstants.FIELD_SEPARATOR_CHAR, true); if ( nParts != 6 ) throw new UserException.MalformedVCF("there aren't enough columns for line " + line, lineNo); @@ -215,7 +221,23 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec { // ref alleles don't need to be single bases for monomorphic sites if ( alleles.size() == 1 ) { stop = start + alleles.get(0).length() - 1; - } else if ( !isSingleNucleotideEvent(alleles) ) { + } + // we need to parse the INFO field to check for an END tag if it's a symbolic allele + else if ( alleles.size() == 2 && 
alleles.get(1).isSymbolic() ) { + final String[] extraParts = new String[4]; + final int nExtraParts = ParsingUtils.split(locParts[5], extraParts, VCFConstants.FIELD_SEPARATOR_CHAR, true); + if ( nExtraParts < 3 ) + throw new UserException.MalformedVCF("there aren't enough columns for line " + line, lineNo); + + final Map attrs = parseInfo(extraParts[2]); + try { + stop = attrs.containsKey(VCFConstants.END_KEY) ? Integer.valueOf(attrs.get(VCFConstants.END_KEY).toString()) : start; + } catch (Exception e) { + throw new UserException.MalformedVCF("the END value in the INFO field is not valid for line " + line, lineNo); + } + } + // handle multi-positional events + else if ( !isSingleNucleotideEvent(alleles) ) { stop = clipAlleles(start, ref, alleles, null, lineNo); } @@ -305,22 +327,31 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec { String alts = getCachedString(parts[4].toUpperCase()); builder.log10PError(parseQual(parts[5])); builder.filters(parseFilters(getCachedString(parts[6]))); - builder.attributes(parseInfo(parts[7])); + final Map attrs = parseInfo(parts[7]); + builder.attributes(attrs); // get our alleles, filters, and setup an attribute map List alleles = parseAlleles(ref, alts, lineNo); // find out our current location, and clip the alleles down to their minimum length - int loc = pos; + int stop = pos; // ref alleles don't need to be single bases for monomorphic sites if ( alleles.size() == 1 ) { - loc = pos + alleles.get(0).length() - 1; + stop = pos + alleles.get(0).length() - 1; + } + // we need to parse the INFO field to check for an END tag if it's a symbolic allele + else if ( alleles.size() == 2 && alleles.get(1).isSymbolic() && attrs.containsKey(VCFConstants.END_KEY) ) { + try { + stop = Integer.valueOf(attrs.get(VCFConstants.END_KEY).toString()); + } catch (Exception e) { + generateException("the END value in the INFO field is not valid"); + } } else if ( !isSingleNucleotideEvent(alleles) ) { ArrayList 
newAlleles = new ArrayList(); - loc = clipAlleles(pos, ref, alleles, newAlleles, lineNo); + stop = clipAlleles(pos, ref, alleles, newAlleles, lineNo); alleles = newAlleles; } - builder.stop(loc); + builder.stop(stop); builder.alleles(alleles); // do we have genotyping data @@ -344,7 +375,6 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec { generateException(e.getMessage()); } - return vc; } @@ -443,7 +473,12 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec { protected static Allele oneAllele(String index, List alleles) { if ( index.equals(VCFConstants.EMPTY_ALLELE) ) return Allele.NO_CALL; - int i = Integer.valueOf(index); + final int i; + try { + i = Integer.valueOf(index); + } catch ( NumberFormatException e ) { + throw new TribbleException.InternalCodecException("The following invalid GT allele index was encountered in the file: " + index); + } if ( i >= alleles.size() ) throw new TribbleException.InternalCodecException("The allele with index " + index + " is not defined in the REF/ALT columns in the record"); return alleles.get(i); @@ -582,13 +617,15 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec { return true; } - public static int computeForwardClipping(List unclippedAlleles, String ref) { + public static int computeForwardClipping(final List unclippedAlleles, final byte ref0) { boolean clipping = true; - final byte ref0 = (byte)ref.charAt(0); + int symbolicAlleleCount = 0; for ( Allele a : unclippedAlleles ) { - if ( a.isSymbolic() ) + if ( a.isSymbolic() ) { + symbolicAlleleCount++; continue; + } if ( a.length() < 1 || (a.getBases()[0] != ref0) ) { clipping = false; @@ -596,29 +633,36 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec { } } - return (clipping) ? 1 : 0; + // don't clip if all alleles are symbolic + return (clipping && symbolicAlleleCount != unclippedAlleles.size()) ? 
1 : 0; } - protected static int computeReverseClipping(List unclippedAlleles, String ref, int forwardClipping, int lineNo) { + public static int computeReverseClipping(final List unclippedAlleles, final byte[] ref, final int forwardClipping, final boolean allowFullClip, final int lineNo) { int clipping = 0; boolean stillClipping = true; while ( stillClipping ) { - for ( Allele a : unclippedAlleles ) { + for ( final Allele a : unclippedAlleles ) { if ( a.isSymbolic() ) continue; // we need to ensure that we don't reverse clip out all of the bases from an allele because we then will have the wrong // position set for the VariantContext (although it's okay to forward clip it all out, because the position will be fine). if ( a.length() - clipping == 0 ) - return clipping - 1; + return clipping - (allowFullClip ? 0 : 1); - if ( a.length() - clipping <= forwardClipping || a.length() - forwardClipping == 0 ) + if ( a.length() - clipping <= forwardClipping || a.length() - forwardClipping == 0 ) { stillClipping = false; - else if ( ref.length() == clipping ) - generateException("bad alleles encountered", lineNo); - else if ( a.getBases()[a.length()-clipping-1] != ((byte)ref.charAt(ref.length()-clipping-1)) ) + } + else if ( ref.length == clipping ) { + if ( allowFullClip ) + stillClipping = false; + else + generateException("bad alleles encountered", lineNo); + } + else if ( a.getBases()[a.length()-clipping-1] != ref[ref.length-clipping-1] ) { stillClipping = false; + } } if ( stillClipping ) clipping++; @@ -639,8 +683,8 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec { */ protected static int clipAlleles(int position, String ref, List unclippedAlleles, List clippedAlleles, int lineNo) { - int forwardClipping = computeForwardClipping(unclippedAlleles, ref); - int reverseClipping = computeReverseClipping(unclippedAlleles, ref, forwardClipping, lineNo); + int forwardClipping = computeForwardClipping(unclippedAlleles, (byte)ref.charAt(0)); + 
int reverseClipping = computeReverseClipping(unclippedAlleles, ref.getBytes(), forwardClipping, false, lineNo); if ( clippedAlleles != null ) { for ( Allele a : unclippedAlleles ) { diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFAltHeaderLine.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFAltHeaderLine.java deleted file mode 100644 index a9de949d8..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFAltHeaderLine.java +++ /dev/null @@ -1,28 +0,0 @@ -package org.broadinstitute.sting.utils.codecs.vcf; - -/** - * @author ebanks - * A class representing a key=value entry for ALT fields in the VCF header - */ -public class VCFAltHeaderLine extends VCFSimpleHeaderLine { - - /** - * create a VCF filter header line - * - * @param name the name for this header line - * @param description the description for this header line - */ - public VCFAltHeaderLine(String name, String description) { - super(name, description, SupportedHeaderLineType.ALT); - } - - /** - * create a VCF info header line - * - * @param line the header line - * @param version the vcf header version - */ - protected VCFAltHeaderLine(String line, VCFHeaderVersion version) { - super(line, version, SupportedHeaderLineType.ALT); - } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCompoundHeaderLine.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCompoundHeaderLine.java index 97166833b..d2bd507b5 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCompoundHeaderLine.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCompoundHeaderLine.java @@ -34,7 +34,7 @@ import java.util.Map; /** * a base class for compound header lines, which include info lines and format lines (so far) */ -public abstract class VCFCompoundHeaderLine extends VCFHeaderLine implements VCFNamedHeaderLine { +public abstract class VCFCompoundHeaderLine 
extends VCFHeaderLine implements VCFIDHeaderLine { public enum SupportedHeaderLineType { INFO(true), FORMAT(false); @@ -52,7 +52,7 @@ public abstract class VCFCompoundHeaderLine extends VCFHeaderLine implements VCF private VCFHeaderLineType type; // access methods - public String getName() { return name; } + public String getID() { return name; } public String getDescription() { return description; } public VCFHeaderLineType getType() { return type; } public VCFHeaderLineCount getCountType() { return countType; } diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFConstants.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFConstants.java index 8e9d989cc..b23371cc9 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFConstants.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFConstants.java @@ -80,6 +80,13 @@ public final class VCFConstants { public static final String PHASED_SWITCH_PROB_v3 = "\\"; public static final String PHASING_TOKENS = "/|\\"; + // header lines + public static final String FILTER_HEADER_START = "##FILTER"; + public static final String FORMAT_HEADER_START = "##FORMAT"; + public static final String INFO_HEADER_START = "##INFO"; + public static final String ALT_HEADER_START = "##ALT"; + public static final String CONTIG_HEADER_START = "##contig"; + // old indel alleles public static final char DELETION_ALLELE_v3 = 'D'; public static final char INSERTION_ALLELE_v3 = 'I'; diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFFilterHeaderLine.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFFilterHeaderLine.java index 418b80074..dd0a333f3 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFFilterHeaderLine.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFFilterHeaderLine.java @@ -1,5 +1,7 @@ package org.broadinstitute.sting.utils.codecs.vcf; +import java.util.Arrays; + /** * @author 
ebanks * A class representing a key=value entry for FILTER fields in the VCF header @@ -13,7 +15,7 @@ public class VCFFilterHeaderLine extends VCFSimpleHeaderLine { * @param description the description for this header line */ public VCFFilterHeaderLine(String name, String description) { - super(name, description, SupportedHeaderLineType.FILTER); + super("FILTER", name, description); } /** @@ -23,6 +25,6 @@ public class VCFFilterHeaderLine extends VCFSimpleHeaderLine { * @param version the vcf header version */ protected VCFFilterHeaderLine(String line, VCFHeaderVersion version) { - super(line, version, SupportedHeaderLineType.FILTER); + super(line, version, "FILTER", Arrays.asList("ID", "Description")); } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java index 5c5df15ab..50ff3a656 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java @@ -1,8 +1,30 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + package org.broadinstitute.sting.utils.codecs.vcf; - import org.broad.tribble.util.ParsingUtils; -import org.broadinstitute.sting.utils.variantcontext.Genotype; import java.util.*; @@ -36,6 +58,11 @@ public class VCFHeader { // the header string indicator public static final String HEADER_INDICATOR = "#"; + public static final String SOURCE_KEY = "source"; + public static final String REFERENCE_KEY = "reference"; + public static final String CONTIG_KEY = "contig"; + public static final String INTERVALS_KEY = "intervals"; + // were the input samples sorted originally (or are we sorting them)? private boolean samplesWereAlreadySorted = true; @@ -43,6 +70,8 @@ public class VCFHeader { protected ArrayList sampleNamesInOrder = null; protected HashMap sampleNameToOffset = null; + private boolean writeEngineHeaders = true; + private boolean writeCommandLine = true; /** * create a VCF header, given a list of meta data and auxillary tags @@ -80,6 +109,7 @@ public class VCFHeader { * using this header (i.e., read by the VCFCodec) will have genotypes * occurring in the same order * + * @param genotypeSampleNamesInAppearenceOrder genotype sample names */ protected void buildVCFReaderMaps(List genotypeSampleNamesInAppearenceOrder) { @@ -126,11 +156,11 @@ public class VCFHeader { for ( VCFHeaderLine line : mMetaData ) { if ( line instanceof VCFInfoHeaderLine ) { VCFInfoHeaderLine infoLine = (VCFInfoHeaderLine)line; - mInfoMetaData.put(infoLine.getName(), infoLine); + mInfoMetaData.put(infoLine.getID(), infoLine); } else if ( line instanceof VCFFormatHeaderLine ) { VCFFormatHeaderLine formatLine = (VCFFormatHeaderLine)line; - mFormatMetaData.put(formatLine.getName(), formatLine); + 
mFormatMetaData.put(formatLine.getID(), formatLine); } else { mOtherMetaData.put(line.getKey(), line); @@ -145,10 +175,7 @@ public class VCFHeader { * @return a set of the header fields, in order */ public Set getHeaderFields() { - Set fields = new LinkedHashSet(); - for (HEADER_FIELDS field : HEADER_FIELDS.values()) - fields.add(field); - return fields; + return new LinkedHashSet(Arrays.asList(HEADER_FIELDS.values())); } /** @@ -218,7 +245,36 @@ public class VCFHeader { public VCFHeaderLine getOtherHeaderLine(String key) { return mOtherMetaData.get(key); } + + /** + * If true additional engine headers will be written to the VCF, otherwise only the walker headers will be output. + * @return true if additional engine headers will be written to the VCF + */ + public boolean isWriteEngineHeaders() { + return writeEngineHeaders; + } + + /** + * If true additional engine headers will be written to the VCF, otherwise only the walker headers will be output. + * @param writeEngineHeaders true if additional engine headers will be written to the VCF + */ + public void setWriteEngineHeaders(boolean writeEngineHeaders) { + this.writeEngineHeaders = writeEngineHeaders; + } + + /** + * If true, and isWriteEngineHeaders also returns true, the command line will be written to the VCF. + * @return true if the command line will be written to the VCF + */ + public boolean isWriteCommandLine() { + return writeCommandLine; + } + + /** + * If true, and isWriteEngineHeaders also returns true, the command line will be written to the VCF. 
+ * @param writeCommandLine true if the command line will be written to the VCF + */ + public void setWriteCommandLine(boolean writeCommandLine) { + this.writeCommandLine = writeCommandLine; + } } - - - diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeaderLineTranslator.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeaderLineTranslator.java index e39a09cb1..88fed75d7 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeaderLineTranslator.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeaderLineTranslator.java @@ -73,10 +73,14 @@ class VCF4Parser implements VCFLineParser { // validate the tags against the expected list index = 0; - if (ret.size() > expectedTagOrder.size()) throw new IllegalArgumentException("Unexpected tag count " + ret.size() + " in string " + expectedTagOrder.size()); - for (String str : ret.keySet()) { - if (!expectedTagOrder.get(index).equals(str)) throw new IllegalArgumentException("Unexpected tag " + str + " in string " + valueLine); - index++; + if ( expectedTagOrder != null ) { + if ( ret.size() > expectedTagOrder.size() ) + throw new IllegalArgumentException("Unexpected tag count " + ret.size() + " in string " + expectedTagOrder.size()); + for ( String str : ret.keySet() ) { + if ( !expectedTagOrder.get(index).equals(str) ) + throw new IllegalArgumentException("Unexpected tag " + str + " in string " + valueLine); + index++; + } } return ret; } diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFNamedHeaderLine.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFIDHeaderLine.java similarity index 91% rename from public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFNamedHeaderLine.java rename to public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFIDHeaderLine.java index f78e936b2..65321881a 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFNamedHeaderLine.java +++ 
b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFIDHeaderLine.java @@ -24,7 +24,7 @@ package org.broadinstitute.sting.utils.codecs.vcf; -/** an interface for named header lines **/ -public interface VCFNamedHeaderLine { - String getName(); +/** an interface for ID-based header lines **/ +public interface VCFIDHeaderLine { + String getID(); } diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFSimpleHeaderLine.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFSimpleHeaderLine.java index 152043f28..05d603073 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFSimpleHeaderLine.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFSimpleHeaderLine.java @@ -1,7 +1,7 @@ package org.broadinstitute.sting.utils.codecs.vcf; -import java.util.Arrays; import java.util.LinkedHashMap; +import java.util.List; import java.util.Map; @@ -9,34 +9,35 @@ import java.util.Map; * @author ebanks * A class representing a key=value entry for simple VCF header types */ -public abstract class VCFSimpleHeaderLine extends VCFHeaderLine implements VCFNamedHeaderLine { - - public enum SupportedHeaderLineType { - FILTER, ALT; - } +public class VCFSimpleHeaderLine extends VCFHeaderLine implements VCFIDHeaderLine { private String name; - private String description; - - // our type of line, i.e. 
filter, alt, etc - private final SupportedHeaderLineType lineType; - + private Map genericFields = new LinkedHashMap(); /** * create a VCF filter header line * - * @param name the name for this header line - * @param description the description for this header line - * @param lineType the header line type + * @param key the key for this header line + * @param name the name for this header line + * @param genericFields other fields for this header line */ - public VCFSimpleHeaderLine(String name, String description, SupportedHeaderLineType lineType) { - super(lineType.toString(), ""); - this.lineType = lineType; - this.name = name; - this.description = description; + public VCFSimpleHeaderLine(String key, String name, Map genericFields) { + super(key, ""); + initialize(name, genericFields); + } - if ( name == null || description == null ) - throw new IllegalArgumentException(String.format("Invalid VCFSimpleHeaderLine: key=%s name=%s desc=%s", super.getKey(), name, description )); + /** + * create a VCF filter header line + * + * @param key the key for this header line + * @param name the name for this header line + * @param description description for this header line + */ + public VCFSimpleHeaderLine(String key, String name, String description) { + super(key, ""); + Map map = new LinkedHashMap(1); + map.put("Description", description); + initialize(name, map); } /** @@ -44,38 +45,50 @@ public abstract class VCFSimpleHeaderLine extends VCFHeaderLine implements VCFNa * * @param line the header line * @param version the vcf header version - * @param lineType the header line type + * @param key the key for this header line + * @param expectedTagOrdering the tag ordering expected for this header line */ - protected VCFSimpleHeaderLine(String line, VCFHeaderVersion version, SupportedHeaderLineType lineType) { - super(lineType.toString(), ""); - this.lineType = lineType; - Map mapping = VCFHeaderLineTranslator.parseLine(version,line, Arrays.asList("ID","Description")); + 
protected VCFSimpleHeaderLine(String line, VCFHeaderVersion version, String key, List expectedTagOrdering) { + super(key, ""); + Map mapping = VCFHeaderLineTranslator.parseLine(version, line, expectedTagOrdering); name = mapping.get("ID"); - description = mapping.get("Description"); - if ( description == null && ALLOW_UNBOUND_DESCRIPTIONS ) // handle the case where there's no description provided - description = UNBOUND_DESCRIPTION; + initialize(name, mapping); + } + + protected void initialize(String name, Map genericFields) { + if ( name == null || genericFields == null || genericFields.isEmpty() ) + throw new IllegalArgumentException(String.format("Invalid VCFSimpleHeaderLine: key=%s name=%s", super.getKey(), name)); + + this.name = name; + this.genericFields.putAll(genericFields); } protected String toStringEncoding() { - Map map = new LinkedHashMap(); + Map map = new LinkedHashMap(); map.put("ID", name); - map.put("Description", description); - return lineType.toString() + "=" + VCFHeaderLine.toStringEncoding(map); + map.putAll(genericFields); + return getKey() + "=" + VCFHeaderLine.toStringEncoding(map); } public boolean equals(Object o) { if ( !(o instanceof VCFSimpleHeaderLine) ) return false; VCFSimpleHeaderLine other = (VCFSimpleHeaderLine)o; - return name.equals(other.name) && - description.equals(other.description); + if ( !name.equals(other.name) || genericFields.size() != other.genericFields.size() ) + return false; + for ( Map.Entry entry : genericFields.entrySet() ) { + if ( !entry.getValue().equals(other.genericFields.get(entry.getKey())) ) + return false; + } + + return true; } - public String getName() { + public String getID() { return name; } - public String getDescription() { - return description; + public Map getGenericFields() { + return genericFields; } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java 
index 5bd6a9b32..238a06243 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java @@ -155,10 +155,10 @@ public class VCFUtils { for ( VCFHeader source : headers ) { //System.out.printf("Merging in header %s%n", source); for ( VCFHeaderLine line : source.getMetaData()) { - String key = line.getKey(); - if ( line instanceof VCFNamedHeaderLine) - key = key + "" + ((VCFNamedHeaderLine) line).getName(); + String key = line.getKey(); + if ( line instanceof VCFIDHeaderLine ) + key = key + "-" + ((VCFIDHeaderLine)line).getID(); if ( map.containsKey(key) ) { VCFHeaderLine other = map.get(key); @@ -166,8 +166,8 @@ public class VCFUtils { continue; else if ( ! line.getClass().equals(other.getClass()) ) throw new IllegalStateException("Incompatible header types: " + line + " " + other ); - else if ( line instanceof VCFFilterHeaderLine) { - String lineName = ((VCFFilterHeaderLine) line).getName(); String otherName = ((VCFFilterHeaderLine) other).getName(); + else if ( line instanceof VCFFilterHeaderLine ) { + String lineName = ((VCFFilterHeaderLine) line).getID(); String otherName = ((VCFFilterHeaderLine) other).getID(); if ( ! 
lineName.equals(otherName) ) throw new IllegalStateException("Incompatible header types: " + line + " " + other ); } else if ( line instanceof VCFCompoundHeaderLine ) { diff --git a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java index d625cec20..fd0cf7869 100755 --- a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java +++ b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java @@ -107,6 +107,12 @@ public class UserException extends ReviewedStingException { } } + public static class NotEnoughMemory extends UserException { + public NotEnoughMemory() { + super(String.format("There was a failure because you did not provide enough memory to run this program. See the -Xmx JVM argument to adjust the maximum heap size provided to Java")); + } + } + public static class ErrorWritingBamFile extends UserException { public ErrorWritingBamFile(String message) { super(String.format("An error occurred when trying to write the BAM file. Usually this happens when there is not enough space in the directory to which the data is being written (generally the temp directory) or when your system's open file handle limit is too small. To tell Java to use a bigger/better file system use -Djava.io.tmpdir=X on the command line. 
The exact error was %s", message)); @@ -197,7 +203,11 @@ public class UserException extends ReviewedStingException { public static class MalformedVCF extends UserException { public MalformedVCF(String message, String line) { - super(String.format("The provided VCF file is malformed at approximately line %s: %s", line, message)); + super(String.format("The provided VCF file is malformed at line %s: %s", line, message)); + } + + public MalformedVCF(String message) { + super(String.format("The provided VCF file is malformed: %s", message)); } public MalformedVCF(String message, int lineNo) { diff --git a/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java b/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java index eea45567f..858f7a2ae 100644 --- a/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java @@ -4,7 +4,7 @@ import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.gatk.walkers.bqsr.RecalDataManager; +import org.broadinstitute.sting.gatk.walkers.bqsr.EventType; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.pileup.PileupElement; @@ -203,8 +203,8 @@ public class FragmentUtils { insertionQuals[iii] = secondReadInsertionQuals[iii-firstReadStop]; deletionQuals[iii] = secondReadDeletionQuals[iii-firstReadStop]; } - returnRead.setBaseQualities( insertionQuals, RecalDataManager.BaseRecalibrationType.BASE_INSERTION ); - returnRead.setBaseQualities( deletionQuals, RecalDataManager.BaseRecalibrationType.BASE_DELETION ); + returnRead.setBaseQualities( insertionQuals, EventType.BASE_INSERTION ); + returnRead.setBaseQualities( deletionQuals, EventType.BASE_DELETION ); } final ArrayList 
returnList = new ArrayList(); diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java b/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java index 7c2a67aba..bcd220dca 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java @@ -177,7 +177,7 @@ public abstract class AbstractReadBackedPileup pileup = new UnifiedPileupElementTracker(); for (GATKSAMRecord read : reads) { - pileup.add(createNewPileupElement(read, offset, false, false, false, false)); // only used to create fake pileups for testing so ancillary information is not important + pileup.add(createNewPileupElement(read, offset, false, false, false, false, false, false)); // only used to create fake pileups for testing so ancillary information is not important } return pileup; @@ -204,8 +204,8 @@ public abstract class AbstractReadBackedPileup createNewPileup(GenomeLoc loc, PileupElementTracker pileupElementTracker); - protected abstract PE createNewPileupElement(GATKSAMRecord read, int offset, boolean isDeletion, boolean isBeforeDeletion, boolean isBeforeInsertion, boolean isNextToSoftClip); - protected abstract PE createNewPileupElement(GATKSAMRecord read, int offset, boolean isDeletion, boolean isBeforeDeletion, boolean isBeforeInsertion, boolean isNextToSoftClip, String nextEventBases, int nextEventLength ); + protected abstract PE createNewPileupElement(final GATKSAMRecord read, final int offset, final boolean isDeletion, final boolean isBeforeDeletion, final boolean isAfterDeletion, final boolean isBeforeInsertion, final boolean isAfterInsertion, final boolean isNextToSoftClip); + protected abstract PE createNewPileupElement(final GATKSAMRecord read, final int offset, final boolean isDeletion, final boolean isBeforeDeletion, final boolean isAfterDeletion, final boolean isBeforeInsertion, final boolean 
isAfterInsertion, final boolean isNextToSoftClip, final String nextEventBases, final int nextEventLength ); // -------------------------------------------------------- // @@ -677,11 +677,11 @@ public abstract class AbstractReadBackedPileup filteredElements = tracker.getElements(sampleNames); return filteredElements != null ? (RBP) createNewPileup(loc, filteredElements) : null; } else { - HashSet hashSampleNames = new HashSet(sampleNames); // to speed up the "contains" access in the for loop + HashSet hashSampleNames = new HashSet(sampleNames); // to speed up the "contains" access in the for loop UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); for (PE p : pileupElementTracker) { GATKSAMRecord read = p.getRead(); - if (sampleNames != null) { // still checking on sampleNames because hashSampleNames will never be null. And empty means something else. + if (sampleNames != null) { // still checking on sampleNames because hashSampleNames will never be null. And empty means something else. if (read.getReadGroup() != null && hashSampleNames.contains(read.getReadGroup().getSample())) filteredTracker.add(p); } else { @@ -693,6 +693,32 @@ public abstract class AbstractReadBackedPileup getPileupsForSamples(Collection sampleNames) { + Map result = new HashMap(); + Map> trackerMap = new HashMap>(); + + for (String sample : sampleNames) { // initialize pileups for each sample + UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); + trackerMap.put(sample, filteredTracker); + } + + for (PE p : pileupElementTracker) { // go through all pileup elements only once and add them to the respective sample's pileup + GATKSAMRecord read = p.getRead(); + if (read.getReadGroup() != null) { + String sample = read.getReadGroup().getSample(); + UnifiedPileupElementTracker tracker = trackerMap.get(sample); + if (tracker != null) // we only add the pileup the requested samples. 
Completely ignore the rest + tracker.add(p); + } + } + + for (Map.Entry> entry : trackerMap.entrySet()) // create the RBP for each sample + result.put(entry.getKey(), createNewPileup(loc, entry.getValue())); + + return result; + } + @Override public RBP getPileupForSample(String sampleName) { @@ -873,6 +899,26 @@ public abstract class AbstractReadBackedPileup { public static final byte T_FOLLOWED_BY_INSERTION_BASE = (byte) 89; public static final byte G_FOLLOWED_BY_INSERTION_BASE = (byte) 90; - protected final GATKSAMRecord read; - protected final int offset; - protected final boolean isDeletion; - protected final boolean isBeforeDeletion; - protected final boolean isBeforeInsertion; - protected final boolean isNextToSoftClip; - protected final int eventLength; - protected final String eventBases; // if it is a deletion, we do not have information about the actual deleted bases - // in the read itself, so we fill the string with D's; for insertions we keep actual inserted bases - + protected final GATKSAMRecord read; // the read this base belongs to + protected final int offset; // the offset in the bases array for this base + protected final boolean isDeletion; // is this base a deletion + protected final boolean isBeforeDeletedBase; // is the base to the right of this base an deletion + protected final boolean isAfterDeletedBase; // is the base to the left of this base a deletion + protected final boolean isBeforeInsertion; // is the base to the right of this base an insertion + protected final boolean isAfterInsertion; // is the base to the left of this base an insertion + protected final boolean isNextToSoftClip; // is this base either before or after a soft clipped base + protected final int eventLength; // what is the length of the event (insertion or deletion) *after* this base + protected final String eventBases; // if it is a deletion, we do not have information about the actual deleted bases in the read itself, so we fill the string with D's; for 
insertions we keep actual inserted bases /** * Creates a new pileup element. @@ -39,7 +39,9 @@ public class PileupElement implements Comparable { * @param offset the position in the read for this base. All deletions must be left aligned! (-1 is only allowed for reads starting with insertions) * @param isDeletion whether or not this base is a deletion * @param isBeforeDeletion whether or not this base is before a deletion + * @param isAfterDeletion whether or not this base is after a deletion * @param isBeforeInsertion whether or not this base is before an insertion + * @param isAfterInsertion whether or not this base is after an insertion * @param isNextToSoftClip whether or not this base is next to a soft clipped base * @param nextEventBases bases in event in case element comes before insertion or deletion * @param nextEventLength length of next event in case it's insertion or deletion @@ -48,42 +50,59 @@ public class PileupElement implements Comparable { "read != null", "offset >= -1", "offset <= read.getReadLength()"}) - public PileupElement(final GATKSAMRecord read, final int offset, final boolean isDeletion, final boolean isBeforeDeletion, final boolean isBeforeInsertion, final boolean isNextToSoftClip, - final String nextEventBases, final int nextEventLength) { + public PileupElement(final GATKSAMRecord read, final int offset, final boolean isDeletion, final boolean isBeforeDeletion, final boolean isAfterDeletion, final boolean isBeforeInsertion, final boolean isAfterInsertion, final boolean isNextToSoftClip, final String nextEventBases, final int nextEventLength) { if (offset < 0 && isDeletion) throw new ReviewedStingException("Pileup Element cannot create a deletion with a negative offset"); this.read = read; this.offset = offset; this.isDeletion = isDeletion; - this.isBeforeDeletion = isBeforeDeletion; + this.isBeforeDeletedBase = isBeforeDeletion; + this.isAfterDeletedBase = isAfterDeletion; this.isBeforeInsertion = isBeforeInsertion; + 
this.isAfterInsertion = isAfterInsertion; this.isNextToSoftClip = isNextToSoftClip; if (isBeforeInsertion) eventBases = nextEventBases; else - eventBases = null; // ignore argument in any other case + eventBases = null; // ignore argument in any other case if (isBeforeDeletion || isBeforeInsertion) eventLength = nextEventLength; else eventLength = -1; } - public PileupElement(final GATKSAMRecord read, final int offset, final boolean isDeletion, final boolean isBeforeDeletion, final boolean isBeforeInsertion, final boolean isNextToSoftClip) { - this(read,offset, isDeletion, isBeforeDeletion, isBeforeInsertion, isNextToSoftClip, null, -1); + public PileupElement(final GATKSAMRecord read, final int offset, final boolean isDeletion, final boolean isBeforeDeletion, final boolean isAfterDeletion, final boolean isBeforeInsertion, final boolean isAfterInsertion, final boolean isNextToSoftClip) { + this(read, offset, isDeletion, isBeforeDeletion, isAfterDeletion, isBeforeInsertion, isAfterInsertion, isNextToSoftClip, null, -1); } public boolean isDeletion() { return isDeletion; } - public boolean isBeforeDeletion() { - return isBeforeDeletion; + public boolean isBeforeDeletedBase() { + return isBeforeDeletedBase; + } + + public boolean isAfterDeletedBase() { + return isAfterDeletedBase; + } + + public boolean isBeforeDeletionStart() { + return isBeforeDeletedBase && !isDeletion; + } + + public boolean isAfterDeletionEnd() { + return isAfterDeletedBase && !isDeletion; } public boolean isBeforeInsertion() { return isBeforeInsertion; } + public boolean isAfterInsertion() { + return isAfterInsertion; + } + public boolean isNextToSoftClip() { return isNextToSoftClip; } @@ -123,14 +142,14 @@ public class PileupElement implements Comparable { } /** - * Returns length of the event (number of inserted or deleted bases + * @return length of the event (number of inserted or deleted bases */ public int getEventLength() { return eventLength; } /** - * Returns actual sequence of inserted 
bases, or a null if the event is a deletion or if there is no event in the associated read. + * @return actual sequence of inserted bases, or a null if the event is a deletion or if there is no event in the associated read. */ public String getEventBases() { return eventBases; @@ -185,13 +204,9 @@ public class PileupElement implements Comparable { // // -------------------------------------------------------------------------- -// public boolean isReducedRead() { -// return read.isReducedRead(); -// } - /** * Returns the number of elements in the pileup element. - *

+ * * Unless this is a reduced read, the number of elements in a pileup element is one. In the event of * this being a reduced read and a deletion, we return the average number of elements between the left * and right elements to the deletion. We assume the deletion to be left aligned. diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedExtendedEventPileupImpl.java b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedExtendedEventPileupImpl.java index e547534dd..9d1e8ab62 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedExtendedEventPileupImpl.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedExtendedEventPileupImpl.java @@ -96,12 +96,11 @@ public class ReadBackedExtendedEventPileupImpl extends AbstractReadBackedPileup< } @Override - protected ExtendedEventPileupElement createNewPileupElement(GATKSAMRecord read, int offset, boolean isDeletion, boolean isBeforeDeletion, boolean isBeforeInsertion, boolean isNextToSoftClip) { + protected ExtendedEventPileupElement createNewPileupElement(final GATKSAMRecord read, final int offset, final boolean isDeletion, final boolean isBeforeDeletion, final boolean isAfterDeletion, final boolean isBeforeInsertion, final boolean isAfterInsertion, final boolean isNextToSoftClip) { throw new UnsupportedOperationException("Not enough information provided to create a new pileup element"); } @Override - protected ExtendedEventPileupElement createNewPileupElement(GATKSAMRecord read, int offset, boolean isDeletion, boolean isBeforeDeletion, boolean isBeforeInsertion, - boolean isNextToSoftClip,String nextEventBases, int nextEventLength) { + protected ExtendedEventPileupElement createNewPileupElement(final GATKSAMRecord read, final int offset, final boolean isDeletion, final boolean isBeforeDeletion, final boolean isAfterDeletion, final boolean isBeforeInsertion, final boolean isAfterInsertion, final boolean isNextToSoftClip, final String 
nextEventBases, final int nextEventLength ) { throw new UnsupportedOperationException("Not enough information provided to create a new pileup element"); } diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java index ccd9d509f..f15468840 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java @@ -32,6 +32,7 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.util.Collection; import java.util.HashSet; import java.util.List; +import java.util.Map; /** * A data retrieval interface for accessing parts of the pileup. @@ -159,6 +160,16 @@ public interface ReadBackedPileup extends Iterable, HasGenomeLoca */ public ReadBackedPileup getPileupForSamples(Collection sampleNames); + /** + * Gets the particular subset of this pileup for each given sample name. + * + * Same as calling getPileupForSample for all samples, but in O(n) instead of O(n^2). + * + * @param sampleNames Name of the sample to use. + * @return A subset of this pileup containing only reads with the given sample. + */ + public Map getPileupsForSamples(Collection sampleNames); + /** * Gets the particular subset of this pileup with the given sample name. 
@@ -174,6 +185,20 @@ public interface ReadBackedPileup extends Iterable, HasGenomeLoca */ public int getNumberOfDeletions(); + /** + * Simple useful routine to count the number of deletion bases in at the next position this pileup + * + * @return + */ + public int getNumberOfDeletionsAfterThisElement(); + + /** + * Simple useful routine to count the number of insertions right after this pileup + * + * @return + */ + public int getNumberOfInsertionsAfterThisElement(); + public int getNumberOfMappingQualityZeroReads(); /** diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileupImpl.java b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileupImpl.java index 759d64b2f..a11bc97c5 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileupImpl.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileupImpl.java @@ -56,6 +56,9 @@ public class ReadBackedPileupImpl extends AbstractReadBackedPileup pileup, int size, int nDeletions, int nMQ0Reads) { super(loc, pileup, size, nDeletions, nMQ0Reads); @@ -71,13 +74,14 @@ public class ReadBackedPileupImpl extends AbstractReadBackedPileup> keysAndTablesMap; // quick access reference to the read group table and its key manager + private ArrayList requestedCovariates = new ArrayList(); // list of all covariates to be used in this calculation - private RecalDataManager dataManager; // Holds the data HashMap, mostly used by TableRecalibrationWalker to create collapsed data hashmaps - private final ArrayList requestedCovariates = new ArrayList(); // List of covariates to be used in this calculation - public static final Pattern COMMENT_PATTERN = Pattern.compile("^#.*"); - public static final Pattern COVARIATE_PATTERN = Pattern.compile("^ReadGroup,QualityScore,.*"); - public static final String EOF_MARKER = "EOF"; - private static final int MAX_QUALITY_SCORE = 65; //BUGBUG: what value to use here? 
- private NestedHashMap qualityScoreByFullCovariateKey = new NestedHashMap(); // Caches the result of performSequentialQualityCalculation(...) for all sets of covariate values. - public BaseRecalibration( final File RECAL_FILE ) { - // Get a list of all available covariates - final List> classes = new PluginManager(Covariate.class).getPlugins(); - - int lineNumber = 0; - boolean foundAllCovariates = false; - - // Read in the data from the csv file and populate the data map and covariates list - boolean sawEOF = false; - try { - for ( String line : new XReadLines(RECAL_FILE) ) { - lineNumber++; - if ( EOF_MARKER.equals(line) ) { - sawEOF = true; - } else if( COMMENT_PATTERN.matcher(line).matches() ) { - ; // Skip over the comment lines, (which start with '#') - } - // Read in the covariates that were used from the input file - else if( COVARIATE_PATTERN.matcher(line).matches() ) { // The line string is either specifying a covariate or is giving csv data - if( foundAllCovariates ) { - throw new UserException.MalformedFile( RECAL_FILE, "Malformed input recalibration file. Found covariate names intermingled with data in file: " + RECAL_FILE ); - } else { // Found the covariate list in input file, loop through all of them and instantiate them - String[] vals = line.split(","); - for( int iii = 0; iii < vals.length - 4; iii++ ) { // There are n-4 covariates. The last four items are ErrorModel, nObservations, nMismatch, and Qempirical - boolean foundClass = false; - for( Class covClass : classes ) { - if( (vals[iii] + "Covariate").equalsIgnoreCase( covClass.getSimpleName() ) ) { - foundClass = true; - try { - Covariate covariate = (Covariate)covClass.newInstance(); - requestedCovariates.add( covariate ); - } catch (Exception e) { - throw new DynamicClassResolutionException(covClass, e); - } - - } - } - - if( !foundClass ) { - throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration file. 
The requested covariate type (" + (vals[iii] + "Covariate") + ") isn't a valid covariate option." ); - } - } - } - - } else { // Found a line of data - if( !foundAllCovariates ) { - foundAllCovariates = true; - - // At this point all the covariates should have been found and initialized - if( requestedCovariates.size() < 2 ) { - throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration csv file. Covariate names can't be found in file: " + RECAL_FILE ); - } - - final boolean createCollapsedTables = true; - - // Initialize any covariate member variables using the shared argument collection - RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection(); - for( Covariate cov : requestedCovariates ) { - cov.initialize( RAC ); - } - // Initialize the data hashMaps - dataManager = new RecalDataManager( createCollapsedTables, requestedCovariates.size() ); - - } - addCSVData(RECAL_FILE, line); // Parse the line and add the data to the HashMap - } - } - - } catch ( FileNotFoundException e ) { - throw new UserException.CouldNotReadInputFile(RECAL_FILE, "Can not find input file", e); - } catch ( NumberFormatException e ) { - throw new UserException.MalformedFile(RECAL_FILE, "Error parsing recalibration data at line " + lineNumber + ". Perhaps your table was generated by an older version of CovariateCounterWalker."); - } - - if ( !sawEOF ) { - final String errorMessage = "No EOF marker was present in the recal covariates table; this could mean that the file is corrupted or was generated with an old version of the CountCovariates tool."; - throw new UserException.MalformedFile(RECAL_FILE, errorMessage); - } - - if( dataManager == null ) { - throw new UserException.MalformedFile(RECAL_FILE, "Can't initialize the data manager. Perhaps the recal csv file contains no data?"); - } - - dataManager.generateEmpiricalQualities( 1, MAX_QUALITY_SCORE ); - } - /** - * For each covariate read in a value and parse it. 
Associate those values with the data itself (num observation and num mismatches) - * @param line A line of CSV data read from the recalibration table data file + * Constructor using a GATK Report file + * + * @param RECAL_FILE a GATK Report file containing the recalibration information + * @param quantizationLevels number of bins to quantize the quality scores */ - private void addCSVData(final File file, final String line) { - final String[] vals = line.split(","); + public BaseRecalibration(final File RECAL_FILE, int quantizationLevels) { + RecalibrationReport recalibrationReport = new RecalibrationReport(RECAL_FILE); - // Check if the data line is malformed, for example if the read group string contains a comma then it won't be parsed correctly - if( vals.length != requestedCovariates.size() + 4 ) { // +4 because of ErrorModel, nObservations, nMismatch, and Qempirical - throw new UserException.MalformedFile(file, "Malformed input recalibration file. Found data line with too many fields: " + line + - " --Perhaps the read group string contains a comma and isn't being parsed correctly."); - } - - final Object[] key = new Object[requestedCovariates.size()]; - Covariate cov; - int iii; - for( iii = 0; iii < requestedCovariates.size(); iii++ ) { - cov = requestedCovariates.get( iii ); - key[iii] = cov.getValue( vals[iii] ); - } - final String modelString = vals[iii++]; - final RecalDataManager.BaseRecalibrationType errorModel = CovariateKeySet.getErrorModelFromString(modelString); - - // Create a new datum using the number of observations, number of mismatches, and reported quality score - final RecalDatum datum = new RecalDatum( Long.parseLong( vals[iii] ), Long.parseLong( vals[iii + 1] ), Double.parseDouble( vals[1] ), 0.0 ); - // Add that datum to all the collapsed tables which will be used in the sequential calculation - - dataManager.addToAllTables( key, datum, QualityUtils.MIN_USABLE_Q_SCORE, errorModel ); //BUGBUG: used to be Q5 now is Q6, probably doesn't 
matter + keysAndTablesMap = recalibrationReport.getKeysAndTablesMap(); + requestedCovariates = recalibrationReport.getRequestedCovariates(); + quantizationInfo = recalibrationReport.getQuantizationInfo(); + if (quantizationLevels == 0) // quantizationLevels == 0 means no quantization, preserve the quality scores + quantizationInfo.noQuantization(); + else if (quantizationLevels > 0 && quantizationLevels != quantizationInfo.getQuantizationLevels()) // any other positive value means we want a different quantization than the one pre-calculated in the recalibration report. Negative values mean the user did not provide a quantization argument, and just wants to use what's in the report. + quantizationInfo.quantizeQualityScores(quantizationLevels); } - - public void recalibrateRead( final GATKSAMRecord read ) { - //compute all covariate values for this read - RecalDataManager.computeCovariates(read, requestedCovariates); - final CovariateKeySet covariateKeySet = RecalDataManager.getAllCovariateValuesFor( read ); + /** + * This constructor only exists for testing purposes. 
+ * + * @param quantizationInfo the quantization info object + * @param keysAndTablesMap the map of key managers and recalibration tables + * @param requestedCovariates the list of requested covariates + */ + protected BaseRecalibration(QuantizationInfo quantizationInfo, LinkedHashMap> keysAndTablesMap, ArrayList requestedCovariates) { + this.quantizationInfo = quantizationInfo; + this.keysAndTablesMap = keysAndTablesMap; + this.requestedCovariates = requestedCovariates; + } - for( final RecalDataManager.BaseRecalibrationType errorModel : RecalDataManager.BaseRecalibrationType.values() ) { - final byte[] originalQuals = read.getBaseQualities( errorModel ); + /** + * Recalibrates the base qualities of a read + * + * It updates the base qualities of the read with the new recalibrated qualities (for all event types) + * + * @param read the read to recalibrate + */ + public void recalibrateRead(final GATKSAMRecord read) { + final ReadCovariates readCovariates = RecalDataManager.computeCovariates(read, requestedCovariates); // compute all covariates for the read + for (final EventType errorModel : EventType.values()) { // recalibrate all three quality strings + final byte[] originalQuals = read.getBaseQualities(errorModel); final byte[] recalQuals = originalQuals.clone(); - // For each base in the read - for( int offset = 0; offset < read.getReadLength(); offset++ ) { - - final Object[] fullCovariateKeyWithErrorMode = covariateKeySet.getKeySet(offset, errorModel); - final Object[] fullCovariateKey = Arrays.copyOfRange(fullCovariateKeyWithErrorMode, 0, fullCovariateKeyWithErrorMode.length-1); // need to strip off the error mode which was appended to the list of covariates + for (int offset = 0; offset < read.getReadLength(); offset++) { // recalibrate all bases in the read + byte qualityScore = originalQuals[offset]; - // BUGBUG: This caching seems to put the entire key set into memory which negates the benefits of storing the delta delta tables? 
- //Byte qualityScore = (Byte) qualityScoreByFullCovariateKey.get(fullCovariateKeyWithErrorMode); - //if( qualityScore == null ) { - final byte qualityScore = performSequentialQualityCalculation( errorModel, fullCovariateKey ); - // qualityScoreByFullCovariateKey.put(qualityScore, fullCovariateKeyWithErrorMode); - //} - + if (qualityScore >= QualityUtils.MIN_USABLE_Q_SCORE) { // only recalibrate usable qualities (the original quality will come from the instrument -- reported quality) + final BitSet[] keySet = readCovariates.getKeySet(offset, errorModel); // get the keyset for this base using the error model + qualityScore = performSequentialQualityCalculation(keySet, errorModel); // recalibrate the base + } recalQuals[offset] = qualityScore; } - - preserveQScores( originalQuals, recalQuals ); // Overwrite the work done if original quality score is too low - read.setBaseQualities( recalQuals, errorModel ); + read.setBaseQualities(recalQuals, errorModel); } } + + /** * Implements a serial recalibration of the reads using the combinational table. * First, we perform a positional recalibration, and then a subsequent dinuc correction. * * Given the full recalibration table, we perform the following preprocessing steps: * - * - calculate the global quality score shift across all data [DeltaQ] - * - calculate for each of cycle and dinuc the shift of the quality scores relative to the global shift - * -- i.e., DeltaQ(dinuc) = Sum(pos) Sum(Qual) Qempirical(pos, qual, dinuc) - Qreported(pos, qual, dinuc) / Npos * Nqual - * - The final shift equation is: + * - calculate the global quality score shift across all data [DeltaQ] + * - calculate for each of cycle and dinuc the shift of the quality scores relative to the global shift + * -- i.e., DeltaQ(dinuc) = Sum(pos) Sum(Qual) Qempirical(pos, qual, dinuc) - Qreported(pos, qual, dinuc) / Npos * Nqual + * - The final shift equation is: * - * Qrecal = Qreported + DeltaQ + DeltaQ(pos) + DeltaQ(dinuc) + DeltaQ( ... 
any other covariate ... ) - * @param key The list of Comparables that were calculated from the covariates + * Qrecal = Qreported + DeltaQ + DeltaQ(pos) + DeltaQ(dinuc) + DeltaQ( ... any other covariate ... ) + * + * @param key The list of Comparables that were calculated from the covariates + * @param errorModel the event type * @return A recalibrated quality score as a byte */ - private byte performSequentialQualityCalculation( final RecalDataManager.BaseRecalibrationType errorModel, final Object... key ) { + protected byte performSequentialQualityCalculation(BitSet[] key, EventType errorModel) { + final String UNRECOGNIZED_REPORT_TABLE_EXCEPTION = "Unrecognized table. Did you add an extra required covariate? This is a hard check that needs propagate through the code"; + final String TOO_MANY_KEYS_EXCEPTION = "There should only be one key for the RG collapsed table, something went wrong here"; - final byte qualFromRead = (byte)Integer.parseInt(key[1].toString()); - final Object[] readGroupCollapsedKey = new Object[1]; - final Object[] qualityScoreCollapsedKey = new Object[2]; - final Object[] covariateCollapsedKey = new Object[3]; + final byte qualFromRead = (byte) BitSetUtils.shortFrom(key[1]); - // The global quality shift (over the read group only) - readGroupCollapsedKey[0] = key[0]; - final RecalDatum globalRecalDatum = ((RecalDatum)dataManager.getCollapsedTable(0, errorModel).get( readGroupCollapsedKey )); double globalDeltaQ = 0.0; - if( globalRecalDatum != null ) { - final double globalDeltaQEmpirical = globalRecalDatum.getEmpiricalQuality(); - final double aggregrateQReported = globalRecalDatum.getEstimatedQReported(); - globalDeltaQ = globalDeltaQEmpirical - aggregrateQReported; - } - - // The shift in quality between reported and empirical - qualityScoreCollapsedKey[0] = key[0]; - qualityScoreCollapsedKey[1] = key[1]; - final RecalDatum qReportedRecalDatum = ((RecalDatum)dataManager.getCollapsedTable(1, errorModel).get( qualityScoreCollapsedKey )); 
double deltaQReported = 0.0; - if( qReportedRecalDatum != null ) { - final double deltaQReportedEmpirical = qReportedRecalDatum.getEmpiricalQuality(); - deltaQReported = deltaQReportedEmpirical - qualFromRead - globalDeltaQ; - } - - // The shift in quality due to each covariate by itself in turn double deltaQCovariates = 0.0; - double deltaQCovariateEmpirical; - covariateCollapsedKey[0] = key[0]; - covariateCollapsedKey[1] = key[1]; - for( int iii = 2; iii < key.length; iii++ ) { - covariateCollapsedKey[2] = key[iii]; // The given covariate - final RecalDatum covariateRecalDatum = ((RecalDatum)dataManager.getCollapsedTable(iii, errorModel).get( covariateCollapsedKey )); - if( covariateRecalDatum != null ) { - deltaQCovariateEmpirical = covariateRecalDatum.getEmpiricalQuality(); - deltaQCovariates += ( deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported) ); + + for (Map.Entry> mapEntry : keysAndTablesMap.entrySet()) { + BQSRKeyManager keyManager = mapEntry.getKey(); + Map table = mapEntry.getValue(); + + switch(keyManager.getRequiredCovariates().size()) { + case 1: // this is the ReadGroup table + List bitKeys = keyManager.bitSetsFromAllKeys(key, errorModel); // calculate the shift in quality due to the read group + if (bitKeys.size() > 1) + throw new ReviewedStingException(TOO_MANY_KEYS_EXCEPTION); + + final RecalDatum empiricalQualRG = table.get(bitKeys.get(0)); + if (empiricalQualRG != null) { + final double globalDeltaQEmpirical = empiricalQualRG.getEmpiricalQuality(); + final double aggregrateQReported = empiricalQualRG.getEstimatedQReported(); + globalDeltaQ = globalDeltaQEmpirical - aggregrateQReported; + } + break; + case 2: + if (keyManager.getOptionalCovariates().isEmpty()) { // this is the QualityScore table + bitKeys = keyManager.bitSetsFromAllKeys(key, errorModel); // calculate the shift in quality due to the reported quality score + if (bitKeys.size() > 1) + throw new ReviewedStingException(TOO_MANY_KEYS_EXCEPTION); + + final 
RecalDatum empiricalQualQS = table.get(bitKeys.get(0)); + if (empiricalQualQS != null) { + final double deltaQReportedEmpirical = empiricalQualQS.getEmpiricalQuality(); + deltaQReported = deltaQReportedEmpirical - qualFromRead - globalDeltaQ; + } + } + else { // this is the table with all the covariates + bitKeys = keyManager.bitSetsFromAllKeys(key, errorModel); // calculate the shift in quality due to each covariate by itself in turn + for (BitSet k : bitKeys) { + final RecalDatum empiricalQualCO = table.get(k); + if (empiricalQualCO != null) { + double deltaQCovariateEmpirical = empiricalQualCO.getEmpiricalQuality(); + deltaQCovariates += (deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported)); + } + } + } + break; + default: + throw new ReviewedStingException(UNRECOGNIZED_REPORT_TABLE_EXCEPTION); } } - final double newQuality = qualFromRead + globalDeltaQ + deltaQReported + deltaQCovariates; - return QualityUtils.boundQual( (int)Math.round(newQuality), (byte)MAX_QUALITY_SCORE ); + double recalibratedQual = qualFromRead + globalDeltaQ + deltaQReported + deltaQCovariates; // calculate the recalibrated qual using the BQSR formula + recalibratedQual = QualityUtils.boundQual((int) Math.round(recalibratedQual), QualityUtils.MAX_RECALIBRATED_Q_SCORE); // recalibrated quality is bound between 1 and MAX_QUAL + + return quantizationInfo.getQuantizedQuals().get((int) recalibratedQual); // return the quantized version of the recalibrated quality } - /** - * Loop over the list of qualities and overwrite the newly recalibrated score to be the original score if it was less than some threshold - * @param originalQuals The list of original base quality scores - * @param recalQuals A list of the new recalibrated quality scores - */ - private void preserveQScores( final byte[] originalQuals, final byte[] recalQuals ) { - for( int iii = 0; iii < recalQuals.length; iii++ ) { - if( originalQuals[iii] < QualityUtils.MIN_USABLE_Q_SCORE ) { //BUGBUG: used to be Q5 
now is Q6, probably doesn't matter - recalQuals[iii] = originalQuals[iii]; - } - } - } } diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/QualQuantizer.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/QualQuantizer.java new file mode 100644 index 000000000..9e20e9afc --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/QualQuantizer.java @@ -0,0 +1,476 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.utils.recalibration; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Invariant; +import com.google.java.contract.Requires; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.report.GATKReport; +import org.broadinstitute.sting.gatk.report.GATKReportTable; +import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.io.PrintStream; +import java.util.*; + +/** + * A general algorithm for quantizing quality score distributions to use a specific number of levels + * + * Takes a histogram of quality scores and a desired number of levels and produces a + * map from original quality scores -> quantized quality scores. + * + * Note that this data structure is fairly heavy-weight, holding lots of debugging and + * calculation information. If you want to use it efficiently at scale with lots of + * read groups the right way to do this: + * + * Map> map + * for each read group rg: + * hist = getQualHist(rg) + * QualQuantizer qq = new QualQuantizer(hist, nLevels, minInterestingQual) + * map.set(rg, qq.getOriginalToQuantizedMap()) + * + * This map would then be used to look up the appropriate original -> quantized + * quals for each read as it comes in. + * + * @author Mark Depristo + * @since 3/2/12 + */ +public class QualQuantizer { + final private static Set MY_EMPTY_SET = Collections.emptySet(); + + private static Logger logger = Logger.getLogger(QualQuantizer.class); + + /** + * Inputs to the QualQuantizer + */ + final int nLevels, minInterestingQual; + final List nObservationsPerQual; + + /** + * Map from original qual (e.g., Q30) to new quantized qual (e.g., Q28). + * + * Has the same range as nObservationsPerQual + */ + final List originalToQuantizedMap; + + /** Sorted set of qual intervals. 
+ * + * After quantize() this data structure contains only the top-level qual intervals + */ + final TreeSet quantizedIntervals; + + /** + * Protected creator for testng use only + */ + protected QualQuantizer(final int minInterestingQual) { + this.nObservationsPerQual = Collections.emptyList(); + this.nLevels = 0; + this.minInterestingQual = minInterestingQual; + this.quantizedIntervals = null; + this.originalToQuantizedMap = null; + } + + /** + * Creates a QualQuantizer for the histogram that has nLevels + * + * Note this is the only interface to the system. After creating this object + * the map can be obtained via getOriginalToQuantizedMap() + * + * @param nObservationsPerQual A histogram of counts of bases with quality scores. Note that + * this histogram must start at 0 (i.e., get(0) => count of Q0 bases) and must include counts all the + * way up to the largest quality score possible in the reads. OK if the histogram includes many 0 + * count bins, as these are quantized for free. + * @param nLevels the desired number of distinct quality scores to represent the full original range. Must + * be at least 1. + * @param minInterestingQual All quality scores <= this value are considered uninteresting and are freely + * merged together. For example, if this value is 10, then Q0-Q10 are all considered free to merge, and + * quantized into a single value. For ILMN data with lots of Q2 bases this results in a Q2 bin containing + * all data with Q0-Q10. 
+ */ + public QualQuantizer(final List nObservationsPerQual, final int nLevels, final int minInterestingQual) { + this.nObservationsPerQual = nObservationsPerQual; + this.nLevels = nLevels; + this.minInterestingQual = minInterestingQual; + + // some sanity checking + if ( Collections.min(nObservationsPerQual) < 0 ) throw new ReviewedStingException("Quality score histogram has negative values at: " + Utils.join(", ", nObservationsPerQual)); + if ( nLevels < 0 ) throw new ReviewedStingException("nLevels must be >= 0"); + if ( minInterestingQual < 0 ) throw new ReviewedStingException("minInterestingQual must be >= 0"); + + // actually run the quantizer + this.quantizedIntervals = quantize(); + + // store the map + this.originalToQuantizedMap = intervalsToMap(quantizedIntervals); + } + + /** + * Represents an contiguous interval of quality scores. + * + * qStart and qEnd are inclusive, so qStart = qEnd = 2 is the quality score bin of 2 + */ + @Invariant({ + "qStart <= qEnd", + "qStart >= 0", + "qEnd <= 1000", + "nObservations >= 0", + "nErrors >= 0", + "nErrors <= nObservations", + "fixedQual >= -1 && fixedQual <= QualityUtils.MAX_QUAL_SCORE", + "mergeOrder >= 0"}) + protected final class QualInterval implements Comparable { + final int qStart, qEnd, fixedQual, level; + final long nObservations, nErrors; + final Set subIntervals; + + /** for debugging / visualization. When was this interval created? 
*/ + int mergeOrder; + + protected QualInterval(final int qStart, final int qEnd, final long nObservations, final long nErrors, final int level) { + this(qStart, qEnd, nObservations, nErrors, level, -1, MY_EMPTY_SET); + } + + protected QualInterval(final int qStart, final int qEnd, final long nObservations, final long nErrors, final int level, final Set subIntervals) { + this(qStart, qEnd, nObservations, nErrors, level, -1, subIntervals); + } + + protected QualInterval(final int qStart, final int qEnd, final long nObservations, final long nErrors, final int level, final int fixedQual) { + this(qStart, qEnd, nObservations, nErrors, level, fixedQual, MY_EMPTY_SET); + } + + @Requires("level >= 0") + public QualInterval(final int qStart, final int qEnd, final long nObservations, final long nErrors, final int level, final int fixedQual, final Set subIntervals) { + this.qStart = qStart; + this.qEnd = qEnd; + this.nObservations = nObservations; + this.nErrors = nErrors; + this.fixedQual = fixedQual; + this.level = level; + this.mergeOrder = 0; + this.subIntervals = Collections.unmodifiableSet(subIntervals); + } + + /** + * Human readable name of this interval: e.g., 10-12 + * @return + */ + public String getName() { + return qStart + "-" + qEnd; + } + + @Override + public String toString() { + return "QQ:" + getName(); + } + + /** + * Returns the error rate (in real space) of this interval, or 0 if there are no obserations + * @return + */ + @Ensures("result >= 0.0") + public double getErrorRate() { + if ( hasFixedQual() ) + return QualityUtils.qualToErrorProb((byte)fixedQual); + else if ( nObservations == 0 ) + return 0.0; + else + return (nErrors+1) / (1.0 * (nObservations+1)); + } + + /** + * Returns the QUAL of the error rate of this interval, or the fixed + * qual if this interval was created with a fixed qual. + * @return + */ + @Ensures("result >= 0 && result <= QualityUtils.MAX_QUAL_SCORE") + public byte getQual() { + if ( ! 
hasFixedQual() ) + return QualityUtils.probToQual(1-getErrorRate(), 0); + else + return (byte)fixedQual; + } + + /** + * @return true if this bin is using a fixed qual + */ + public boolean hasFixedQual() { + return fixedQual != -1; + } + + @Override + public int compareTo(final QualInterval qualInterval) { + return new Integer(this.qStart).compareTo(qualInterval.qStart); + } + + /** + * Create a interval representing the merge of this interval and toMerge + * + * Errors and observations are combined + * Subintervals updated in order of left to right (determined by qStart) + * Level is 1 + highest level of this and toMerge + * Order must be updated elsewhere + * + * @param toMerge + * @return newly created merged QualInterval + */ + @Requires({"toMerge != null"}) + @Ensures({ + "result != null", + "result.nObservations >= this.nObservations", + "result.nObservations >= toMerge.nObservations", + "result.nErrors >= this.nErrors", + "result.nErrors >= toMerge.nErrors", + "result.qStart == Math.min(this.qStart, toMerge.qStart)", + "result.qEnd == Math.max(this.qEnd, toMerge.qEnd)", + "result.level > Math.max(this.level, toMerge.level)", + "result.subIntervals.size() == 2" + }) + public QualInterval merge(final QualInterval toMerge) { + final QualInterval left = this.compareTo(toMerge) < 0 ? this : toMerge; + final QualInterval right = this.compareTo(toMerge) < 0 ? 
toMerge : this; + + if ( left.qEnd + 1 != right.qStart ) + throw new ReviewedStingException("Attempting to merge non-continguous intervals: left = " + left + " right = " + right); + + final long nCombinedObs = left.nObservations + right.nObservations; + final long nCombinedErr = left.nErrors + right.nErrors; + + final int level = Math.max(left.level, right.level) + 1; + final Set subIntervals = new HashSet(Arrays.asList(left, right)); + QualInterval merged = new QualInterval(left.qStart, right.qEnd, nCombinedObs, nCombinedErr, level, subIntervals); + + return merged; + } + + public double getPenalty() { + return calcPenalty(getErrorRate()); + } + + + /** + * Calculate the penalty of this interval, given the overall error rate for the interval + * + * If the globalErrorRate is e, this value is: + * + * sum_i |log10(e_i) - log10(e)| * nObservations_i + * + * each the index i applies to all leaves of the tree accessible from this interval + * (found recursively from subIntervals as necessary) + * + * @param globalErrorRate overall error rate in real space against which we calculate the penalty + * @return the cost of approximating the bins in this interval with the globalErrorRate + */ + @Requires("globalErrorRate >= 0.0") + @Ensures("result >= 0.0") + private double calcPenalty(final double globalErrorRate) { + if ( globalErrorRate == 0.0 ) // there were no observations, so there's no penalty + return 0.0; + + if ( subIntervals.isEmpty() ) { + // this is leave node + if ( this.qEnd <= minInterestingQual ) + // It's free to merge up quality scores below the smallest interesting one + return 0; + else { + return (Math.abs(Math.log10(getErrorRate()) - Math.log10(globalErrorRate))) * nObservations; + } + } else { + double sum = 0; + for ( final QualInterval interval : subIntervals ) + sum += interval.calcPenalty(globalErrorRate); + return sum; + } + } + } + + /** + * Main method for computing the quantization intervals. 
+ * + * Invoked in the constructor after all input variables are initialized. Walks + * over the inputs and builds the min. penalty forest of intervals with exactly nLevel + * root nodes. Finds this min. penalty forest via greedy search, so is not guarenteed + * to find the optimal combination. + * + * TODO: develop a smarter algorithm + * + * @return the forest of intervals with size == nLevels + */ + @Ensures({"! result.isEmpty()", "result.size() == nLevels"}) + private TreeSet quantize() { + // create intervals for each qual individually + final TreeSet intervals = new TreeSet(); + for ( int qStart = 0; qStart < getNQualsInHistogram(); qStart++ ) { + final long nObs = nObservationsPerQual.get(qStart); + final double errorRate = QualityUtils.qualToErrorProb((byte)qStart); + final double nErrors = nObs * errorRate; + final QualInterval qi = new QualInterval(qStart, qStart, nObs, (int)Math.floor(nErrors), 0, (byte)qStart); + intervals.add(qi); + } + + // greedy algorithm: + // while ( n intervals >= nLevels ): + // find intervals to merge with least penalty + // merge it + while ( intervals.size() > nLevels ) { + mergeLowestPenaltyIntervals(intervals); + } + + return intervals; + } + + /** + * Helper function that finds and mergest together the lowest penalty pair + * of intervals + * @param intervals + */ + @Requires("! 
intervals.isEmpty()") + private void mergeLowestPenaltyIntervals(final TreeSet intervals) { + // setup the iterators + final Iterator it1 = intervals.iterator(); + final Iterator it1p = intervals.iterator(); + it1p.next(); // skip one + + // walk over the pairs of left and right, keeping track of the pair with the lowest merge penalty + QualInterval minMerge = null; + if ( logger.isDebugEnabled() ) logger.debug("mergeLowestPenaltyIntervals: " + intervals.size()); + int lastMergeOrder = 0; + while ( it1p.hasNext() ) { + final QualInterval left = it1.next(); + final QualInterval right = it1p.next(); + final QualInterval merged = left.merge(right); + lastMergeOrder = Math.max(Math.max(lastMergeOrder, left.mergeOrder), right.mergeOrder); + if ( minMerge == null || (merged.getPenalty() < minMerge.getPenalty() ) ) { + if ( logger.isDebugEnabled() ) logger.debug(" Updating merge " + minMerge); + minMerge = merged; + } + } + + // now actually go ahead and merge the minMerge pair + if ( logger.isDebugEnabled() ) logger.debug(" => final min merge " + minMerge); + intervals.removeAll(minMerge.subIntervals); + intervals.add(minMerge); + minMerge.mergeOrder = lastMergeOrder + 1; + if ( logger.isDebugEnabled() ) logger.debug("updated intervals: " + intervals); + } + + /** + * Given a final forest of intervals constructs a list mapping + * list.get(i) => quantized qual to use for original quality score i + * + * This function should be called only once to initialize the corresponding + * cached value in this object, as the calculation is a bit costly. 
+ * + * @param intervals + * @return + */ + @Ensures("result.size() == getNQualsInHistogram()") + private List intervalsToMap(final TreeSet intervals) { + final List map = new ArrayList(getNQualsInHistogram()); + map.addAll(Collections.nCopies(getNQualsInHistogram(), Byte.MIN_VALUE)); + for ( final QualInterval interval : intervals ) { + for ( int q = interval.qStart; q <= interval.qEnd; q++ ) { + map.set(q, interval.getQual()); + } + } + + if ( Collections.min(map) == Byte.MIN_VALUE ) + throw new ReviewedStingException("quantized quality score map contains an un-initialized value"); + + return map; + } + + @Ensures("result > 0") + private final int getNQualsInHistogram() { + return nObservationsPerQual.size(); + } + + /** + * Write out a GATKReport to visualize the QualQuantization process of this data + * @param out + */ + public void writeReport(PrintStream out) { + final GATKReport report = new GATKReport(); + + addQualHistogramToReport(report); + addIntervalsToReport(report); + + report.print(out); + } + + private final void addQualHistogramToReport(final GATKReport report) { + report.addTable("QualHistogram", "Quality score histogram provided to report"); + GATKReportTable table = report.getTable("QualHistogram"); + + table.addPrimaryKey("qual"); + table.addColumn("count", "NA"); + + for ( int q = 0; q < nObservationsPerQual.size(); q++ ) { + table.set(q, "count", nObservationsPerQual.get(q)); + } + } + + + private final void addIntervalsToReport(final GATKReport report) { + report.addTable("QualQuantizerIntervals", "Table of QualQuantizer quantization intervals"); + GATKReportTable table = report.getTable("QualQuantizerIntervals"); + + table.addPrimaryKey("name"); + table.addColumn("qStart", "NA"); + table.addColumn("qEnd", "NA"); + table.addColumn("level", "NA"); + table.addColumn("merge.order", "NA"); + table.addColumn("nErrors", "NA"); + table.addColumn("nObservations", "NA"); + table.addColumn("qual", "NA"); + table.addColumn("penalty", "NA"); + 
table.addColumn("root.node", "NA"); + //table.addColumn("subintervals", "NA"); + + for ( QualInterval interval : quantizedIntervals) + addIntervalToReport(table, interval, true); + } + + private final void addIntervalToReport(final GATKReportTable table, QualInterval interval, final boolean atRootP) { + final String name = interval.getName(); + table.set(name, "qStart", interval.qStart); + table.set(name, "qEnd", interval.qEnd); + table.set(name, "level", interval.level); + table.set(name, "merge.order", interval.mergeOrder); + table.set(name, "nErrors", interval.nErrors); + table.set(name, "nObservations", interval.nObservations); + table.set(name, "qual", interval.getQual()); + table.set(name, "penalty", String.format("%.1f", interval.getPenalty())); + table.set(name, "root.node", atRootP); + + for ( final QualInterval sub : interval.subIntervals ) + addIntervalToReport(table, sub, false); + } + + public List getOriginalToQuantizedMap() { + return originalToQuantizedMap; + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java index 3b2736418..998045a8b 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java @@ -381,15 +381,19 @@ public class AlignmentUtils { return alignment; } - public static int calcAlignmentByteArrayOffset(final Cigar cigar, PileupElement pileup, final int alignmentStart, final int refLocus) { - int pileupOffset = pileup.getOffset(); + public static int calcAlignmentByteArrayOffset(final Cigar cigar, final PileupElement pileupElement, final int alignmentStart, final int refLocus) { + return calcAlignmentByteArrayOffset( cigar, pileupElement.getOffset(), pileupElement.isInsertionAtBeginningOfRead(), pileupElement.isDeletion(), alignmentStart, refLocus ); + } + + public static int calcAlignmentByteArrayOffset(final Cigar cigar, final 
int offset, final boolean isInsertionAtBeginningOfRead, final boolean isDeletion, final int alignmentStart, final int refLocus) { + int pileupOffset = offset; // Special case for reads starting with insertion - if (pileup.isInsertionAtBeginningOfRead()) + if (isInsertionAtBeginningOfRead) return 0; // Reassign the offset if we are in the middle of a deletion because of the modified representation of the read bases - if (pileup.isDeletion()) { + if (isDeletion) { pileupOffset = refLocus - alignmentStart; final CigarElement ce = cigar.getCigarElement(0); if (ce.getOperator() == CigarOperator.S) { @@ -414,7 +418,7 @@ public class AlignmentUtils { break; case D: case N: - if (!pileup.isDeletion()) { + if (!isDeletion) { alignmentPos += elementLength; } else { if (pos + elementLength - 1 >= pileupOffset) { @@ -521,6 +525,17 @@ public class AlignmentUtils { return alignment; } + /** + * Returns true if the read does not belong to a contig, i.e. its location is GenomeLoc.UNMAPPED. + * NOTE: A read can have a mapped GenomeLoc and still have an unmapped flag! + * + * @param r record + * @return true if read is unmapped to a genome loc + */ + public static boolean isReadGenomeLocUnmapped(final SAMRecord r) { + return SAMRecord.NO_ALIGNMENT_REFERENCE_NAME.equals(r.getReferenceName()); + } + /** * Due to (unfortunate) multiple ways to indicate that read is unmapped allowed by SAM format * specification, one may need this convenience shortcut. 
Checks both 'read unmapped' flag and diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java index b17e325fc..d0211db07 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java @@ -3,6 +3,7 @@ package org.broadinstitute.sting.utils.sam; import net.sf.samtools.*; import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; @@ -233,7 +234,17 @@ public class ArtificialSAMUtils { return ArtificialSAMUtils.createArtificialRead(header, "default_read", 0, 10000, bases, qual, cigar); } + public static GATKSAMRecord createArtificialRead(Cigar cigar) { + int length = cigar.getReadLength(); + byte [] base = {'A'}; + byte [] qual = {30}; + byte [] bases = Utils.arrayFromArrayWithLength(base, length); + byte [] quals = Utils.arrayFromArrayWithLength(qual, length); + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + return ArtificialSAMUtils.createArtificialRead(header, "default_read", 0, 10000, bases, quals, cigar.toString()); + } + public final static List createPair(SAMFileHeader header, String name, int readLen, int leftStart, int rightStart, boolean leftIsFirst, boolean leftIsNegative) { GATKSAMRecord left = ArtificialSAMUtils.createArtificialRead(header, name, 0, leftStart, readLen); GATKSAMRecord right = ArtificialSAMUtils.createArtificialRead(header, name, 0, rightStart, readLen); @@ -323,6 +334,33 @@ public class ArtificialSAMUtils { return new ArtificialSAMQueryIterator(startingChr, endingChr, readCount, unmappedReadCount, 
header); } + /** + * Create an iterator containing the specified reads + * + * @param reads the reads + * @return iterator for the reads + */ + public static StingSAMIterator createReadIterator(SAMRecord... reads) { + return createReadIterator(Arrays.asList(reads)); + } + + /** + * Create an iterator containing the specified reads + * + * @param reads the reads + * @return iterator for the reads + */ + public static StingSAMIterator createReadIterator(List reads) { + final Iterator iter = reads.iterator(); + return new StingSAMIterator() { + @Override public void close() {} + @Override public Iterator iterator() { return iter; } + @Override public boolean hasNext() { return iter.hasNext(); } + @Override public SAMRecord next() { return iter.next(); } + @Override public void remove() { iter.remove(); } + }; + } + private final static int ranIntInclusive(Random ran, int start, int stop) { final int range = stop - start; return ran.nextInt(range) + start; @@ -361,10 +399,10 @@ public class ArtificialSAMUtils { final GATKSAMRecord left = pair.get(0); final GATKSAMRecord right = pair.get(1); - pileupElements.add(new PileupElement(left, pos - leftStart, false, false, false, false)); + pileupElements.add(new PileupElement(left, pos - leftStart, false, false, false, false, false, false)); if (pos >= right.getAlignmentStart() && pos <= right.getAlignmentEnd()) { - pileupElements.add(new PileupElement(right, pos - rightStart, false, false, false, false)); + pileupElements.add(new PileupElement(right, pos - rightStart, false, false, false, false, false, false)); } } diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMReadGroupRecord.java b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMReadGroupRecord.java index ff7d12f09..df1ff2a0e 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMReadGroupRecord.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMReadGroupRecord.java @@ -13,6 +13,8 @@ import 
org.broadinstitute.sting.utils.NGSPlatform; */ public class GATKSAMReadGroupRecord extends SAMReadGroupRecord { + public static String LANE_TAG = "LN"; + // the SAMReadGroupRecord data we're caching private String mSample = null; private String mPlatform = null; @@ -79,4 +81,12 @@ public class GATKSAMReadGroupRecord extends SAMReadGroupRecord { return mNGSPlatform; } + + public String getLane() { + return this.getAttribute(LANE_TAG); + } + + public void setLane(String lane) { + this.setAttribute(LANE_TAG, lane); + } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java index 648dafb81..7d3477a7b 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java @@ -25,7 +25,7 @@ package org.broadinstitute.sting.utils.sam; import net.sf.samtools.*; -import org.broadinstitute.sting.gatk.walkers.bqsr.RecalDataManager; +import org.broadinstitute.sting.gatk.walkers.bqsr.EventType; import org.broadinstitute.sting.utils.NGSPlatform; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; @@ -52,8 +52,8 @@ public class GATKSAMRecord extends BAMRecord { public static final String REDUCED_READ_ORIGINAL_ALIGNMENT_END_SHIFT = "OE"; // reads that are clipped may use this attribute to keep track of their original alignment end // Base Quality Score Recalibrator specific attribute tags - public static final String BQSR_BASE_INSERTION_QUALITIES = "BI"; - public static final String BQSR_BASE_DELETION_QUALITIES = "BD"; + public static final String BQSR_BASE_INSERTION_QUALITIES = "BI"; // base qualities for insertions + public static final String BQSR_BASE_DELETION_QUALITIES = "BD"; // base qualities for deletions // the SAMRecord data we're caching private String mReadString = null; @@ -165,7 +165,7 @@ public class GATKSAMRecord extends BAMRecord 
{ /** * Setters and Accessors for base insertion and base deletion quality scores */ - public void setBaseQualities( final byte[] quals, final RecalDataManager.BaseRecalibrationType errorModel ) { + public void setBaseQualities( final byte[] quals, final EventType errorModel ) { switch( errorModel ) { case BASE_SUBSTITUTION: setBaseQualities(quals); @@ -181,7 +181,7 @@ public class GATKSAMRecord extends BAMRecord { } } - public byte[] getBaseQualities( final RecalDataManager.BaseRecalibrationType errorModel ) { + public byte[] getBaseQualities( final EventType errorModel ) { switch( errorModel ) { case BASE_SUBSTITUTION: return getBaseQualities(); @@ -204,7 +204,7 @@ public class GATKSAMRecord extends BAMRecord { quals = new byte[getBaseQualities().length]; Arrays.fill(quals, (byte) 45); // Some day in the future when base insertion and base deletion quals exist the samtools API will // be updated and the original quals will be pulled here, but for now we assume the original quality is a flat Q45 - setBaseQualities(quals, RecalDataManager.BaseRecalibrationType.BASE_INSERTION); + setBaseQualities(quals, EventType.BASE_INSERTION); } return quals; } @@ -213,9 +213,9 @@ public class GATKSAMRecord extends BAMRecord { byte[] quals = SAMUtils.fastqToPhred( getStringAttribute( BQSR_BASE_DELETION_QUALITIES ) ); if( quals == null ) { quals = new byte[getBaseQualities().length]; - Arrays.fill(quals, (byte) 45); // Some day in the future when base insertion and base deletion quals exist the samtools API will - // be updated and the original quals will be pulled here, but for now we assume the original quality is a flat Q45 - setBaseQualities(quals, RecalDataManager.BaseRecalibrationType.BASE_DELETION); + Arrays.fill(quals, (byte) 45); // Some day in the future when base insertion and base deletion quals exist the samtools API will + // be updated and the original quals will be pulled here, but for now we assume the original quality is a flat Q45 + setBaseQualities(quals, 
EventType.BASE_DELETION); } return quals; } @@ -335,10 +335,11 @@ public class GATKSAMRecord extends BAMRecord { /** * Clears all attributes except ReadGroup of the read. */ - public void simplify () { + public GATKSAMRecord simplify () { GATKSAMReadGroupRecord rg = getReadGroup(); this.clearAttributes(); setReadGroup(rg); + return this; } /** diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java index 91389f0bf..4e2fd1446 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java @@ -28,6 +28,7 @@ package org.broadinstitute.sting.utils.sam; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import net.sf.samtools.*; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.collections.Pair; @@ -44,10 +45,15 @@ import java.util.*; * @version 0.1 */ public class ReadUtils { + + private static final String OFFSET_OUT_OF_BOUNDS_EXCEPTION = "Offset cannot be greater than read length %d : %d"; + private static final String OFFSET_NOT_ZERO_EXCEPTION = "We ran past the end of the read and never found the offset, something went wrong!"; + private ReadUtils() { } private static int DEFAULT_ADAPTOR_SIZE = 100; + public static int CLIPPING_GOAL_NOT_REACHED = -1; /** * A marker to tell which end of the read has been clipped @@ -361,7 +367,11 @@ public class ReadUtils { @Requires({"refCoord >= read.getUnclippedStart()", "refCoord <= read.getUnclippedEnd() || (read.getUnclippedEnd() < read.getUnclippedStart())"}) @Ensures({"result >= 0", "result < read.getReadLength()"}) public static int getReadCoordinateForReferenceCoordinate(GATKSAMRecord read, int refCoord, ClippingTail tail) { - Pair result = 
getReadCoordinateForReferenceCoordinate(read, refCoord); + return getReadCoordinateForReferenceCoordinate(read.getSoftStart(), read.getCigar(), refCoord, tail, false); + } + + public static int getReadCoordinateForReferenceCoordinate(final int alignmentStart, final Cigar cigar, final int refCoord, final ClippingTail tail, final boolean allowGoalNotReached) { + Pair result = getReadCoordinateForReferenceCoordinate(alignmentStart, cigar, refCoord, allowGoalNotReached); int readCoord = result.getFirst(); // Corner case one: clipping the right tail and falls on deletion, move to the next @@ -373,9 +383,9 @@ public class ReadUtils { // clipping the left tail and first base is insertion, go to the next read coordinate // with the same reference coordinate. Advance to the next cigar element, or to the // end of the read if there is no next element. - Pair firstElementIsInsertion = readStartsWithInsertion(read); + Pair firstElementIsInsertion = readStartsWithInsertion(cigar); if (readCoord == 0 && tail == ClippingTail.LEFT_TAIL && firstElementIsInsertion.getFirst()) - readCoord = Math.min(firstElementIsInsertion.getSecond().getLength(), read.getReadLength() - 1); + readCoord = Math.min(firstElementIsInsertion.getSecond().getLength(), cigar.getReadLength() - 1); return readCoord; } @@ -399,14 +409,25 @@ public class ReadUtils { @Requires({"refCoord >= read.getSoftStart()", "refCoord <= read.getSoftEnd()"}) @Ensures({"result.getFirst() >= 0", "result.getFirst() < read.getReadLength()"}) public static Pair getReadCoordinateForReferenceCoordinate(GATKSAMRecord read, int refCoord) { + return getReadCoordinateForReferenceCoordinate(read.getSoftStart(), read.getCigar(), refCoord, false); + } + + public static Pair getReadCoordinateForReferenceCoordinate(final int alignmentStart, final Cigar cigar, final int refCoord, final boolean allowGoalNotReached) { int readBases = 0; int refBases = 0; boolean fallsInsideDeletion = false; - int goal = refCoord - read.getSoftStart(); // The 
goal is to move this many reference bases + int goal = refCoord - alignmentStart; // The goal is to move this many reference bases + if (goal < 0) { + if (allowGoalNotReached) { + return new Pair(CLIPPING_GOAL_NOT_REACHED, false); + } else { + throw new ReviewedStingException("Somehow the requested coordinate is not covered by the read. Too many deletions?"); + } + } boolean goalReached = refBases == goal; - Iterator cigarElementIterator = read.getCigar().getCigarElements().iterator(); + Iterator cigarElementIterator = cigar.getCigarElements().iterator(); while (!goalReached && cigarElementIterator.hasNext()) { CigarElement cigarElement = cigarElementIterator.next(); int shift = 0; @@ -430,8 +451,13 @@ public class ReadUtils { // If it isn't, we need to check the next one. There should *ALWAYS* be a next one // since we checked if the goal coordinate is within the read length, so this is just a sanity check. - if (!endsWithinCigar && !cigarElementIterator.hasNext()) - throw new ReviewedStingException("Reference coordinate corresponds to a non-existent base in the read. This should never happen -- call Mauricio"); + if (!endsWithinCigar && !cigarElementIterator.hasNext()) { + if (allowGoalNotReached) { + return new Pair(CLIPPING_GOAL_NOT_REACHED, false); + } else { + throw new ReviewedStingException("Reference coordinate corresponds to a non-existent base in the read. This should never happen -- call Mauricio"); + } + } CigarElement nextCigarElement; @@ -446,8 +472,13 @@ public class ReadUtils { // if it's an insertion, we need to clip the whole insertion before looking at the next element if (nextCigarElement.getOperator() == CigarOperator.INSERTION) { readBases += nextCigarElement.getLength(); - if (!cigarElementIterator.hasNext()) - throw new ReviewedStingException("Reference coordinate corresponds to a non-existent base in the read. 
This should never happen -- call Mauricio"); + if (!cigarElementIterator.hasNext()) { + if (allowGoalNotReached) { + return new Pair(CLIPPING_GOAL_NOT_REACHED, false); + } else { + throw new ReviewedStingException("Reference coordinate corresponds to a non-existent base in the read. This should never happen -- call Mauricio"); + } + } nextCigarElement = cigarElementIterator.next(); } @@ -472,8 +503,13 @@ public class ReadUtils { } } - if (!goalReached) - throw new ReviewedStingException("Somehow the requested coordinate is not covered by the read. Too many deletions?"); + if (!goalReached) { + if (allowGoalNotReached) { + return new Pair(CLIPPING_GOAL_NOT_REACHED, false); + } else { + throw new ReviewedStingException("Somehow the requested coordinate is not covered by the read. Too many deletions?"); + } + } return new Pair(readBases, fallsInsideDeletion); } @@ -495,7 +531,7 @@ public class ReadUtils { /** * Is a base inside a read? * - * @param read the read to evaluate + * @param read the read to evaluate * @param referenceCoordinate the reference coordinate of the base to test * @return true if it is inside the read, false otherwise. */ @@ -526,7 +562,11 @@ public class ReadUtils { * @return A pair with the answer (true/false) and the element or null if it doesn't exist */ public static Pair readStartsWithInsertion(GATKSAMRecord read) { - for (CigarElement cigarElement : read.getCigar().getCigarElements()) { + return readStartsWithInsertion(read.getCigar()); + } + + public static Pair readStartsWithInsertion(final Cigar cigar) { + for (CigarElement cigarElement : cigar.getCigarElements()) { if (cigarElement.getOperator() == CigarOperator.INSERTION) return new Pair(true, cigarElement); @@ -541,9 +581,9 @@ public class ReadUtils { * * See getCoverageDistributionOfRead for information on how the coverage is calculated. 
* - * @param list the list of reads covering the region + * @param list the list of reads covering the region * @param startLocation the first reference coordinate of the region (inclusive) - * @param stopLocation the last reference coordinate of the region (inclusive) + * @param stopLocation the last reference coordinate of the region (inclusive) * @return an array with the coverage of each position from startLocation to stopLocation */ public static int [] getCoverageDistributionOfReads(List list, int startLocation, int stopLocation) { @@ -563,9 +603,9 @@ public class ReadUtils { * Note: This function counts DELETIONS as coverage (since the main purpose is to downsample * reads for variant regions, and deletions count as variants) * - * @param read the read to get the coverage distribution of + * @param read the read to get the coverage distribution of * @param startLocation the first reference coordinate of the region (inclusive) - * @param stopLocation the last reference coordinate of the region (inclusive) + * @param stopLocation the last reference coordinate of the region (inclusive) * @return an array with the coverage of each position from startLocation to stopLocation */ public static int [] getCoverageDistributionOfRead(GATKSAMRecord read, int startLocation, int stopLocation) { @@ -611,9 +651,9 @@ public class ReadUtils { * Note: Locus is a boolean array, indexed from 0 (= startLocation) to N (= stopLocation), with value==true meaning it contributes to the coverage. * Example: Read => {true, true, false, ... 
false} * - * @param readList the list of reads to generate the association mappings + * @param readList the list of reads to generate the association mappings * @param startLocation the first reference coordinate of the region (inclusive) - * @param stopLocation the last reference coordinate of the region (inclusive) + * @param stopLocation the last reference coordinate of the region (inclusive) * @return the two hashmaps described above */ public static Pair> , HashMap> getBothReadToLociMappings (List readList, int startLocation, int stopLocation) { @@ -622,7 +662,6 @@ public class ReadUtils { HashMap> locusToReadMap = new HashMap>(2*(stopLocation - startLocation + 1), 0.5f); HashMap readToLocusMap = new HashMap(2*readList.size(), 0.5f); - for (int i = startLocation; i <= stopLocation; i++) locusToReadMap.put(i, new HashSet()); // Initialize the locusToRead map with empty lists @@ -631,7 +670,7 @@ public class ReadUtils { int [] readCoverage = getCoverageDistributionOfRead(read, startLocation, stopLocation); - for (int i=0; i 0) { // Update the hash for this locus @@ -649,6 +688,66 @@ public class ReadUtils { return new Pair>, HashMap>(locusToReadMap, readToLocusMap); } + /** + * Create random read qualities + * + * @param length the length of the read + * @return an array with randomized base qualities between 0 and 49 (inclusive) + */ + public static byte[] createRandomReadQuals(int length) { + Random random = GenomeAnalysisEngine.getRandomGenerator(); + byte[] quals = new byte[length]; + for (int i = 0; i < length; i++) + quals[i] = (byte) random.nextInt(50); + return quals; + } + + /** + * Create random read bases + * + * @param length the length of the read + * @param allowNs whether or not to allow N's in the read + * @return an array with randomized bases (A-N) with equal probability + */ + public static byte[] createRandomReadBases(int length, boolean allowNs) { + Random random = GenomeAnalysisEngine.getRandomGenerator(); + int numberOfBases = allowNs ? 
5 : 4; + byte[] bases = new byte[length]; + for (int i = 0; i < length; i++) { + switch (random.nextInt(numberOfBases)) { + case 0: + bases[i] = 'A'; + break; + case 1: + bases[i] = 'C'; + break; + case 2: + bases[i] = 'G'; + break; + case 3: + bases[i] = 'T'; + break; + case 4: + bases[i] = 'N'; + break; + default: + throw new ReviewedStingException("Something went wrong, this is just impossible"); + } + } + return bases; + } + + public static GATKSAMRecord createRandomRead(int length) { + return createRandomRead(length, true); + } + + public static GATKSAMRecord createRandomRead(int length, boolean allowNs) { + byte[] quals = ReadUtils.createRandomReadQuals(length); + byte[] bbases = ReadUtils.createRandomReadBases(length, allowNs); + return ArtificialSAMUtils.createArtificialRead(bbases, quals, bbases.length + "M"); + } + + public static String prettyPrintSequenceRecords ( SAMSequenceDictionary sequenceDictionary ) { String[] sequenceRecordNames = new String[sequenceDictionary.size()]; int sequenceRecordIndex = 0; @@ -656,4 +755,71 @@ public class ReadUtils { sequenceRecordNames[sequenceRecordIndex++] = sequenceRecord.getSequenceName(); return Arrays.deepToString(sequenceRecordNames); } + + /** + * Calculates the reference coordinate for a read coordinate + * + * @param read the read + * @param offset the base in the read (coordinate in the read) + * @return the reference coordinate correspondent to this base + */ + public static long getReferenceCoordinateForReadCoordinate(GATKSAMRecord read, int offset) { + if (offset > read.getReadLength()) + throw new ReviewedStingException(String.format(OFFSET_OUT_OF_BOUNDS_EXCEPTION, offset, read.getReadLength())); + + long location = read.getAlignmentStart(); + Iterator cigarElementIterator = read.getCigar().getCigarElements().iterator(); + while (offset > 0 && cigarElementIterator.hasNext()) { + CigarElement cigarElement = cigarElementIterator.next(); + long move = 0; + if 
(cigarElement.getOperator().consumesReferenceBases()) + move = (long) Math.min(cigarElement.getLength(), offset); + location += move; + offset -= move; + } + if (offset > 0 && !cigarElementIterator.hasNext()) + throw new ReviewedStingException(OFFSET_NOT_ZERO_EXCEPTION); + + return location; + } + + /** + * Creates a map with each event in the read (cigar operator) and the read coordinate where it happened. + * + * Example: + * D -> 2, 34, 75 + * I -> 55 + * S -> 0, 101 + * H -> 101 + * + * @param read the read + * @return a map with the properties described above. See example + */ + public static Map> getCigarOperatorForAllBases (GATKSAMRecord read) { + Map> events = new HashMap>(); + + int position = 0; + for (CigarElement cigarElement : read.getCigar().getCigarElements()) { + CigarOperator op = cigarElement.getOperator(); + if (op.consumesReadBases()) { + ArrayList list = events.get(op); + if (list == null) { + list = new ArrayList(); + events.put(op, list); + } + for (int i = 0; i < cigarElement.getLength(); i++) // count from 0 so every base of this element is recorded, not only the first element's + list.add(position++); + } + else { + ArrayList list = events.get(op); + if (list == null) { + list = new ArrayList(); + events.put(op, list); + } + list.add(position); + } + } + return events; + } + } diff --git a/public/java/src/org/broadinstitute/sting/utils/text/ListFileUtils.java b/public/java/src/org/broadinstitute/sting/utils/text/ListFileUtils.java index c146bf4d4..a3bc7a75f 100644 --- a/public/java/src/org/broadinstitute/sting/utils/text/ListFileUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/text/ListFileUtils.java @@ -34,9 +34,9 @@ import org.broadinstitute.sting.utils.exceptions.UserException; import java.io.File; import java.io.FileNotFoundException; -import java.util.ArrayList; -import java.util.Collection; -import java.util.List; +import java.io.IOException; +import java.util.*; +import java.util.regex.Pattern; /** * A collection of convenience methods for working with list files. 
@@ -54,6 +54,7 @@ public class ListFileUtils { * LIST_FILE_COMMENT_START are ignored. * * @param samFiles The sam files, in string format. + * @param parser Parser * @return a flattened list of the bam files provided */ public static List unpackBAMFileList(final List samFiles, final ParsingEngine parser) { @@ -63,10 +64,8 @@ public class ListFileUtils { inputFileName = expandFileName(inputFileName); if (inputFileName.toLowerCase().endsWith(".list") ) { try { - for ( String fileName : new XReadLines(new File(inputFileName), true) ) { - if ( fileName.length() > 0 && ! fileName.startsWith(LIST_FILE_COMMENT_START) ) { - unpackedReads.add(new SAMReaderID(fileName,parser.getTags(inputFileName))); - } + for ( String fileName : new XReadLines(new File(inputFileName), true, LIST_FILE_COMMENT_START) ) { + unpackedReads.add(new SAMReaderID(fileName,parser.getTags(inputFileName))); } } catch( FileNotFoundException ex ) { @@ -91,9 +90,11 @@ public class ListFileUtils { /** * Convert command-line argument representation of ROD bindings to something more easily understandable by the engine. * @param RODBindings a text equivale + * @param parser Parser * @return a list of expanded, bound RODs. */ @Deprecated + @SuppressWarnings("unused") // TODO: Who is still using this? External walkers? public static Collection unpackRODBindingsOldStyle(final Collection RODBindings, final ParsingEngine parser) { // todo -- this is a strange home for this code. 
Move into ROD system Collection rodBindings = new ArrayList(); @@ -112,7 +113,7 @@ public class ListFileUtils { String name = positionalTags.get(0); String type = positionalTags.get(1); - RMDTriplet.RMDStorageType storageType = null; + RMDTriplet.RMDStorageType storageType; if(tags.getValue("storage") != null) storageType = Enum.valueOf(RMDTriplet.RMDStorageType.class,tags.getValue("storage")); else if(fileName.toLowerCase().endsWith("stdin")) @@ -129,9 +130,11 @@ public class ListFileUtils { /** * Convert command-line argument representation of ROD bindings to something more easily understandable by the engine. * @param RODBindings a text equivale + * @param parser Parser * @return a list of expanded, bound RODs. */ - public static Collection unpackRODBindings(final Collection RODBindings, final ParsingEngine parser) { + @SuppressWarnings("unchecked") + public static Collection unpackRODBindings(final Collection RODBindings, @SuppressWarnings("unused") final ParsingEngine parser) { // todo -- this is a strange home for this code. Move into ROD system Collection rodBindings = new ArrayList(); FeatureManager builderForValidation = new FeatureManager(); @@ -142,7 +145,7 @@ public class ListFileUtils { String name = rodBinding.getName(); String type = rodBinding.getTribbleType(); - RMDTriplet.RMDStorageType storageType = null; + RMDTriplet.RMDStorageType storageType; if(rodBinding.getTags().getValue("storage") != null) storageType = Enum.valueOf(RMDTriplet.RMDStorageType.class,rodBinding.getTags().getValue("storage")); else if(fileName.toLowerCase().endsWith("stdin")) @@ -184,4 +187,157 @@ public class ListFileUtils { return "/dev/stdin"; return argument; } + + /** + * Returns a new set of values, containing a final set of values expanded from values + *

+ * Each element E of values can either be a literal string or a file ending in .list. + * For each E ending in .list we try to read a file named E from disk, and if possible + * all lines from that file are expanded into unique values. + * + * @param values Original values + * @return entries from values or the files listed in values + */ + public static Set unpackSet(Collection values) { + if (values == null) + throw new NullPointerException("values cannot be null"); + Set unpackedValues = new LinkedHashSet(); + // Let's first go through the list and see if we were given any files. + // We'll add every entry in the file to our set, and treat the entries as + // if they had been specified on the command line. + for (String value : values) { + File file = new File(value); + if (value.toLowerCase().endsWith(".list") && file.exists()) { + try { + unpackedValues.addAll(new XReadLines(file, true, LIST_FILE_COMMENT_START).readLines()); + } catch (IOException e) { + throw new UserException.CouldNotReadInputFile(file, e); + } + } else { + unpackedValues.add(value); + } + } + return unpackedValues; + } + + /** + * Returns a new set of values including only values listed by filters + *

+ * Each element E of values can either be a literal string or a file. For each E, + * we try to read a file named E from disk, and if possible all lines from that file are expanded + * into unique names. + *

+ * Filters may also be a file of filters. + * + * @param values Values or files with values + * @param filters Filters or files with filters + * @param exactMatch If true match filters exactly, otherwise use as both exact and regular expressions + * @return entries from values or the files listed in values, filtered by filters + */ + public static Set includeMatching(Collection values, Collection filters, boolean exactMatch) { + return includeMatching(values, IDENTITY_STRING_CONVERTER, filters, exactMatch); + } + + /** + * Converts a type T to a String representation. + * + * @param Type to convert to a String. + */ + public static interface StringConverter { + String convert(T value); + } + + /** + * Returns a new set of values including only values matching filters + *

+ * Filters may also be a file of filters. + *

+ * The converter should convert T to a unique String for each value in the set. + * + * @param values Values or files with values + * @param converter Converts values to strings + * @param filters Filters or files with filters + * @param exactMatch If true match filters exactly, otherwise use as both exact and regular expressions + * @return entries from values including only values matching filters + */ + public static Set includeMatching(Collection values, StringConverter converter, Collection filters, boolean exactMatch) { + if (values == null) + throw new NullPointerException("values cannot be null"); + if (converter == null) + throw new NullPointerException("converter cannot be null"); + if (filters == null) + throw new NullPointerException("filters cannot be null"); + + Set unpackedFilters = unpackSet(filters); + Set filteredValues = new LinkedHashSet(); + Collection patterns = null; + if (!exactMatch) + patterns = compilePatterns(unpackedFilters); + for (T value : values) { + String converted = converter.convert(value); + if (unpackedFilters.contains(converted)) { + filteredValues.add(value); + } else if (!exactMatch) { + for (Pattern pattern : patterns) + if (pattern.matcher(converted).find()) + filteredValues.add(value); + } + } + return filteredValues; + } + + /** + * Returns a new set of values excluding any values matching filters. + *

+ * Filters may also be a file of filters. + *

+ * The converter should convert T to a unique String for each value in the set. + * + * @param values Values or files with values + * @param converter Converts values to strings + * @param filters Filters or files with filters + * @param exactMatch If true match filters exactly, otherwise use as both exact and regular expressions + * @return entries from values excluding any values matching filters + */ + public static Set excludeMatching(Collection values, StringConverter converter, Collection filters, boolean exactMatch) { + if (values == null) + throw new NullPointerException("values cannot be null"); + if (converter == null) + throw new NullPointerException("converter cannot be null"); + if (filters == null) + throw new NullPointerException("filters cannot be null"); + + Set unpackedFilters = unpackSet(filters); + Set filteredValues = new LinkedHashSet(); + filteredValues.addAll(values); + Collection patterns = null; + if (!exactMatch) + patterns = compilePatterns(unpackedFilters); + for (T value : values) { + String converted = converter.convert(value); + if (unpackedFilters.contains(converted)) { + filteredValues.remove(value); + } else if (!exactMatch) { + for (Pattern pattern : patterns) + if (pattern.matcher(converted).find()) + filteredValues.remove(value); + } + } + return filteredValues; + } + + private static Collection compilePatterns(Collection filters) { + Collection patterns = new ArrayList(); + for (String filter: filters) { + patterns.add(Pattern.compile(filter)); + } + return patterns; + } + + protected static final StringConverter IDENTITY_STRING_CONVERTER = new StringConverter() { + @Override + public String convert(String value) { + return value; + } + }; } diff --git a/public/java/src/org/broadinstitute/sting/utils/text/XReadLines.java b/public/java/src/org/broadinstitute/sting/utils/text/XReadLines.java index 49e9ddf52..b7fc1bdab 100644 --- a/public/java/src/org/broadinstitute/sting/utils/text/XReadLines.java +++ 
b/public/java/src/org/broadinstitute/sting/utils/text/XReadLines.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2010 The Broad Institute + * Copyright (c) 2012, The Broad Institute * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation @@ -12,15 +12,14 @@ * * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. - * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. */ package org.broadinstitute.sting.utils.text; @@ -48,75 +47,92 @@ import java.util.List; * For the love of god, please use this system for reading lines in a file. 
*/ public class XReadLines implements Iterator, Iterable { - private BufferedReader in; // The stream we're reading from - private String nextline = null; // Return value of next call to next() - private boolean trimWhitespace = true; + private final BufferedReader in; // The stream we're reading from + private String nextLine = null; // Return value of next call to next() + private final boolean trimWhitespace; + private final String commentPrefix; + + public XReadLines(final File filename) throws FileNotFoundException { + this(new FileReader(filename), true, null); + } + + public XReadLines(final File filename, final boolean trimWhitespace) throws FileNotFoundException { + this(new FileReader(filename), trimWhitespace, null); + } /** * Creates a new xReadLines object to read lines from filename * - * @param filename - * @throws FileNotFoundException + * @param filename file name + * @param trimWhitespace trim whitespace + * @param commentPrefix prefix for comments or null if no prefix is set + * @throws FileNotFoundException when the file is not found */ - public XReadLines(final File filename, final boolean trimWhitespace) throws FileNotFoundException { - this(new FileReader(filename), trimWhitespace); + public XReadLines(final File filename, final boolean trimWhitespace, final String commentPrefix) throws FileNotFoundException { + this(new FileReader(filename), trimWhitespace, commentPrefix); } - public XReadLines(final File filename) throws FileNotFoundException { - this(filename, true); + public XReadLines(final InputStream inputStream) throws FileNotFoundException { + this(new InputStreamReader(inputStream), true, null); } - /** - * Creates a new xReadLines object to read lines from fileReader - * - * @param fileReader - * @throws FileNotFoundException - */ - public XReadLines(final FileReader fileReader, final boolean trimWhitespace) throws FileNotFoundException { - this(new BufferedReader(fileReader), trimWhitespace); - } - - public XReadLines(final 
FileReader fileReader) throws FileNotFoundException { - this(fileReader, true); + public XReadLines(final InputStream inputStream, final boolean trimWhitespace) { + this(new InputStreamReader(inputStream), trimWhitespace, null); } /** * Creates a new xReadLines object to read lines from an input stream * - * @param inputStream + * @param inputStream input stream + * @param trimWhitespace trim whitespace + * @param commentPrefix prefix for comments or null if no prefix is set */ - public XReadLines(final InputStream inputStream, final boolean trimWhitespace) { - this(new BufferedReader(new InputStreamReader(inputStream)), trimWhitespace); - } - - public XReadLines(final InputStream inputStream) throws FileNotFoundException { - this(inputStream, true); + public XReadLines(final InputStream inputStream, final boolean trimWhitespace, final String commentPrefix) { + this(new InputStreamReader(inputStream), trimWhitespace, commentPrefix); } /** - * Creates a new xReadLines object to read lines from an bufferedReader + * Creates a new xReadLines object to read lines from a reader * - * @param reader + * @param reader reader + */ + public XReadLines(final Reader reader) { + this(reader, true, null); + } + + /** + * Creates a new xReadLines object to read lines from a reader + * + * @param reader reader + * @param trimWhitespace trim whitespace */ public XReadLines(final Reader reader, final boolean trimWhitespace) { + this(reader, trimWhitespace, null); + } + + /** + * Creates a new xReadLines object to read lines from a reader + * + * @param reader reader to read lines from + * @param trimWhitespace trim whitespace + * @param commentPrefix prefix for comments or null if no prefix is set + */ + public XReadLines(final Reader reader, final boolean trimWhitespace, final String commentPrefix) { + this.in = (reader instanceof BufferedReader) ?
(BufferedReader)reader : new BufferedReader(reader); + this.trimWhitespace = trimWhitespace; + this.commentPrefix = commentPrefix; try { - this.in = new BufferedReader(reader); - nextline = readNextLine(); - this.trimWhitespace = trimWhitespace; + this.nextLine = readNextLine(); } catch(IOException e) { throw new IllegalArgumentException(e); } } - public XReadLines(final Reader reader) { - this(reader, true); - } - /** * Reads all of the lines in the file, and returns them as a list of strings * - * @return + * @return all of the lines in the file. */ public List readLines() { List lines = new LinkedList(); @@ -128,38 +144,48 @@ public class XReadLines implements Iterator, Iterable { /** * I'm an iterator too... - * @return + * @return an iterator */ public Iterator iterator() { return this; } public boolean hasNext() { - return nextline != null; + return this.nextLine != null; } /** - * Actually reads the next line from the stream, not accessible publically - * @return + * Actually reads the next line from the stream, not accessible publicly + * @return the next line or null + * @throws IOException if an error occurs */ private String readNextLine() throws IOException { - String nextline = in.readLine(); // Read another line - if (nextline != null && trimWhitespace ) - nextline = nextline.trim(); - return nextline; + String nextLine; + while ((nextLine = this.in.readLine()) != null) { + if (this.trimWhitespace) { + nextLine = nextLine.trim(); + if (nextLine.length() == 0) + continue; + } + if (this.commentPrefix != null) + if (nextLine.startsWith(this.commentPrefix)) + continue; + break; + } + return nextLine; } /** - * Returns the next line (minus whitespace) - * @return + * Returns the next line (optionally minus whitespace) + * @return the next line */ public String next() { try { - String result = nextline; - nextline = readNextLine(); + String result = this.nextLine; + this.nextLine = readNextLine(); // If we haven't reached EOF yet - if (nextline == null) { 
+ if (this.nextLine == null) { in.close(); // And close on EOF } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java index cc14585ac..3bebac4fa 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java @@ -28,6 +28,7 @@ import org.broad.tribble.TribbleException; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; import java.util.EnumMap; @@ -149,8 +150,12 @@ public class GenotypeLikelihoods { if ( !likelihoodsAsString_PLs.equals(VCFConstants.MISSING_VALUE_v4) ) { String[] strings = likelihoodsAsString_PLs.split(","); double[] likelihoodsAsVector = new double[strings.length]; - for ( int i = 0; i < strings.length; i++ ) { - likelihoodsAsVector[i] = Integer.parseInt(strings[i]) / -10.0; + try { + for ( int i = 0; i < strings.length; i++ ) { + likelihoodsAsVector[i] = Integer.parseInt(strings[i]) / -10.0; + } + } catch (NumberFormatException e) { + throw new UserException.MalformedVCF("The GL/PL tag contains non-integer values: " + likelihoodsAsString_PLs); } return likelihoodsAsVector; } else @@ -223,15 +228,15 @@ public class GenotypeLikelihoods { /** * The maximum number of alleles that we can represent as genotype likelihoods */ - public final static int MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED = 500; + public final static int MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED = 50; /* * a cache of the PL index to the 2 alleles it represents over all possible numbers of alternate alleles */ - private final static GenotypeLikelihoodsAllelePair[] PLIndexToAlleleIndex = 
calculatePLcache(MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED); // start with data for 10 alternate alleles + private final static GenotypeLikelihoodsAllelePair[] PLIndexToAlleleIndex = calculatePLcache(MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED); private static GenotypeLikelihoodsAllelePair[] calculatePLcache(final int altAlleles) { - final int numLikelihoods = calculateNumLikelihoods(altAlleles); + final int numLikelihoods = calculateNumLikelihoods(1+altAlleles, 2); final GenotypeLikelihoodsAllelePair[] cache = new GenotypeLikelihoodsAllelePair[numLikelihoods]; // for all possible combinations of 2 alleles @@ -249,13 +254,50 @@ public class GenotypeLikelihoods { return cache; } - - // how many likelihoods are associated with the given number of alternate alleles? - public static int calculateNumLikelihoods(final int numAltAlleles) { - int numLikelihoods = 1; - for ( int i = 1; i <= numAltAlleles; i++ ) - numLikelihoods += i + 1; - return numLikelihoods; + + /** + * Compute how many likelihood elements are associated with the given number of alleles and ploidy. + * Equivalent to asking in how many ways N non-negative integers can add up to P; call this number S(N,P), + * where P = ploidy (number of chromosomes) and N = total # of alleles. + * Each chromosome can be in one single state (0,...,N-1) and there are P of them. + * Naive solution would be to store N^P likelihoods, but this is not necessary because we can't distinguish chromosome states, but rather + * only the total count of each allele across all chromosomes. + * + * For example, S(3,2) = 6: For alleles A,B,C, on a diploid organism we have six possible genotypes: + * AA,AB,BB,AC,BC,CC.
+ * Another way of expressing this is with the vector (# of A alleles, # of B alleles, # of C alleles) + * which is then, for ordering above, (2,0,0), (1,1,0), (0,2,0), (1,0,1), (0,1,1), (0,0,2) + * In general, for P=2 (regular diploid), then S(N,2) = N*(N+1)/2 + * + * Recursive implementation: + * S(N,P) = sum_{k=0}^P S(N-1,P-k) + * because if we have N integers, we can condition 1 integer to be = k, and then N-1 integers have to sum to P-k + * With initial conditions + * S(N,1) = N (only way to have N integers add up to 1 is all-zeros except one element with a one. There are N of these vectors) + * S(1,P) = 1 (only way to have 1 integer add to P is with that integer P itself). + * + * @param numAlleles Number of alleles (including ref) + * @param ploidy Ploidy, or number of chromosomes in set + * @return Number of likelihood elements we need to hold. + */ + + public static int calculateNumLikelihoods(final int numAlleles, final int ploidy) { + + // fast, closed form solution for diploid samples (most common use case) + if (ploidy==2) + return numAlleles*(numAlleles+1)/2; + + if (numAlleles == 1) + return 1; + else if (ploidy == 1) + return numAlleles; + + int acc =0; + for (int k=0; k <= ploidy; k++ ) + acc += calculateNumLikelihoods(numAlleles-1, ploidy-k); + + return acc; + } // As per the VCF spec: "the ordering of genotypes for the likelihoods is given by: F(j/k) = (k*(k+1)/2)+j. @@ -289,11 +331,11 @@ public class GenotypeLikelihoods { * ordering and I know with certainty that external users have built code on top of it; changing it now would * cause a whole lot of heartache for our collaborators, so for now at least there's a standard conversion method. * This method assumes at most 3 alternate alleles. - * TODO -- address this issue at the source by updating DiploidSNPGenotypeLikelihoods.
* * @param PLindex the PL index * @return the allele index pair */ + @Deprecated public static GenotypeLikelihoodsAllelePair getAllelePairUsingDeprecatedOrdering(final int PLindex) { return getAllelePair(PLindexConversion[PLindex]); } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java index e2e46cbe9..3faad46e2 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java @@ -160,7 +160,7 @@ import java.util.*; * * @author depristo */ -public class VariantContext implements Feature { // to enable tribble intergration +public class VariantContext implements Feature { // to enable tribble integration protected CommonInfo commonInfo = null; public final static double NO_LOG10_PERROR = CommonInfo.NO_LOG10_PERROR; @@ -377,7 +377,7 @@ public class VariantContext implements Feature { // to enable tribble intergrati * * Not currently supported: * - * Heterozygous sequencea + * Heterozygous sequence * The term heterozygous is used to specify a region detected by certain methods that do not * resolve the polymorphism into a specific sequence motif. In these cases, a unique flanking * sequence must be provided to define a sequence context for the variation. 
@@ -592,15 +592,28 @@ public class VariantContext implements Feature { // to enable tribble intergrati /** * @return True if this context contains Allele allele, or false otherwise */ - public boolean hasAllele(Allele allele) { - return hasAllele(allele, false); + public boolean hasAllele(final Allele allele) { + return hasAllele(allele, false, true); } - public boolean hasAllele(Allele allele, boolean ignoreRefState) { - if ( allele == REF || allele == ALT ) // optimization for cached cases + public boolean hasAllele(final Allele allele, final boolean ignoreRefState) { + return hasAllele(allele, ignoreRefState, true); + } + + public boolean hasAlternateAllele(final Allele allele) { + return hasAllele(allele, false, false); + } + + public boolean hasAlternateAllele(final Allele allele, final boolean ignoreRefState) { + return hasAllele(allele, ignoreRefState, false); + } + + private boolean hasAllele(final Allele allele, final boolean ignoreRefState, final boolean considerRefAllele) { + if ( (considerRefAllele && allele == REF) || allele == ALT ) // optimization for cached cases return true; - for ( Allele a : getAlleles() ) { + final List allelesToConsider = considerRefAllele ? getAlleles() : getAlternateAlleles(); + for ( Allele a : allelesToConsider ) { if ( a.equals(allele, ignoreRefState) ) return true; } @@ -656,12 +669,21 @@ public class VariantContext implements Feature { // to enable tribble intergrati return alleles.get(i+1); } + /** + * @param other VariantContext whose alleles to compare against + * @return true if this VariantContext has the same alleles (both ref and alts) as other, + * regardless of ordering. Otherwise returns false. 
+ */ + public boolean hasSameAllelesAs ( final VariantContext other ) { + return hasSameAlternateAllelesAs(other) && other.getReference().equals(getReference(), false); + } + /** * @param other VariantContext whose alternate alleles to compare against * @return true if this VariantContext has the same alternate alleles as other, * regardless of ordering. Otherwise returns false. */ - public boolean hasSameAlternateAllelesAs ( VariantContext other ) { + public boolean hasSameAlternateAllelesAs ( final VariantContext other ) { List thisAlternateAlleles = getAlternateAlleles(); List otherAlternateAlleles = other.getAlternateAlleles(); @@ -783,11 +805,22 @@ public class VariantContext implements Feature { // to enable tribble intergrati * @return chromosome count */ public int getCalledChrCount() { - int n = 0; + return getCalledChrCount(new HashSet(0)); + } - for ( final Genotype g : getGenotypes() ) { - for ( final Allele a : g.getAlleles() ) - n += a.isNoCall() ? 0 : 1; + /** + * Returns the number of chromosomes carrying any allele in the genotypes (i.e., excluding NO_CALLS) + * + * @param sampleIds IDs of samples to take into account. If empty then all samples are included. + * @return chromosome count + */ + public int getCalledChrCount(Set sampleIds) { + int n = 0; + GenotypesContext genotypes = sampleIds.isEmpty() ? getGenotypes() : getGenotypes(sampleIds); + + for ( final Genotype g : genotypes) { + for ( final Allele a : g.getAlleles() ) + n += a.isNoCall() ? 0 : 1; } return n; @@ -800,10 +833,22 @@ public class VariantContext implements Feature { // to enable tribble intergrati * @return chromosome count */ public int getCalledChrCount(Allele a) { - int n = 0; + return getCalledChrCount(a,new HashSet(0)); + } - for ( final Genotype g : getGenotypes() ) { - n += g.getAlleles(a).size(); + /** + * Returns the number of chromosomes carrying allele A in the genotypes + * + * @param a allele + * @param sampleIds - IDs of samples to take into account. 
If empty then all samples are included. + * @return chromosome count + */ + public int getCalledChrCount(Allele a, Set sampleIds) { + int n = 0; + GenotypesContext genotypes = sampleIds.isEmpty() ? getGenotypes() : getGenotypes(sampleIds); + + for ( final Genotype g : genotypes ) { + n += g.getAlleles(a).size(); } return n; @@ -877,6 +922,7 @@ public class VariantContext implements Feature { // to enable tribble intergrati * @return number of hom var calls */ public int getHomVarCount() { + calculateGenotypeCounts(); return genotypeCounts[Genotype.Type.HOM_VAR.ordinal()]; } @@ -886,6 +932,7 @@ public class VariantContext implements Feature { // to enable tribble intergrati * @return number of mixed calls */ public int getMixedCount() { + calculateGenotypeCounts(); return genotypeCounts[Genotype.Type.MIXED.ordinal()]; } @@ -1031,7 +1078,7 @@ public class VariantContext implements Feature { // to enable tribble intergrati } private void validateReferencePadding() { - if (hasSymbolicAlleles()) // symbolic alleles don't need padding... + if ( hasSymbolicAlleles() ) // symbolic alleles don't need padding... 
return; boolean needsPadding = (getReference().length() == getEnd() - getStart()); // off by one because padded base was removed @@ -1069,7 +1116,7 @@ public class VariantContext implements Feature { // to enable tribble intergrati // if ( getReference().length() != (getLocation().size()-1) ) { long length = (stop - start) + 1; if ( (getReference().isNull() && length != 1 ) || - (getReference().isNonNull() && (length - getReference().length() > 1))) { + (!isSymbolic() && getReference().isNonNull() && (length - getReference().length() > 1))) { throw new IllegalStateException("BUG: GenomeLoc " + contig + ":" + start + "-" + stop + " has a size == " + length + " but the variation reference allele has length " + getReference().length() + " this = " + this); } } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextBuilder.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextBuilder.java index 4e16db482..ff66162c8 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextBuilder.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextBuilder.java @@ -28,6 +28,7 @@ import com.google.java.contract.*; import org.broad.tribble.Feature; import org.broad.tribble.TribbleException; import org.broad.tribble.util.ParsingUtils; +import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; @@ -344,6 +345,21 @@ public class VariantContextBuilder { return this; } + /** + * Tells us that the resulting VariantContext should have the specified location + * @param loc + * @return + */ + @Requires({"loc.getContig() != null", "loc.getStart() >= 0", "loc.getStop() >= 0"}) + public VariantContextBuilder loc(final GenomeLoc loc) { + this.contig = loc.getContig(); + this.start = loc.getStart(); + this.stop = loc.getStop(); + 
toValidate.add(VariantContext.Validation.ALLELES); + toValidate.add(VariantContext.Validation.REF_PADDING); + return this; + } + /** * Tells us that the resulting VariantContext should have the specified contig chr * @param contig diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java index fc50df3a5..92915faaf 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java @@ -30,10 +30,7 @@ import org.apache.commons.jexl2.JexlEngine; import org.apache.log4j.Logger; import org.broad.tribble.util.popgen.HardyWeinbergCalculation; import org.broadinstitute.sting.commandline.Hidden; -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.codecs.vcf.AbstractVCFCodec; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; @@ -50,6 +47,8 @@ public class VariantContextUtils { public final static String MERGE_FILTER_PREFIX = "filterIn"; final public static JexlEngine engine = new JexlEngine(); + public static final int DEFAULT_PLOIDY = 2; + static { engine.setSilent(false); // will throw errors now for selects that don't evaluate properly engine.setLenient(false); @@ -65,6 +64,21 @@ public class VariantContextUtils { * @return the attributes map provided as input, returned for programming convenience */ public static Map calculateChromosomeCounts(VariantContext vc, Map attributes, boolean removeStaleValues) { + return calculateChromosomeCounts(vc, attributes, removeStaleValues, new HashSet(0)); + } + + /** + * 
Update the attributes of the attributes map given the VariantContext to reflect the + * proper chromosome-based VCF tags + * + * @param vc the VariantContext + * @param attributes the attributes map to populate; must not be null; may contain old values + * @param removeStaleValues should we remove stale values from the mapping? + * @param founderIds - Set of founders Ids to take into account. AF and FC will be calculated over the founders. + * If empty or null, counts are generated for all samples as unrelated individuals + * @return the attributes map provided as input, returned for programming convenience + */ + public static Map calculateChromosomeCounts(VariantContext vc, Map attributes, boolean removeStaleValues, final Set founderIds) { final int AN = vc.getCalledChrCount(); // if everyone is a no-call, remove the old attributes if requested @@ -83,16 +97,20 @@ public class VariantContextUtils { // if there are alternate alleles, record the relevant tags if ( vc.getAlternateAlleles().size() > 0 ) { - final ArrayList alleleFreqs = new ArrayList(); - final ArrayList alleleCounts = new ArrayList(); + ArrayList alleleFreqs = new ArrayList(); + ArrayList alleleCounts = new ArrayList(); + ArrayList foundersAlleleCounts = new ArrayList(); + double totalFoundersChromosomes = (double)vc.getCalledChrCount(founderIds); + int foundersAltChromosomes; for ( Allele allele : vc.getAlternateAlleles() ) { - int altChromosomes = vc.getCalledChrCount(allele); - alleleCounts.add(altChromosomes); + foundersAltChromosomes = vc.getCalledChrCount(allele,founderIds); + alleleCounts.add(vc.getCalledChrCount(allele)); + foundersAlleleCounts.add(foundersAltChromosomes); if ( AN == 0 ) { alleleFreqs.add("0.0"); } else { // todo -- this is a performance problem - final String freq = String.format(makePrecisionFormatStringFromDenominatorValue((double)AN), ((double)altChromosomes / (double)AN)); + final String freq = 
String.format(makePrecisionFormatStringFromDenominatorValue(totalFoundersChromosomes), ((double)foundersAltChromosomes / totalFoundersChromosomes)); alleleFreqs.add(freq); } } @@ -117,12 +135,25 @@ public class VariantContextUtils { * @param removeStaleValues should we remove stale values from the mapping? */ public static void calculateChromosomeCounts(VariantContextBuilder builder, boolean removeStaleValues) { - final VariantContext vc = builder.make(); - final Map attrs = calculateChromosomeCounts(vc, new HashMap(vc.getAttributes()), removeStaleValues); - builder.attributes(attrs); + VariantContext vc = builder.make(); + builder.attributes(calculateChromosomeCounts(vc, new HashMap(vc.getAttributes()), removeStaleValues, new HashSet(0))); } - private static String makePrecisionFormatStringFromDenominatorValue(double maxValue) { + /** + * Update the attributes of the attributes map in the VariantContextBuilder to reflect the proper + * chromosome-based VCF tags based on the current VC produced by builder.make() + * + * @param builder the VariantContextBuilder we are updating + * @param founderIds - Set of founders to take into account. AF and FC will be calculated over the founders only. + * If empty or null, counts are generated for all samples as unrelated individuals + * @param removeStaleValues should we remove stale values from the mapping? 
+ */ + public static void calculateChromosomeCounts(VariantContextBuilder builder, boolean removeStaleValues, final Set founderIds) { + VariantContext vc = builder.make(); + builder.attributes(calculateChromosomeCounts(vc, new HashMap(vc.getAttributes()), removeStaleValues, founderIds)); + } + + public static String makePrecisionFormatStringFromDenominatorValue(double maxValue) { int precision = 1; while ( maxValue > 1 ) { @@ -142,22 +173,22 @@ public class VariantContextUtils { public static VariantContext createVariantContextWithPaddedAlleles(VariantContext inputVC, boolean refBaseShouldBeAppliedToEndOfAlleles) { // see if we need to pad common reference base from all alleles - boolean padVC; + boolean padVC = false; // We need to pad a VC with a common base if the length of the reference allele is less than the length of the VariantContext. // This happens because the position of e.g. an indel is always one before the actual event (as per VCF convention). - long locLength = (inputVC.getEnd() - inputVC.getStart()) + 1; - if (inputVC.hasSymbolicAlleles()) - padVC = true; - else if (inputVC.getReference().length() == locLength) + final int recordLength = inputVC.getEnd() - inputVC.getStart() + 1; + final int referenceLength = inputVC.getReference().length(); + if ( referenceLength == recordLength ) padVC = false; - else if (inputVC.getReference().length() == locLength-1) + else if ( referenceLength == recordLength - 1 ) padVC = true; - else throw new IllegalArgumentException("Badly formed variant context at location " + String.valueOf(inputVC.getStart()) + + else if ( !inputVC.hasSymbolicAlleles() ) + throw new IllegalArgumentException("Badly formed variant context at location " + String.valueOf(inputVC.getStart()) + " in contig " + inputVC.getChr() + ". 
Reference length must be at most one base shorter than location size"); // nothing to do if we don't need to pad bases - if (padVC) { + if ( padVC ) { if ( !inputVC.hasReferenceBaseForIndel() ) throw new ReviewedStingException("Badly formed variant context at location " + inputVC.getChr() + ":" + inputVC.getStart() + "; no padded reference base is available."); @@ -458,7 +489,7 @@ public class VariantContextUtils { /** * Merges VariantContexts into a single hybrid. Takes genotypes for common samples in priority order, if provided. - * If uniqifySamples is true, the priority order is ignored and names are created by concatenating the VC name with + * If uniquifySamples is true, the priority order is ignored and names are created by concatenating the VC name with * the sample name * * @param genomeLocParser loc parser @@ -492,11 +523,11 @@ public class VariantContextUtils { if ( genotypeMergeOptions == GenotypeMergeType.REQUIRE_UNIQUE ) verifyUniqueSampleNames(unsortedVCs); - List prepaddedVCs = sortVariantContextsByPriority(unsortedVCs, priorityListOfVCs, genotypeMergeOptions); + final List prepaddedVCs = sortVariantContextsByPriority(unsortedVCs, priorityListOfVCs, genotypeMergeOptions); // Make sure all variant contexts are padded with reference base in case of indels if necessary - List VCs = new ArrayList(); + final List VCs = new ArrayList(); - for (VariantContext vc : prepaddedVCs) { + for (final VariantContext vc : prepaddedVCs) { // also a reasonable place to remove filtered calls, if needed if ( ! 
filteredAreUncalled || vc.isNotFiltered() ) VCs.add(createVariantContextWithPaddedAlleles(vc, false)); @@ -508,6 +539,7 @@ public class VariantContextUtils { final VariantContext first = VCs.get(0); final String name = first.getSource(); final Allele refAllele = determineReferenceAllele(VCs); + Byte referenceBaseForIndel = null; final Set alleles = new LinkedHashSet(); final Set filters = new TreeSet(); @@ -531,8 +563,8 @@ public class VariantContextUtils { // cycle through and add info from the other VCs, making sure the loc/reference matches - for ( VariantContext vc : VCs ) { - if ( loc.getStart() != vc.getStart() ) // || !first.getReference().equals(vc.getReference()) ) + for ( final VariantContext vc : VCs ) { + if ( loc.getStart() != vc.getStart() ) throw new ReviewedStingException("BUG: attempting to merge VariantContexts with different start sites: first="+ first.toString() + " second=" + vc.toString()); if ( getLocation(genomeLocParser,vc).size() > loc.size() ) @@ -552,6 +584,9 @@ public class VariantContextUtils { filters.addAll(vc.getFilters()); + if ( referenceBaseForIndel == null ) + referenceBaseForIndel = vc.getReferenceBaseForIndel(); + // // add attributes // @@ -581,13 +616,13 @@ public class VariantContextUtils { } } - for (Map.Entry p : vc.getAttributes().entrySet()) { + for (final Map.Entry p : vc.getAttributes().entrySet()) { String key = p.getKey(); // if we don't like the key already, don't go anywhere if ( ! inconsistentAttributes.contains(key) ) { - boolean alreadyFound = attributes.containsKey(key); - Object boundValue = attributes.get(key); - boolean boundIsMissingValue = alreadyFound && boundValue.equals(VCFConstants.MISSING_VALUE_v4); + final boolean alreadyFound = attributes.containsKey(key); + final Object boundValue = attributes.get(key); + final boolean boundIsMissingValue = alreadyFound && boundValue.equals(VCFConstants.MISSING_VALUE_v4); if ( alreadyFound && ! boundValue.equals(p.getValue()) && ! 
boundIsMissingValue ) { // we found the value but we're inconsistent, put it in the exclude list @@ -604,12 +639,12 @@ public class VariantContextUtils { // if we have more alternate alleles in the merged VC than in one or more of the // original VCs, we need to strip out the GL/PLs (because they are no longer accurate), as well as allele-dependent attributes like AC,AF - for ( VariantContext vc : VCs ) { + for ( final VariantContext vc : VCs ) { if (vc.alleles.size() == 1) continue; if ( hasPLIncompatibleAlleles(alleles, vc.alleles)) { if ( ! genotypes.isEmpty() ) - logger.warn(String.format("Stripping PLs at %s due incompatible alleles merged=%s vs. single=%s", + logger.debug(String.format("Stripping PLs at %s due incompatible alleles merged=%s vs. single=%s", genomeLocParser.createGenomeLoc(vc), alleles, vc.alleles)); genotypes = stripPLs(genotypes); // this will remove stale AC,AF attributed from vc @@ -634,11 +669,11 @@ public class VariantContextUtils { setValue = MERGE_INTERSECTION; else if ( nFiltered == VCs.size() ) // everything was filtered out setValue = MERGE_FILTER_IN_ALL; - else if ( variantSources.isEmpty() ) // everyone was reference + else if ( variantSources.isEmpty() ) // everyone was reference setValue = MERGE_REF_IN_ALL; else { - LinkedHashSet s = new LinkedHashSet(); - for ( VariantContext vc : VCs ) + final LinkedHashSet s = new LinkedHashSet(); + for ( final VariantContext vc : VCs ) if ( vc.isVariant() ) s.add( vc.isFiltered() ? MERGE_FILTER_PREFIX + vc.getSource() : vc.getSource() ); setValue = Utils.join("-", s); @@ -661,9 +696,10 @@ public class VariantContextUtils { builder.genotypes(genotypes); builder.log10PError(log10PError); builder.filters(filters).attributes(mergeInfoWithMaxAC ? 
attributesWithMaxAC : attributes); + builder.referenceBaseForIndel(referenceBaseForIndel); // Trim the padded bases of all alleles if necessary - VariantContext merged = createVariantContextWithTrimmedAlleles(builder.make()); + final VariantContext merged = createVariantContextWithTrimmedAlleles(builder.make()); if ( printMessages && remapped ) System.out.printf("Remapped => %s%n", merged); return merged; } @@ -710,8 +746,7 @@ public class VariantContextUtils { else if (refAllele.isNull()) trimVC = false; else { - trimVC = (AbstractVCFCodec.computeForwardClipping(new ArrayList(inputVC.getAlternateAlleles()), - inputVC.getReference().getDisplayString()) > 0); + trimVC = (AbstractVCFCodec.computeForwardClipping(inputVC.getAlternateAlleles(), (byte)inputVC.getReference().getDisplayString().charAt(0)) > 0); } // nothing to do if we don't need to trim bases @@ -719,12 +754,9 @@ public class VariantContextUtils { List alleles = new ArrayList(); GenotypesContext genotypes = GenotypesContext.create(); - // set the reference base for indels in the attributes - Map attributes = new TreeMap(inputVC.getAttributes()); - Map originalToTrimmedAlleleMap = new HashMap(); - for (Allele a : inputVC.getAlleles()) { + for (final Allele a : inputVC.getAlleles()) { if (a.isSymbolic()) { alleles.add(a); originalToTrimmedAlleleMap.put(a, a); @@ -741,11 +773,9 @@ public class VariantContextUtils { // example: mixed records such as {TA*,TGA,TG} boolean hasNullAlleles = false; - for (Allele a: originalToTrimmedAlleleMap.values()) { + for (final Allele a: originalToTrimmedAlleleMap.values()) { if (a.isNull()) hasNullAlleles = true; - if (a.isReference()) - refAllele = a; } if (!hasNullAlleles) @@ -755,7 +785,7 @@ public class VariantContextUtils { List originalAlleles = genotype.getAlleles(); List trimmedAlleles = new ArrayList(); - for ( Allele a : originalAlleles ) { + for ( final Allele a : originalAlleles ) { if ( a.isCalled() ) trimmedAlleles.add(originalToTrimmedAlleleMap.get(a)); else 
@@ -766,12 +796,52 @@ public class VariantContextUtils { } final VariantContextBuilder builder = new VariantContextBuilder(inputVC); - return builder.alleles(alleles).genotypes(genotypes).attributes(attributes).referenceBaseForIndel(new Byte(inputVC.getReference().getBases()[0])).make(); + return builder.alleles(alleles).genotypes(genotypes).referenceBaseForIndel(new Byte(inputVC.getReference().getBases()[0])).make(); } return inputVC; } + public static VariantContext reverseTrimAlleles( final VariantContext inputVC ) { + // see if we need to trim common reference base from all alleles + + final int trimExtent = AbstractVCFCodec.computeReverseClipping(inputVC.getAlleles(), inputVC.getReference().getDisplayString().getBytes(), 0, true, -1); + if ( trimExtent <= 0 || inputVC.getAlleles().size() <= 1 ) + return inputVC; + + final List alleles = new ArrayList(); + final GenotypesContext genotypes = GenotypesContext.create(); + final Map originalToTrimmedAlleleMap = new HashMap(); + + for (final Allele a : inputVC.getAlleles()) { + if (a.isSymbolic()) { + alleles.add(a); + originalToTrimmedAlleleMap.put(a, a); + } else { + // get bases for current allele and create a new one with trimmed bases + final byte[] newBases = Arrays.copyOfRange(a.getBases(), 0, a.length()-trimExtent); + final Allele trimmedAllele = Allele.create(newBases, a.isReference()); + alleles.add(trimmedAllele); + originalToTrimmedAlleleMap.put(a, trimmedAllele); + } + } + + // now we can recreate new genotypes with trimmed alleles + for ( final Genotype genotype : inputVC.getGenotypes() ) { + final List originalAlleles = genotype.getAlleles(); + final List trimmedAlleles = new ArrayList(); + for ( final Allele a : originalAlleles ) { + if ( a.isCalled() ) + trimmedAlleles.add(originalToTrimmedAlleleMap.get(a)); + else + trimmedAlleles.add(Allele.NO_CALL); + } + genotypes.add(Genotype.modifyAlleles(genotype, trimmedAlleles)); + } + + return new VariantContextBuilder(inputVC).stop(inputVC.getStart() + 
alleles.get(0).length() + (inputVC.isMixed() ? -1 : 0)).alleles(alleles).genotypes(genotypes).make(); + } + public static GenotypesContext stripPLs(GenotypesContext genotypes) { GenotypesContext newGs = GenotypesContext.create(genotypes.size()); @@ -837,7 +907,6 @@ public class VariantContextUtils { public AlleleMapper(Map map) { this.map = map; } public boolean needsRemapping() { return this.map != null; } public Collection values() { return map != null ? map.values() : vc.getAlleles(); } - public Allele remap(Allele a) { return map != null && map.containsKey(a) ? map.get(a) : a; } public List remap(List as) { @@ -1069,4 +1138,212 @@ public class VariantContextUtils { names.add(g.getSampleName()); return names; } + + /** + * Assign genotypes (GTs) to the samples in the Variant Context greedily based on the PLs + * + * @param vc variant context with genotype likelihoods + * @return genotypes context + */ + public static GenotypesContext assignDiploidGenotypes(final VariantContext vc) { + return subsetDiploidAlleles(vc, vc.getAlleles(), true); + } + + private static final List NO_CALL_ALLELES = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL); + public static final double SUM_GL_THRESH_NOCALL = -0.1; // if sum(gl) is bigger than this threshold, we treat GL's as non-informative and will force a no-call. 
+ + /** + * subset the Variant Context to the specific set of alleles passed in (pruning the PLs appropriately) + * + * @param vc variant context with genotype likelihoods + * @param allelesToUse which alleles from the vc are okay to use; *** must be in the same relative order as those in the original VC *** + * @param assignGenotypes true if we should update the genotypes based on the (subsetted) PLs + * @return genotypes + */ + public static GenotypesContext subsetDiploidAlleles(final VariantContext vc, + final List allelesToUse, + final boolean assignGenotypes) { + + // the genotypes with PLs + final GenotypesContext oldGTs = vc.getGenotypes(); + + // samples + final List sampleIndices = oldGTs.getSampleNamesOrderedByName(); + + // the new genotypes to create + final GenotypesContext newGTs = GenotypesContext.create(); + + // we need to determine which of the alternate alleles (and hence the likelihoods) to use and carry forward + final int numOriginalAltAlleles = vc.getAlternateAlleles().size(); + final int numNewAltAlleles = allelesToUse.size() - 1; + + // which PLs should be carried forward? + ArrayList likelihoodIndexesToUse = null; + + // an optimization: if we are supposed to use all (or none in the case of a ref call) of the alleles, + // then we can keep the PLs as is; otherwise, we determine which ones to keep + if ( numNewAltAlleles != numOriginalAltAlleles && numNewAltAlleles > 0 ) { + likelihoodIndexesToUse = new ArrayList(30); + + final boolean[] altAlleleIndexToUse = new boolean[numOriginalAltAlleles]; + for ( int i = 0; i < numOriginalAltAlleles; i++ ) { + if ( allelesToUse.contains(vc.getAlternateAllele(i)) ) + altAlleleIndexToUse[i] = true; + } + + // calculateNumLikelihoods takes total # of alleles. 
Use default # of chromosomes (ploidy) = 2 + final int numLikelihoods = GenotypeLikelihoods.calculateNumLikelihoods(1+numOriginalAltAlleles, DEFAULT_PLOIDY); + for ( int PLindex = 0; PLindex < numLikelihoods; PLindex++ ) { + final GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindex); + // consider this entry only if both of the alleles are good + if ( (alleles.alleleIndex1 == 0 || altAlleleIndexToUse[alleles.alleleIndex1 - 1]) && (alleles.alleleIndex2 == 0 || altAlleleIndexToUse[alleles.alleleIndex2 - 1]) ) + likelihoodIndexesToUse.add(PLindex); + } + } + + // create the new genotypes + for ( int k = 0; k < oldGTs.size(); k++ ) { + final Genotype g = oldGTs.get(sampleIndices.get(k)); + if ( !g.hasLikelihoods() ) { + newGTs.add(new Genotype(g.getSampleName(), NO_CALL_ALLELES, Genotype.NO_LOG10_PERROR, null, null, false)); + continue; + } + + // create the new likelihoods array from the alleles we are allowed to use + final double[] originalLikelihoods = g.getLikelihoods().getAsVector(); + double[] newLikelihoods; + if ( likelihoodIndexesToUse == null ) { + newLikelihoods = originalLikelihoods; + } else { + newLikelihoods = new double[likelihoodIndexesToUse.size()]; + int newIndex = 0; + for ( int oldIndex : likelihoodIndexesToUse ) + newLikelihoods[newIndex++] = originalLikelihoods[oldIndex]; + + // might need to re-normalize + newLikelihoods = MathUtils.normalizeFromLog10(newLikelihoods, false, true); + } + + // if there is no mass on the (new) likelihoods, then just no-call the sample + if ( MathUtils.sum(newLikelihoods) > SUM_GL_THRESH_NOCALL ) { + newGTs.add(new Genotype(g.getSampleName(), NO_CALL_ALLELES, Genotype.NO_LOG10_PERROR, null, null, false)); + } + else { + Map attrs = new HashMap(g.getAttributes()); + if ( numNewAltAlleles == 0 ) + attrs.remove(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY); + else + attrs.put(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, 
GenotypeLikelihoods.fromLog10Likelihoods(newLikelihoods)); + + // if we weren't asked to assign a genotype, then just no-call the sample + if ( !assignGenotypes || MathUtils.sum(newLikelihoods) > SUM_GL_THRESH_NOCALL ) + newGTs.add(new Genotype(g.getSampleName(), NO_CALL_ALLELES, Genotype.NO_LOG10_PERROR, null, attrs, false)); + else + newGTs.add(assignDiploidGenotype(g, newLikelihoods, allelesToUse, attrs)); + } + } + + return newGTs; + } + + /** + * Assign genotypes (GTs) to the samples in the Variant Context greedily based on the PLs + * + * @param originalGT the original genotype + * @param newLikelihoods the PL array + * @param allelesToUse the list of alleles to choose from (corresponding to the PLs) + * @param attrs the annotations to use when creating the genotype + * + * @return genotype + */ + private static Genotype assignDiploidGenotype(final Genotype originalGT, final double[] newLikelihoods, final List allelesToUse, final Map attrs) { + final int numNewAltAlleles = allelesToUse.size() - 1; + + // find the genotype with maximum likelihoods + int PLindex = numNewAltAlleles == 0 ? 0 : MathUtils.maxElementIndex(newLikelihoods); + GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindex); + + ArrayList myAlleles = new ArrayList(); + myAlleles.add(allelesToUse.get(alleles.alleleIndex1)); + myAlleles.add(allelesToUse.get(alleles.alleleIndex2)); + + final double qual = numNewAltAlleles == 0 ? Genotype.NO_LOG10_PERROR : GenotypeLikelihoods.getQualFromLikelihoods(PLindex, newLikelihoods); + return new Genotype(originalGT.getSampleName(), myAlleles, qual, null, attrs, false); + } + + /** + * Returns true iff VC is an non-complex indel where every allele represents an expansion or + * contraction of a series of identical bases in the reference. 
+ * + * For example, suppose the ref bases are CTCTCTGA, which includes a 3x repeat of CTCTCT + * + * If VC = -/CT, then this function returns true because the CT insertion matches exactly the + * upcoming reference. + * If VC = -/CTA then this function returns false because the CTA isn't a perfect match + * + * Now consider deletions: + * + * If VC = CT/- then again the same logic applies and this returns true + * The case of CTA/- makes no sense because it doesn't actually match the reference bases. + * + * The logic of this function is pretty simple. Take all of the non-null alleles in VC. For + * each insertion allele of n bases, check if that allele matches the next n reference bases. + * For each deletion allele of n bases, check if this matches the reference bases at n - 2 n, + * as it must necessarily match the first n bases. If this test returns true for all + * alleles you are a tandem repeat, otherwise you are not. + * + * @param vc + * @param refBasesStartingAtVCWithPad not this is assumed to include the PADDED reference + * @return + */ + @Requires({"vc != null", "refBasesStartingAtVCWithPad != null && refBasesStartingAtVCWithPad.length > 0"}) + public static boolean isTandemRepeat(final VariantContext vc, final byte[] refBasesStartingAtVCWithPad) { + final String refBasesStartingAtVCWithoutPad = new String(refBasesStartingAtVCWithPad).substring(1); + if ( ! vc.isIndel() ) // only indels are tandem repeats + return false; + + final Allele ref = vc.getReference(); + + for ( final Allele allele : vc.getAlternateAlleles() ) { + if ( ! 
isRepeatAllele(ref, allele, refBasesStartingAtVCWithoutPad) ) + return false; + } + + // we've passed all of the tests, so we are a repeat + return true; + } + + /** + * Helper function for isTandemRepeat that checks that allele matches somewhere on the reference + * @param ref + * @param alt + * @param refBasesStartingAtVCWithoutPad + * @return + */ + protected static boolean isRepeatAllele(final Allele ref, final Allele alt, final String refBasesStartingAtVCWithoutPad) { + if ( ! Allele.oneIsPrefixOfOther(ref, alt) ) + return false; // we require one allele be a prefix of another + + if ( ref.length() > alt.length() ) { // we are a deletion + return basesAreRepeated(ref.getBaseString(), alt.getBaseString(), refBasesStartingAtVCWithoutPad, 2); + } else { // we are an insertion + return basesAreRepeated(alt.getBaseString(), ref.getBaseString(), refBasesStartingAtVCWithoutPad, 1); + } + } + + protected static boolean basesAreRepeated(final String l, final String s, final String ref, final int minNumberOfMatches) { + final String potentialRepeat = l.substring(s.length()); // skip s bases + + for ( int i = 0; i < minNumberOfMatches; i++) { + final int start = i * potentialRepeat.length(); + final int end = (i+1) * potentialRepeat.length(); + if ( ref.length() < end ) + return false; // we ran out of bases to test + final String refSub = ref.substring(start, end); + if ( ! 
refSub.equals(potentialRepeat) ) + return false; // repeat didn't match, fail + } + + return true; // we passed all tests, we matched + } } diff --git a/public/java/test/org/broadinstitute/sting/BaseTest.java b/public/java/test/org/broadinstitute/sting/BaseTest.java index e33f6717a..c49adf805 100755 --- a/public/java/test/org/broadinstitute/sting/BaseTest.java +++ b/public/java/test/org/broadinstitute/sting/BaseTest.java @@ -84,6 +84,7 @@ public abstract class BaseTest { public static final String hg19Intervals = intervalsLocation + "whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list"; public static final String hg19Chr20Intervals = intervalsLocation + "whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.chr20.interval_list"; + public static final boolean REQUIRE_NETWORK_CONNECTION = true; public static final String networkTempDir; public static final File networkTempDirFile; @@ -108,15 +109,20 @@ public abstract class BaseTest { // Set the Root logger to only output warnings. logger.setLevel(Level.WARN); - networkTempDirFile = IOUtils.tempDir("temp.", ".dir", new File("/broad/shptmp/" + System.getProperty("user.name"))); - networkTempDirFile.deleteOnExit(); - networkTempDir = networkTempDirFile.getAbsolutePath() + "/"; + if ( REQUIRE_NETWORK_CONNECTION ) { + networkTempDirFile = IOUtils.tempDir("temp.", ".dir", new File("/broad/shptmp/" + System.getProperty("user.name"))); + networkTempDirFile.deleteOnExit(); + networkTempDir = networkTempDirFile.getAbsolutePath() + "/"; - // find our file sources -// if (!fileExist(hg18Reference) || !fileExist(hg19Reference) || !fileExist(b36KGReference)) { -// logger.fatal("We can't locate the reference directories. 
Aborting!"); -// throw new RuntimeException("BaseTest setup failed: unable to locate the reference directories"); -// } + // find our file sources + if (!fileExist(hg18Reference) || !fileExist(hg19Reference) || !fileExist(b36KGReference)) { + logger.fatal("We can't locate the reference directories. Aborting!"); + throw new RuntimeException("BaseTest setup failed: unable to locate the reference directories"); + } + } else { + networkTempDir = null; + networkTempDirFile = null; + } } /** diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java index 20f3e1e35..477b76e37 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java @@ -76,7 +76,6 @@ public class DownsamplerBenchmark extends ReadProcessingBenchmark { new ValidationExclusion(Collections.singletonList(ValidationExclusion.TYPE.ALL)), Collections.emptyList(), false, - false, BAQ.CalculationMode.OFF, BAQ.QualityMode.DONT_MODIFY, null, // no BAQ diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/IntervalOverlapFilteringIteratorUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/IntervalOverlapFilteringIteratorUnitTest.java new file mode 100644 index 000000000..1a5e99915 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/IntervalOverlapFilteringIteratorUnitTest.java @@ -0,0 +1,149 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, 
and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.datasources.reads; + +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMRecord; +import net.sf.samtools.SAMSequenceRecord; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +public class IntervalOverlapFilteringIteratorUnitTest { + + private SAMFileHeader header; + private GenomeLoc firstContig; + private GenomeLoc secondContig; + + /** Basic aligned and mapped read. */ + private SAMRecord readMapped; + + /** Read with no contig specified in the read, -L UNMAPPED */ + private SAMRecord readNoReference; + + /** This read has a start position, but is flagged that it's not mapped. */ + private SAMRecord readUnmappedFlag; + + /** This read is from the second contig. 
*/ + private SAMRecord readSecondContig; + + /** This read says it's aligned, but actually has an unknown start. */ + private SAMRecord readUnknownStart; + + /** The above reads in the order one would expect to find them in a sorted BAM. */ + private List testReads; + + @BeforeClass + public void init() { + header = ArtificialSAMUtils.createArtificialSamHeader(3, 1, ArtificialSAMUtils.DEFAULT_READ_LENGTH * 2); + GenomeLocParser genomeLocParser = new GenomeLocParser(header.getSequenceDictionary()); + SAMSequenceRecord record; + + record = header.getSequence(0); + firstContig = genomeLocParser.createGenomeLoc(record.getSequenceName(), 1, record.getSequenceLength()); + record = header.getSequence(1); + secondContig = genomeLocParser.createGenomeLoc(record.getSequenceName(), 1, record.getSequenceLength()); + + readMapped = createMappedRead("mapped", 1); + + readUnmappedFlag = createMappedRead("unmappedFlagged", 2); + readUnmappedFlag.setReadUnmappedFlag(true); + + readSecondContig = createMappedRead("secondContig", 3); + readSecondContig.setReferenceName(secondContig.getContig()); + + /* This read says it's aligned, but to a contig not in the header. 
*/ + SAMRecord readUnknownContig = createMappedRead("unknownContig", 4); + readUnknownContig.setReferenceName("unknownContig"); + + readUnknownStart = createMappedRead("unknownStart", 1); + readUnknownStart.setAlignmentStart(SAMRecord.NO_ALIGNMENT_START); + + readNoReference = createUnmappedRead("unmappedNoReference"); + + testReads = new ArrayList(); + testReads.add(readMapped); + testReads.add(readUnmappedFlag); + testReads.add(readUnknownStart); + testReads.add(readSecondContig); + testReads.add(readUnknownContig); + testReads.add(readNoReference); + } + + @DataProvider(name = "filteringIteratorTestData") + public Object[][] getFilteringIteratorTestData() { + return new Object[][] { + new Object[] {Arrays.asList(firstContig), Arrays.asList(readMapped, readUnmappedFlag, readUnknownStart)}, + new Object[] {Arrays.asList(GenomeLoc.UNMAPPED), Arrays.asList(readNoReference)}, + new Object[] {Arrays.asList(firstContig, secondContig), Arrays.asList(readMapped, readUnmappedFlag, readUnknownStart, readSecondContig)} + }; + } + + @Test(dataProvider = "filteringIteratorTestData") + public void testFilteringIterator(List locs, List expected) { + IntervalOverlapFilteringIterator filterIter = new IntervalOverlapFilteringIterator( + ArtificialSAMUtils.createReadIterator(testReads), locs); + + List actual = new ArrayList(); + while (filterIter.hasNext()) { + actual.add(filterIter.next()); + } + Assert.assertEquals(actual, expected); + } + + @Test(expectedExceptions = ReviewedStingException.class) + public void testMappedAndUnmapped() { + new IntervalOverlapFilteringIterator( + ArtificialSAMUtils.createReadIterator(testReads), + Arrays.asList(firstContig, GenomeLoc.UNMAPPED)); + } + + private SAMRecord createUnmappedRead(String name) { + return ArtificialSAMUtils.createArtificialRead( + header, + name, + SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX, + SAMRecord.NO_ALIGNMENT_START, + ArtificialSAMUtils.DEFAULT_READ_LENGTH); + } + + private SAMRecord createMappedRead(String name, int 
start) { + return ArtificialSAMUtils.createArtificialRead( + header, + name, + 0, + start, + ArtificialSAMUtils.DEFAULT_READ_LENGTH); + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSourceUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSourceUnitTest.java index ba2d68ec9..1c5dab254 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSourceUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSourceUnitTest.java @@ -109,7 +109,6 @@ public class SAMDataSourceUnitTest extends BaseTest { null, new ValidationExclusion(), new ArrayList(), - false, false); Iterable strat = data.createShardIteratorOverMappedReads(seq.getSequenceDictionary(),new LocusShardBalancer()); diff --git a/public/java/test/org/broadinstitute/sting/gatk/filters/BadCigarFilterUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/filters/BadCigarFilterUnitTest.java new file mode 100644 index 000000000..333d35641 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/filters/BadCigarFilterUnitTest.java @@ -0,0 +1,60 @@ +package org.broadinstitute.sting.gatk.filters; + +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.Test; + +/** + * Checks that the Bad Cigar filter works for all kinds of wonky cigars + * + * @author Mauricio Carneiro + * @since 3/20/12 + */ +public class BadCigarFilterUnitTest { + + BadCigarFilter filter; + + @BeforeClass + public void init() { + filter = new BadCigarFilter(); + } + + @Test + public void testWonkyCigars () { + byte[] bases = {'A', 'A', 'A', 'A'}; + byte[] quals = {30, 30, 30, 30}; + GATKSAMRecord read; + // starting with multiple deletions + read = ArtificialSAMUtils.createArtificialRead(bases, quals, "2D4M"); + 
Assert.assertTrue(filter.filterOut(read), read.getCigarString()); + + read = ArtificialSAMUtils.createArtificialRead(bases, quals, "4M2D"); // ending with multiple deletions + Assert.assertTrue(filter.filterOut(read), read.getCigarString()); + + read = ArtificialSAMUtils.createArtificialRead(bases, quals, "3M1I1D"); // adjacent indels AND ends in deletion + Assert.assertTrue(filter.filterOut(read), read.getCigarString()); + + read = ArtificialSAMUtils.createArtificialRead(bases, quals, "1M1I1D2M"); // adjacent indels I->D + Assert.assertTrue(filter.filterOut(read), read.getCigarString()); + + read = ArtificialSAMUtils.createArtificialRead(bases, quals, "1M1D2I1M"); // adjacent indels D->I + Assert.assertTrue(filter.filterOut(read), read.getCigarString()); + + read = ArtificialSAMUtils.createArtificialRead(bases, quals, "1M1I2M1D"); // ends in single deletion with insertion in the middle + Assert.assertTrue(filter.filterOut(read), read.getCigarString()); + + read = ArtificialSAMUtils.createArtificialRead(bases, quals, "4M1D"); // ends in single deletion + Assert.assertTrue(filter.filterOut(read), read.getCigarString()); + + read = ArtificialSAMUtils.createArtificialRead(bases, quals, "1D4M"); // starts with single deletion + Assert.assertTrue(filter.filterOut(read), read.getCigarString()); + + read = ArtificialSAMUtils.createArtificialRead(bases, quals, "2M1D1D2M"); // adjacent D's + Assert.assertTrue(filter.filterOut(read), read.getCigarString()); + + read = ArtificialSAMUtils.createArtificialRead(bases, quals, "1M1I1I1M"); // adjacent I's + Assert.assertTrue(filter.filterOut(read), read.getCigarString()); + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java index 7282d6c48..50a4ce607 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java +++ 
b/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java @@ -43,49 +43,6 @@ public class LocusIteratorByStateUnitTest extends BaseTest { return new LocusIteratorByState(new FakeCloseableIterator(reads.iterator()), readAttributes, genomeLocParser, LocusIteratorByState.sampleListForSAMWithoutReadGroups()); } - @Test - public void testIndelBaseQualityFiltering() { - final byte[] bases = new byte[] {'A','A','A','A','A','A','A','A','A','A'}; - - // create a test version of the Reads object - ReadProperties readAttributes = createTestReadProperties(); - JVMUtils.setFieldValue(JVMUtils.findField(ReadProperties.class,"generateExtendedEvents"),readAttributes,true); - - SAMRecord before = ArtificialSAMUtils.createArtificialRead(header,"before",0,1,10); - before.setReadBases(bases); - before.setBaseQualities(new byte[] {20,20,20,20,0,20,20,20,20,20}); - before.setCigarString("10M"); - - SAMRecord during = ArtificialSAMUtils.createArtificialRead(header,"during",0,2,10); - during.setReadBases(bases); - during.setBaseQualities(new byte[] {20,20,20,20,20,20,20,20,20,20,20}); - during.setCigarString("4M1I6M"); - - SAMRecord after = ArtificialSAMUtils.createArtificialRead(header,"after",0,3,10); - after.setReadBases(bases); - after.setBaseQualities(new byte[] {20,20,0,20,20,20,20,20,20,20}); - after.setCigarString("10M"); - - List reads = Arrays.asList(before,during,after); - - // create the iterator by state with the fake reads and fake records - li = makeLTBS(reads,readAttributes); - - boolean foundExtendedEventPileup = false; - while (li.hasNext()) { - AlignmentContext context = li.next(); - if(!context.hasExtendedEventPileup()) - continue; - - ReadBackedExtendedEventPileup pileup = context.getExtendedEventPileup().getBaseFilteredPileup(10); - Assert.assertEquals(pileup.getLocation().getStart(), 5, "Extended event pileup at wrong location"); - Assert.assertEquals(pileup.getNumberOfElements(), 3, "Pileup size is incorrect"); - - 
foundExtendedEventPileup = true; - } - - Assert.assertTrue(foundExtendedEventPileup,"Extended event pileup not found"); - } @Test public void testIndelsInRegularPileup() { final byte[] bases = new byte[] {'A','A','A','A','A','A','A','A','A','A'}; @@ -93,7 +50,6 @@ public class LocusIteratorByStateUnitTest extends BaseTest { // create a test version of the Reads object ReadProperties readAttributes = createTestReadProperties(); - JVMUtils.setFieldValue(JVMUtils.findField(ReadProperties.class,"generateExtendedEvents"),readAttributes,true); SAMRecord before = ArtificialSAMUtils.createArtificialRead(header,"before",0,1,10); before.setReadBases(bases); @@ -136,59 +92,6 @@ public class LocusIteratorByStateUnitTest extends BaseTest { Assert.assertTrue(foundIndel,"Indel in pileup not found"); } - /** - * Right now, the GATK's extended event pileup DOES NOT include reads which stop immediately before an insertion - * but DOES include reads which stop immediately after an insertion. This is almost certainly WRONG. Eric is - * figuring out the right way to handle this; in the meantime, adding this test to monitor that: - * A) the behavior is consistent - * B) so that we do end up with an automated test for this case when the model is fixed. 
- */ - @Test - public void testIndelPileupContainsAbuttingReads() { - final byte[] bases = new byte[] {'A','A','A','A','A','A','A','A','A','A'}; - final byte[] quals = new byte[] { 20, 20, 20, 20, 20, 20, 20, 20, 20, 20}; - - // create a test version of the Reads object - ReadProperties readAttributes = createTestReadProperties(); - JVMUtils.setFieldValue(JVMUtils.findField(ReadProperties.class,"generateExtendedEvents"),readAttributes,true); - - SAMRecord before = ArtificialSAMUtils.createArtificialRead(header,"before",0,1,10); - before.setReadBases(bases); - before.setBaseQualities(quals); - before.setCigarString("10M"); - - SAMRecord during = ArtificialSAMUtils.createArtificialRead(header,"during",0,6,10); - during.setReadBases(bases); - during.setBaseQualities(quals); - during.setCigarString("5M1I5M"); - - SAMRecord after = ArtificialSAMUtils.createArtificialRead(header,"after",0,11,10); - after.setReadBases(bases); - after.setBaseQualities(quals); - after.setCigarString("10M"); - - List reads = Arrays.asList(before,during,after); - - // create the iterator by state with the fake reads and fake records - li = makeLTBS(reads,readAttributes); - - boolean foundExtendedEventPileup = false; - while (li.hasNext()) { - AlignmentContext context = li.next(); - if(!context.hasExtendedEventPileup()) - continue; - - Assert.assertEquals(context.getLocation().getStart(), 10, "Extended event pileup at wrong location"); - Assert.assertEquals(context.size(), 2, "Pileup size is incorrect"); - Assert.assertEquals(context.getExtendedEventPileup().getReads().get(0), during, "Read in pileup is incorrect"); - Assert.assertEquals(context.getExtendedEventPileup().getReads().get(1), after, "Read in pileup is incorrect"); - - foundExtendedEventPileup = true; - } - - Assert.assertTrue(foundExtendedEventPileup,"Extended event pileup not found"); - } - @Test public void testWholeIndelReadInIsolation() { final int firstLocus = 44367789; @@ -214,17 +117,6 @@ public class 
LocusIteratorByStateUnitTest extends BaseTest { ReadBackedPileup basePileup = alignmentContext.getBasePileup(); Assert.assertEquals(basePileup.getReads().size(),1,"Pileup is of incorrect size"); Assert.assertSame(basePileup.getReads().get(0),indelOnlyRead,"Read in pileup is incorrect"); - - // Turn on extended events, and make sure the event is found. - JVMUtils.setFieldValue(JVMUtils.findField(ReadProperties.class,"generateExtendedEvents"),readAttributes,true); - li = makeLTBS(reads, readAttributes); - - Assert.assertTrue(li.hasNext(),"LocusIteratorByState with extended events should contain exactly one pileup"); - alignmentContext = li.next(); - Assert.assertEquals(alignmentContext.getLocation().getStart(),firstLocus-1,"Extended event pileup is at incorrect location."); - ReadBackedExtendedEventPileup extendedEventPileup = alignmentContext.getExtendedEventPileup(); - Assert.assertEquals(extendedEventPileup.getReads().size(),1,"Pileup is of incorrect size"); - Assert.assertSame(extendedEventPileup.getReads().get(0),indelOnlyRead,"Read in pileup is incorrect"); } /** @@ -232,7 +124,7 @@ public class LocusIteratorByStateUnitTest extends BaseTest { * not negatively influence the ordering of the pileup. */ @Test - public void testWholeIndelReadWithoutExtendedEvents() { + public void testWholeIndelRead() { final int firstLocus = 44367788, secondLocus = firstLocus + 1; SAMRecord leadingRead = ArtificialSAMUtils.createArtificialRead(header,"leading",0,firstLocus,76); @@ -280,70 +172,6 @@ public class LocusIteratorByStateUnitTest extends BaseTest { Assert.assertEquals(numAlignmentContextsFound,2,"Found incorrect number of alignment contexts"); } - /** - * Test to make sure that reads supporting only an indel (example cigar string: 76I) do - * not negatively influence the ordering of the pileup. 
- */ - @Test - public void testWholeIndelReadWithExtendedEvents() { - final int firstLocus = 44367788, secondLocus = firstLocus + 1; - - // create a test version of the Reads object - ReadProperties readAttributes = createTestReadProperties(); - JVMUtils.setFieldValue(JVMUtils.findField(ReadProperties.class,"generateExtendedEvents"),readAttributes,true); - - SAMRecord leadingRead = ArtificialSAMUtils.createArtificialRead(header,"leading",0,firstLocus,76); - leadingRead.setReadBases(Utils.dupBytes((byte)'A',76)); - leadingRead.setBaseQualities(Utils.dupBytes((byte)'@',76)); - leadingRead.setCigarString("1M75I"); - - SAMRecord indelOnlyRead = ArtificialSAMUtils.createArtificialRead(header,"indelOnly",0,secondLocus,76); - indelOnlyRead.setReadBases(Utils.dupBytes((byte)'A',76)); - indelOnlyRead.setBaseQualities(Utils.dupBytes((byte)'@',76)); - indelOnlyRead.setCigarString("76I"); - - SAMRecord fullMatchAfterIndel = ArtificialSAMUtils.createArtificialRead(header,"fullMatch",0,secondLocus,1); - fullMatchAfterIndel.setReadBases(Utils.dupBytes((byte)'A',1)); - fullMatchAfterIndel.setBaseQualities(Utils.dupBytes((byte)'@',1)); - fullMatchAfterIndel.setCigarString("1M"); - - List reads = Arrays.asList(leadingRead,indelOnlyRead,fullMatchAfterIndel); - - // create the iterator by state with the fake reads and fake records - li = makeLTBS(reads,readAttributes); - - Assert.assertTrue(li.hasNext(),"Missing first locus at " + firstLocus); - AlignmentContext alignmentContext = li.next(); - Assert.assertEquals(alignmentContext.getLocation().getStart(),firstLocus,"Incorrect locus at this position; should be " + firstLocus); - List readsAtLocus = alignmentContext.getBasePileup().getReads(); - Assert.assertEquals(readsAtLocus.size(),1,"Wrong number of reads at locus " + firstLocus); - Assert.assertSame(readsAtLocus.get(0),leadingRead,"leadingRead absent from pileup at locus " + firstLocus); - - Assert.assertTrue(li.hasNext(),"Missing extended event at " + firstLocus); - 
alignmentContext = li.next(); - Assert.assertEquals(alignmentContext.getLocation().getStart(),firstLocus,"Incorrect extended event locus at this position; should be " + firstLocus); - readsAtLocus = alignmentContext.getExtendedEventPileup().getReads(); - Assert.assertEquals(readsAtLocus.size(),3,"Wrong number of reads at extended event locus " + firstLocus); - Assert.assertSame(readsAtLocus.get(0),leadingRead,"leadingRead absent from pileup at extended event locus " + firstLocus); - Assert.assertSame(readsAtLocus.get(1),indelOnlyRead,"indelOnlyRead absent from pileup at extended event locus " + firstLocus); - // Weird, but as above, reads immediately after the indel are included in the extended event pileup - Assert.assertSame(readsAtLocus.get(2),fullMatchAfterIndel,"fullMatchAfterIndel absent from pileup at extended event locus " + firstLocus); - - // Traditionally, reads that end with indels bleed into the pileup at the following locus. Verify that the next pileup contains this read - // and considers it to be an indel-containing read. 
- Assert.assertTrue(li.hasNext(),"Missing base pileup at " + secondLocus); - alignmentContext = li.next(); - Assert.assertEquals(alignmentContext.getLocation().getStart(),secondLocus,"Incorrect extended event locus at this position; should be " + secondLocus); - readsAtLocus = alignmentContext.getBasePileup().getReads(); - Assert.assertEquals(readsAtLocus.size(),3,"Wrong number of reads at extended event locus " + secondLocus); - Assert.assertSame(readsAtLocus.get(0),leadingRead,"leadingRead absent from pileup at extended event locus " + secondLocus); - Assert.assertSame(readsAtLocus.get(1),indelOnlyRead,"indelOnlyRead absent from pileup at extended event locus " + secondLocus); - // Weird, but as above, reads immediately after the indel are included in the extended event pileup - Assert.assertSame(readsAtLocus.get(2),fullMatchAfterIndel,"fullMatchAfterIndel absent from pileup at extended event locus " + secondLocus); - - Assert.assertFalse(li.hasNext(),"Too many alignment contexts"); - } - private static ReadProperties createTestReadProperties() { return new ReadProperties( Collections.emptyList(), @@ -354,7 +182,6 @@ public class LocusIteratorByStateUnitTest extends BaseTest { new ValidationExclusion(), Collections.emptyList(), false, - false, BAQ.CalculationMode.OFF, BAQ.QualityMode.DONT_MODIFY, null, // no BAQ diff --git a/public/java/test/org/broadinstitute/sting/gatk/report/GATKReportUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/report/GATKReportUnitTest.java index b3b9ab555..5759204cf 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/report/GATKReportUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/report/GATKReportUnitTest.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011, The Broad Institute + * Copyright (c) 2012, The Broad Institute * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation @@ -29,43 +29,48 @@ import org.testng.Assert; import 
org.testng.annotations.DataProvider; import org.testng.annotations.Test; +import java.io.File; +import java.io.IOException; +import java.io.PrintStream; + public class GATKReportUnitTest extends BaseTest { - @Test(enabled = false) + @Test public void testParse() throws Exception { - String reportPath = validationDataLocation + "exampleGATKReport.eval"; + String reportPath = validationDataLocation + "exampleGATKReportv1.tbl"; GATKReport report = new GATKReport(reportPath); + Assert.assertEquals(report.getVersion(), GATKReportVersion.V1_0); + Assert.assertEquals(report.getTables().size(), 5); GATKReportTable countVariants = report.getTable("CountVariants"); - Assert.assertEquals(countVariants.getVersion(), GATKReportVersion.V0_1); - Object countVariantsPK = countVariants.getPrimaryKey("none.eval.none.all"); - Assert.assertEquals(countVariants.get(countVariantsPK, "nProcessedLoci"), "100000"); - Assert.assertEquals(countVariants.get(countVariantsPK, "nNoCalls"), "99872"); + Object countVariantsPK = countVariants.getPrimaryKeyByData("CountVariants", "dbsnp", "eval", "none", "all"); + Assert.assertEquals(countVariants.get(countVariantsPK, "nProcessedLoci"), "63025520"); + Assert.assertEquals(countVariants.get(countVariantsPK, "nNoCalls"), "0"); + Assert.assertEquals(countVariants.get(countVariantsPK, "heterozygosity"), 4.73e-06); GATKReportTable validationReport = report.getTable("ValidationReport"); - Assert.assertEquals(validationReport.getVersion(), GATKReportVersion.V0_1); - Object validationReportPK = countVariants.getPrimaryKey("none.eval.none.known"); - Assert.assertEquals(validationReport.get(validationReportPK, "sensitivity"), "NaN"); + Object validationReportPK = countVariants.getPrimaryKeyByData("CountVariants", "dbsnp", "eval", "none", "novel"); + Assert.assertEquals(validationReport.get(validationReportPK, "PPV"), Double.NaN); } @DataProvider(name = "rightAlignValues") public Object[][] getRightAlignValues() { - return new Object[][] { - new Object[] {null, 
true}, - new Object[] {"null", true}, - new Object[] {"NA", true}, - new Object[] {"0", true}, - new Object[] {"0.0", true}, - new Object[] {"-0", true}, - new Object[] {"-0.0", true}, - new Object[] {String.valueOf(Long.MAX_VALUE), true}, - new Object[] {String.valueOf(Long.MIN_VALUE), true}, - new Object[] {String.valueOf(Float.MIN_NORMAL), true}, - new Object[] {String.valueOf(Double.MAX_VALUE), true}, - new Object[] {String.valueOf(Double.MIN_VALUE), true}, - new Object[] {String.valueOf(Double.POSITIVE_INFINITY), true}, - new Object[] {String.valueOf(Double.NEGATIVE_INFINITY), true}, - new Object[] {String.valueOf(Double.NaN), true}, - new Object[] {"hello", false} + return new Object[][]{ + new Object[]{null, true}, + new Object[]{"null", true}, + new Object[]{"NA", true}, + new Object[]{"0", true}, + new Object[]{"0.0", true}, + new Object[]{"-0", true}, + new Object[]{"-0.0", true}, + new Object[]{String.valueOf(Long.MAX_VALUE), true}, + new Object[]{String.valueOf(Long.MIN_VALUE), true}, + new Object[]{String.valueOf(Float.MIN_NORMAL), true}, + new Object[]{String.valueOf(Double.MAX_VALUE), true}, + new Object[]{String.valueOf(Double.MIN_VALUE), true}, + new Object[]{String.valueOf(Double.POSITIVE_INFINITY), true}, + new Object[]{String.valueOf(Double.NEGATIVE_INFINITY), true}, + new Object[]{String.valueOf(Double.NaN), true}, + new Object[]{"hello", false} }; } @@ -73,4 +78,156 @@ public class GATKReportUnitTest extends BaseTest { public void testIsRightAlign(String value, boolean expected) { Assert.assertEquals(GATKReportColumn.isRightAlign(value), expected, "right align of '" + value + "'"); } + + private GATKReportTable makeBasicTable() { + GATKReport report = GATKReport.newSimpleReport("TableName", "sample", "value"); + GATKReportTable table = report.getTable("TableName"); + report.addRow("foo.1", "hello"); + report.addRow("foo.2", "world"); + return table; + } + + @Test + public void testDottedSampleName() { + GATKReportTable table = 
makeBasicTable(); + Object pk; + + pk = table.getPrimaryKeyByData("foo.1"); + Assert.assertEquals(table.get(pk, "value"), "hello"); + + pk = table.getPrimaryKeyByData("foo.2"); + Assert.assertEquals(table.get(pk, "value"), "world"); + } + + @Test + public void testFindPrimaryKeyByData() { + GATKReportTable table = makeBasicTable(); + Assert.assertNotNull(table.findPrimaryKeyByData("foo.1")); + Assert.assertNotNull(table.findPrimaryKeyByData("foo.1", "hello")); + Assert.assertNotNull(table.findPrimaryKeyByData("foo.2")); + Assert.assertNotNull(table.findPrimaryKeyByData("foo.2", "world")); + Assert.assertNull(table.findPrimaryKeyByData("list", "longer", "than", "column", "count")); + Assert.assertNull(table.findPrimaryKeyByData("short")); + } + + @Test(expectedExceptions = IllegalArgumentException.class) + public void testEmptyFindPrimaryKeyByData() { + GATKReportTable table = makeBasicTable(); + table.findPrimaryKeyByData(); + } + + @Test(expectedExceptions = NullPointerException.class) + public void testNullFindPrimaryKeyByData() { + GATKReportTable table = makeBasicTable(); + table.findPrimaryKeyByData((Object[]) null); + } + + @Test + public void testSimpleGATKReport() { + // Create a new simple GATK report named "TableName" with columns: Roger, is, and Awesome + GATKReport report = GATKReport.newSimpleReport("TableName", "Roger", "is", "Awesome"); + + // Add data to simple GATK report + report.addRow( 12, 23.45, true); + report.addRow("ans", '3', 24.5); + report.addRow("hi", "", 2.3); + + // Print the report to console + //report.print(System.out); + + try { + File file = createTempFile("GATKReportGatherer-UnitTest", ".tbl"); + //System.out.format("The temporary file" + " has been created: %s%n", file); + PrintStream ps = new PrintStream(file); + report.print(ps); + //System.out.println("File succesfully outputed!"); + GATKReport inputRead = new GATKReport(file); + //System.out.println("File succesfully read!"); + //inputRead.print(System.out); + 
Assert.assertTrue(report.isSameFormat(inputRead)); + + } catch (IOException x) { + System.err.format("IOException: %s%n", x); + } + + } + + @Test + public void testGATKReportGatherer() { + boolean displayPK = false; + + GATKReport report1, report2, report3; + report1 = new GATKReport(); + report1.addTable("TableName", "Description"); + report1.getTable("TableName").addPrimaryKey("id", displayPK); + report1.getTable("TableName").addColumn("colA", GATKReportDataType.String.getDefaultValue(), "%s"); + report1.getTable("TableName").addColumn("colB", GATKReportDataType.Character.getDefaultValue(), "%c"); + report1.getTable("TableName").set(1, "colA", "NotNum"); + report1.getTable("TableName").set(1, "colB", (char) 64); + + report2 = new GATKReport(); + report2.addTable("TableName", "Description"); + report2.getTable("TableName").addPrimaryKey("id", displayPK); + report2.getTable("TableName").addColumn("colA", GATKReportDataType.String.getDefaultValue(), "%s"); + report2.getTable("TableName").addColumn("colB", GATKReportDataType.Character.getDefaultValue(), "%c"); + report2.getTable("TableName").set(2, "colA", "df3"); + report2.getTable("TableName").set(2, "colB", 'A'); + + report3 = new GATKReport(); + report3.addTable("TableName", "Description"); + report3.getTable("TableName").addPrimaryKey("id", displayPK); + report3.getTable("TableName").addColumn("colA", GATKReportDataType.String.getDefaultValue(), "%s"); + report3.getTable("TableName").addColumn("colB", GATKReportDataType.Character.getDefaultValue(), "%c"); + report3.getTable("TableName").set(3, "colA", "df5f"); + report3.getTable("TableName").set(3, "colB", 'c'); + + report1.combineWith(report2); + report1.combineWith(report3); + + report1.addTable("Table2", "To contain some more data types"); + GATKReportTable table = report1.getTable("Table2"); + table.addPrimaryKey("KEY"); + table.addColumn("SomeInt", GATKReportDataType.Integer.getDefaultValue(), true, "%d"); + table.addColumn("SomeFloat", 
GATKReportDataType.Decimal.getDefaultValue(), true, "%.16E"); + table.addColumn("TrueFalse", false, true, "%B"); + table.set("12df", "SomeInt", Byte.MAX_VALUE); + table.set("12df", "SomeFloat", 34.0); + table.set("12df", "TrueFalse", true); + table.set("5f", "SomeInt", Short.MAX_VALUE); + table.set("5f", "SomeFloat", Double.MAX_VALUE); + table.set("5f", "TrueFalse", false); + table.set("RZ", "SomeInt", Long.MAX_VALUE); + table.set("RZ", "SomeFloat", 535646345.657453464576); + table.set("RZ", "TrueFalse", true); + + report1.addTable("Table3", "blah"); + report1.getTable("Table3").addPrimaryKey("HAI"); + report1.getTable("Table3").addColumn("a", true, GATKReportDataType.String.getDefaultFormatString()); + report1.getTable("Table3").set("q", "a", "34"); + report1.getTable("Table3").set("5", "a", "c4g34"); + report1.getTable("Table3").set("573s", "a", "fDlwueg"); + report1.getTable("Table3").set("ZZZ", "a", "Dfs"); + + //report1.print(System.out); + + + try { + File file = createTempFile("GATKReportGatherer-UnitTest", ".tbl"); + //System.out.format("The temporary file" + " has been created: %s%n", file); + PrintStream ps = new PrintStream(file); + report1.print(ps); + //System.out.println("File succesfully outputed!"); + GATKReport inputRead = new GATKReport(file); + //System.out.println("File succesfully read!"); + //inputRead.print(System.out); + Assert.assertTrue(report1.isSameFormat(inputRead)); + Assert.assertTrue(report1.equals(inputRead)); + + } catch (IOException x) { + System.err.format("IOException: %s%n", x); + } + + //Assert.assertEquals(1,1); + + } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/samples/SampleDBUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/samples/SampleDBUnitTest.java index 7f21da4f4..85aa28a98 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/samples/SampleDBUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/samples/SampleDBUnitTest.java @@ -200,6 +200,13 @@ public class 
SampleDBUnitTest extends BaseTest { Assert.assertEquals(db.getChildrenWithParents(true), new HashSet(Arrays.asList(new Sample("kid", "fam1", "dad", "mom", Gender.MALE, Affection.AFFECTED)))); } + @Test() + public void testGetFounderIds(){ + builder.addSamplesFromPedigreeStrings(Arrays.asList(testPEDMultipleFamilies2)); + SampleDB db = builder.getFinalSampleDB(); + Assert.assertEquals(db.getFounderIds(), new HashSet(Arrays.asList("dad","mom","dad2","mom2","dad4"))); + } + @Test() public void loadFamilyIDs() { builder.addSamplesFromPedigreeStrings(Arrays.asList(testPEDMultipleFamilies)); diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/FlagStatIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/FlagStatIntegrationTest.java new file mode 100755 index 000000000..d2acaa588 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/FlagStatIntegrationTest.java @@ -0,0 +1,20 @@ +package org.broadinstitute.sting.gatk.walkers; + +import org.broadinstitute.sting.WalkerTest; +import org.testng.annotations.Test; + +import java.util.Arrays; + +public class FlagStatIntegrationTest extends WalkerTest { + + @Test + public void testFlagStat() { + String md5 = "9c4039662f24bfd23ccf67973cb5df29"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T FlagStat -R " + b36KGReference + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000 -o %s", + 1, + Arrays.asList(md5)); + executeTest("test flag stat", spec); + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/PileupWalkerIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/PileupWalkerIntegrationTest.java index e26d6174b..9d9b91872 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/PileupWalkerIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/PileupWalkerIntegrationTest.java @@ -23,16 +23,4 @@ public class PileupWalkerIntegrationTest 
extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec(gatk_args, 1, Arrays.asList(expected_md5)); executeTest("Testing the standard (no-indel) pileup on three merged FHS pools with 27 deletions in 969 bases", spec); } - - @Test - public void testExtendedEventPileup() { - String gatk_args = "-T Pileup -I " + validationDataLocation + "OV-0930.normal.chunk.bam " - + "-R " + hg18Reference - + " -show_indels -o %s"; - String expected_md5="06eedc2e7927650961d99d703f4301a4"; - WalkerTestSpec spec = new WalkerTestSpec(gatk_args,1,Arrays.asList(expected_md5)); - executeTest("Testing the extended pileup with indel records included on a small chunk of Ovarian dataset with 20 indels (1 D, 19 I)", spec); - - - } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/PrintReadsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/PrintReadsIntegrationTest.java index a35348693..4b4946835 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/PrintReadsIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/PrintReadsIntegrationTest.java @@ -5,49 +5,52 @@ import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.util.Arrays; -import java.util.HashMap; public class PrintReadsIntegrationTest extends WalkerTest { private static class PRTest { - final static String REF = hg18Reference; - final static String BAM = validationDataLocation + "HiSeq.1mb.bam"; - String args; - String md5; + final String reference; + final String bam; + final String args; + final String md5; - private PRTest(String args, String md5) { + private PRTest(String reference, String bam, String args, String md5) { + this.reference = reference; + this.bam = bam; this.args = args; this.md5 = md5; } + + @Override + public String toString() { + return String.format("PRTest(bam='%s', args='%s')", bam, args); + } } @DataProvider(name = "PRTest") - public Object[][] createData1() { + public Object[][] 
createPrintReadsTestData() { return new Object[][]{ - {new PRTest("", "dc8e5451dd29757c336013146010f73a")}, - {new PRTest(" -compress 0", "fde82269c78c9e91e57286433531b4af")}, - {new PRTest(" -simplifyBAM", "0531717b32a7e21c0de70b1526b0751f")}, - {new PRTest(" -n 10", "cdc4ddf9ee1d2ecf37168da8ef23c270")} }; + {new PRTest(hg18Reference, "HiSeq.1mb.bam", "", "dc8e5451dd29757c336013146010f73a")}, + {new PRTest(hg18Reference, "HiSeq.1mb.bam", " -compress 0", "fde82269c78c9e91e57286433531b4af")}, + {new PRTest(hg18Reference, "HiSeq.1mb.bam", " -simplifyBAM", "0531717b32a7e21c0de70b1526b0751f")}, + {new PRTest(hg18Reference, "HiSeq.1mb.bam", " -n 10", "cdc4ddf9ee1d2ecf37168da8ef23c270")}, + // See: GATKBAMIndex.getStartOfLastLinearBin(), BAMScheduler.advance(), IntervalOverlapFilteringIterator.advance() + {new PRTest(b37KGReference, "unmappedFlagReadsInLastLinearBin.bam", "", "0a9ce949d07a84cb33a1a8e3358bf679")}, + {new PRTest(b37KGReference, "unmappedFlagReadsInLastLinearBin.bam", " -L 1", "6e920b8505e7e95d67634b0905237dbc")}, + {new PRTest(b37KGReference, "unmappedFlagReadsInLastLinearBin.bam", " -L unmapped", "13bb9a91b1d4dd2425f73302b8a1ac1c")}, + {new PRTest(b37KGReference, "unmappedFlagReadsInLastLinearBin.bam", " -L 1 -L unmapped", "6e920b8505e7e95d67634b0905237dbc")}, + {new PRTest(b37KGReference, "oneReadAllInsertion.bam", "", "6caec4f8a25befb6aba562955401af93")} + }; } @Test(dataProvider = "PRTest") public void testPrintReads(PRTest params) { WalkerTestSpec spec = new WalkerTestSpec( - "-T PrintReads -R " + params.REF + - " -I " + params.BAM + + "-T PrintReads" + + " -R " + params.reference + + " -I " + validationDataLocation + params.bam + params.args + " -o %s", Arrays.asList(params.md5)); executeTest("testPrintReads-"+params.args, spec).getFirst(); } - - @Test - public void testPrintReadsReadAllInsertion() { - WalkerTestSpec spec = new WalkerTestSpec( - "-T PrintReads -R " + b37KGReference + - " -I " + validationDataLocation + "oneReadAllInsertion.bam" + - " 
-o %s", - Arrays.asList("6caec4f8a25befb6aba562955401af93")); - executeTest("testPrintReads-oneReadAllInsertion", spec); - } } - diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/activeregionqc/CountReadsInActiveRegionsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/activeregionqc/CountReadsInActiveRegionsIntegrationTest.java new file mode 100644 index 000000000..250a3d368 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/activeregionqc/CountReadsInActiveRegionsIntegrationTest.java @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.gatk.walkers.activeregionqc; + +import org.broadinstitute.sting.WalkerTest; +import org.testng.annotations.Test; + +import java.util.Arrays; + +/** + * Tests CountReadsInActiveRegions + */ +public class CountReadsInActiveRegionsIntegrationTest extends WalkerTest { + @Test + public void basicTest() { + WalkerTestSpec spec = new WalkerTestSpec( + "-T CountReadsInActiveRegions -R " + b37KGReference + " -I " + b37GoodNA12878BAM + " -L 20:10,000,000-10,200,000 -o %s", + 1, + Arrays.asList("942d067e6863a3f3524f67dc0aa02ef2")); + executeTest("CountReadsInActiveRegions:", spec); + } +} \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java index 02026b375..7a0d78b88 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java @@ -184,7 +184,18 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { "-T VariantAnnotator -R " + b37KGReference + " -A TransmissionDisequilibriumTest --variant:vcf " + validationDataLocation + "ug.random50000.subset300bp.chr1.family.vcf" + " -L " + validationDataLocation + "ug.random50000.subset300bp.chr1.family.vcf -NO_HEADER -ped " + validationDataLocation + "ug.random50000.family.ped -o %s", 1, Arrays.asList(MD5)); - executeTest("Testing TDT annotation", spec); + executeTest("Testing TDT annotation ", spec); + } + + + @Test + public void testChromosomeCountsPed() { + final String MD5 = "32df3ceb63c277df442ed55fb8684933"; + WalkerTestSpec spec = new WalkerTestSpec( + "-T VariantAnnotator -R " + b37KGReference + " -A ChromosomeCounts --variant:vcf " + validationDataLocation + "ug.random50000.subset300bp.chr1.family.vcf" + + " -L " + 
validationDataLocation + "ug.random50000.subset300bp.chr1.family.vcf -NO_HEADER -ped " + validationDataLocation + "ug.random50000.family.ped -o %s", 1, + Arrays.asList(MD5)); + executeTest("Testing ChromosomeCounts annotation with PED file", spec); } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGathererUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGathererUnitTest.java new file mode 100644 index 000000000..3829d2808 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGathererUnitTest.java @@ -0,0 +1,84 @@ +package org.broadinstitute.sting.gatk.walkers.bqsr; + +import org.broadinstitute.sting.gatk.report.GATKReport; +import org.broadinstitute.sting.gatk.report.GATKReportTable; +import org.testng.Assert; +import org.testng.annotations.Test; + +import java.io.File; +import java.util.LinkedList; +import java.util.List; + +/** + * @author Mauricio Carneiro + * @since 3/7/12 + */ +public class BQSRGathererUnitTest { + RecalibrationArgumentCollection RAC; + + private static File recal = new File("public/testdata/exampleGRP.grp"); + + //todo -- this test doesnt work because the primary keys in different tables are not the same. 
Need to either implement "sort" for testing purposes on GATKReport or have a sophisticated comparison measure + @Test(enabled = false) + public void testCombineSimilarFiles() { + BQSRGatherer gatherer = new BQSRGatherer(); + List recalFiles = new LinkedList (); + File output = new File("foo.grp"); + recalFiles.add(recal); + recalFiles.add(recal); + gatherer.gather(recalFiles, output); + + GATKReport originalReport = new GATKReport(recal); + GATKReport calculatedReport = new GATKReport(output); + for (GATKReportTable originalTable : originalReport.getTables()) { + GATKReportTable calculatedTable = calculatedReport.getTable(originalTable.getTableName()); + List columnsToTest = new LinkedList(); + columnsToTest.add(RecalDataManager.NUMBER_OBSERVATIONS_COLUMN_NAME); + columnsToTest.add(RecalDataManager.NUMBER_ERRORS_COLUMN_NAME); + if (originalTable.getTableName().equals(RecalDataManager.ARGUMENT_REPORT_TABLE_TITLE)) { // these tables must be IDENTICAL + columnsToTest.add(RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME); + testTablesWithColumnsAndFactor(originalTable, calculatedTable, columnsToTest, 1); + } + + else if (originalTable.getTableName().equals(RecalDataManager.QUANTIZED_REPORT_TABLE_TITLE)) { + columnsToTest.add(RecalDataManager.QUANTIZED_COUNT_COLUMN_NAME); + testTablesWithColumnsAndFactor(originalTable, calculatedTable, columnsToTest, 2); + } + + else if (originalTable.getTableName().startsWith("RecalTable")) { + testTablesWithColumnsAndFactor(originalTable, calculatedTable, columnsToTest, 2); + } + } + } + + /** + * Common testing functionality given the columns to test and the multiplication factor to the expected result + * + * @param original the original table + * @param calculated the calculated table + * @param columnsToTest list of columns to test. 
All columns will be tested with the same criteria (equality given factor) + * @param factor 1 to test for equality, any other value to multiply the original value and match with the calculated + */ + private void testTablesWithColumnsAndFactor(GATKReportTable original, GATKReportTable calculated, List columnsToTest, int factor) { + for (Object primaryKey : original.getPrimaryKeys()) { // tables don't necessarily have the same primary keys + for (String column : columnsToTest) { + Object actual = calculated.get(primaryKey, column); + Object expected = original.get(primaryKey, column); + + if (factor != 1) { + if (expected instanceof Double) + expected = (Double) expected * factor; + else if (expected instanceof Long) + expected = (Long) expected * factor; + else if (expected instanceof Integer) + expected = (Integer) expected * factor; + else if (expected instanceof Byte) { + expected = (Byte) expected * factor; + } + } + Assert.assertEquals(actual, expected, "Primary key: " + primaryKey + " Original Table: " + original.getTableName() + " Calc Table: " + calculated.getTableName()); + } + } + + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRKeyManagerUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRKeyManagerUnitTest.java new file mode 100644 index 000000000..c65cc3f63 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRKeyManagerUnitTest.java @@ -0,0 +1,143 @@ +package org.broadinstitute.sting.gatk.walkers.bqsr; + +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.BitSet; +import java.util.LinkedList; +import java.util.List; + 
+/** + * @author Mauricio Carneiro + * @since 3/7/12 + */ +public class BQSRKeyManagerUnitTest { + RecalibrationArgumentCollection RAC; + + @BeforeClass + public void init() { + RAC = new RecalibrationArgumentCollection(); + } + + @Test(enabled = false) + public void testCombineBitSets() { + final int nRequired = 2; + final ArrayList covariates = new ArrayList(); + covariates.add(new ReadGroupCovariate()); + covariates.add(new QualityScoreCovariate()); + covariates.add(new CycleCovariate()); + covariates.add(new ContextCovariate()); + createReadAndTest(covariates, nRequired); + } + + @Test(enabled = true) + public void testOnlyRequiredCovariates() { + final int nRequired = 2; + final ArrayList covariates = new ArrayList(2); + covariates.add(new ReadGroupCovariate()); + covariates.add(new QualityScoreCovariate()); + createReadAndTest(covariates, nRequired); + } + + @Test(enabled = true) + public void testOnlyOneCovariate() { + final int nRequired = 1; + final ArrayList covariates = new ArrayList(2); + covariates.add(new ReadGroupCovariate()); + createReadAndTest(covariates, nRequired); + } + + @Test(enabled = false) + public void testOneCovariateWithOptionalCovariates() { + final int nRequired = 1; + final ArrayList covariates = new ArrayList(4); + covariates.add(new ReadGroupCovariate()); + covariates.add(new QualityScoreCovariate()); + covariates.add(new CycleCovariate()); + covariates.add(new ContextCovariate()); + createReadAndTest(covariates, nRequired); + } + + private void createReadAndTest(List covariates, int nRequired) { + int readLength = 1000; + GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(ReadUtils.createRandomReadBases(readLength, true), ReadUtils.createRandomReadQuals(readLength), readLength + "M"); + read.setReadGroup(new GATKSAMReadGroupRecord("ID")); + read.getReadGroup().setPlatform("illumina"); + + runTestOnRead(read, covariates, nRequired); + read.setReadNegativeStrandFlag(true); + runTestOnRead(read, covariates, nRequired); + 
read.setReadPairedFlag(true); + read.setSecondOfPairFlag(true); + runTestOnRead(read, covariates, nRequired); + read.setReadNegativeStrandFlag(false); + runTestOnRead(read, covariates, nRequired); + } + + private void runTestOnRead(GATKSAMRecord read, List covariateList, int nRequired) { + final BitSet[][][] covariateKeys = new BitSet[covariateList.size()][EventType.values().length][]; + int i = 0; + for (Covariate cov : covariateList) { + cov.initialize(RAC); + CovariateValues covValues = cov.getValues(read); + covariateKeys[i][EventType.BASE_SUBSTITUTION.index] = covValues.getMismatches(); + covariateKeys[i][EventType.BASE_INSERTION.index] = covValues.getInsertions(); + covariateKeys[i][EventType.BASE_DELETION.index] = covValues.getDeletions(); + i++; + } + List requiredCovariates = new LinkedList(); + List optionalCovariates = new LinkedList(); + + for (int j=0; j hashKeys = keyManager.bitSetsFromAllKeys(keySet, EventType.eventFrom(eventType)); + short cov = 0; + for (BitSet key : hashKeys) { + Object[] actual = keyManager.keySetFrom(key).toArray(); + + // Build the expected array + Object[] expected = new Object[nRequired + (optionalCovariates.size() > 0 ? 
3 : 1)]; + System.arraycopy(expectedRequired, 0, expected, 0, nRequired); + if (optionalCovariates.size() > 0) { + expected[expected.length-3] = expectedCovariate[cov]; + expected[expected.length-2] = optionalCovariates.get(cov++).getClass().getSimpleName().split("Covariate")[0]; + } + expected[expected.length-1] = EventType.eventFrom(eventType); + +// System.out.println("Actual : " + Utils.join(",", Arrays.asList(actual))); +// System.out.println("Expected: " + Utils.join(",", Arrays.asList(expected))); +// System.out.println(); + + for (int k = 0; k < expected.length; k++) + Assert.assertEquals(actual[k], expected[k]); + } + } + } + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariateUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariateUnitTest.java index aa6a72ef9..5a522e81e 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariateUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariateUnitTest.java @@ -1,103 +1,63 @@ package org.broadinstitute.sting.gatk.walkers.bqsr; -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.clipping.ClippingRepresentation; +import org.broadinstitute.sting.utils.clipping.ReadClipper; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; import org.testng.Assert; import org.testng.annotations.BeforeClass; import org.testng.annotations.Test; import java.util.BitSet; -import java.util.Random; /** - * Short one line description of the walker. - * - *

- * [Long description of the walker] - *

- * - * - *

Input

- *

- * [Description of the Input] - *

- * - *

Output

- *

- * [Description of the Output] - *

- * - *

Examples

- *
- *    java
- *      -jar GenomeAnalysisTK.jar
- *      -T [walker name]
- *  
- * * @author Mauricio Carneiro * @since 3/1/12 */ public class ContextCovariateUnitTest { ContextCovariate covariate; RecalibrationArgumentCollection RAC; - Random random; @BeforeClass public void init() { RAC = new RecalibrationArgumentCollection(); covariate = new ContextCovariate(); - random = GenomeAnalysisEngine.getRandomGenerator(); covariate.initialize(RAC); } @Test(enabled = true) public void testSimpleContexts() { - byte [] quals = createRandomReadQuals(101); - byte [] bbases = createRandomReadBases(101); - String bases = stringFrom(bbases); - GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(bbases, quals, bbases.length + "M"); + GATKSAMRecord read = ReadUtils.createRandomRead(1000); + GATKSAMRecord clippedRead = ReadClipper.clipLowQualEnds(read, RAC.LOW_QUAL_TAIL, ClippingRepresentation.WRITE_NS); CovariateValues values = covariate.getValues(read); - verifyCovariateArray((BitSet []) values.getMismatches(), RAC.MISMATCHES_CONTEXT_SIZE, bases); - verifyCovariateArray((BitSet []) values.getInsertions(), RAC.INSERTIONS_CONTEXT_SIZE, bases); - verifyCovariateArray((BitSet []) values.getDeletions(), RAC.DELETIONS_CONTEXT_SIZE, bases); - } - - private void verifyCovariateArray(BitSet[] values, int contextSize, String bases) { - for (int i=0; i= contextSize) - Assert.assertEquals(MathUtils.dnaFrom(values[i]), bases.substring(i-contextSize, i)); - else - Assert.assertNull(values[i]); - } + verifyCovariateArray(values.getMismatches(), RAC.MISMATCHES_CONTEXT_SIZE, clippedRead, covariate); + verifyCovariateArray(values.getInsertions(), RAC.INSERTIONS_CONTEXT_SIZE, clippedRead, covariate); + verifyCovariateArray(values.getDeletions(), RAC.DELETIONS_CONTEXT_SIZE, clippedRead, covariate); } - private String stringFrom(byte [] array) { + public static void verifyCovariateArray(BitSet[] values, int contextSize, GATKSAMRecord read, Covariate contextCovariate) { + for (int i = 0; i < values.length; i++) + 
Assert.assertEquals(contextCovariate.keyFromBitSet(values[i]), expectedContext(read, i, contextSize)); + + } + + public static String expectedContext (GATKSAMRecord read, int offset, int contextSize) { + final String bases = stringFrom(read.getReadBases()); + String expectedContext = null; + if (offset - contextSize + 1 >= 0) { + String context = bases.substring(offset - contextSize + 1, offset + 1); + if (!context.contains("N")) + expectedContext = context; + } + return expectedContext; + } + + private static String stringFrom(byte[] array) { String s = ""; for (byte value : array) s += (char) value; return s; } - private byte [] createRandomReadQuals(int length) { - byte [] quals = new byte[length]; - for (int i=0; i requestedCovariates = new ArrayList(4); + requestedCovariates.add(rgCov); + requestedCovariates.add(qsCov); + requestedCovariates.add(coCov); + requestedCovariates.add(cyCov); + + ReadCovariates rc = RecalDataManager.computeCovariates(read, requestedCovariates); + + // check that the length is correct + Assert.assertEquals(rc.getMismatchesKeySet().length, length); + Assert.assertEquals(rc.getInsertionsKeySet().length, length); + Assert.assertEquals(rc.getDeletionsKeySet().length, length); + + for (int i = 0; i < length; i++) { + // check that read group is always the same + Assert.assertEquals(rgCov.keyFromBitSet(rc.getMismatchesKeySet(i)[0]), RGID); + Assert.assertEquals(rgCov.keyFromBitSet(rc.getInsertionsKeySet(i)[0]), RGID); + Assert.assertEquals(rgCov.keyFromBitSet(rc.getDeletionsKeySet(i)[0]), RGID); + + // check quality score + Assert.assertEquals(qsCov.keyFromBitSet(rc.getMismatchesKeySet(i)[1]), "" + mQuals[i]); + Assert.assertEquals(qsCov.keyFromBitSet(rc.getInsertionsKeySet(i)[1]), "" + iQuals[i]); + Assert.assertEquals(qsCov.keyFromBitSet(rc.getDeletionsKeySet(i)[1]), "" + dQuals[i]); + + // check context + Assert.assertEquals(coCov.keyFromBitSet(rc.getMismatchesKeySet(i)[2]), ContextCovariateUnitTest.expectedContext(read, i, 
RAC.MISMATCHES_CONTEXT_SIZE)); + Assert.assertEquals(coCov.keyFromBitSet(rc.getInsertionsKeySet(i)[2]), ContextCovariateUnitTest.expectedContext(read, i, RAC.INSERTIONS_CONTEXT_SIZE)); + Assert.assertEquals(coCov.keyFromBitSet(rc.getDeletionsKeySet(i)[2]), ContextCovariateUnitTest.expectedContext(read, i, RAC.DELETIONS_CONTEXT_SIZE)); + + // check cycle + Assert.assertEquals(cyCov.keyFromBitSet(rc.getMismatchesKeySet(i)[3]), "" + (i+1)); + Assert.assertEquals(cyCov.keyFromBitSet(rc.getInsertionsKeySet(i)[3]), "" + (i+1)); + Assert.assertEquals(cyCov.keyFromBitSet(rc.getDeletionsKeySet(i)[3]), "" + (i+1)); + } + + } + +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ReadGroupCovariateUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ReadGroupCovariateUnitTest.java new file mode 100644 index 000000000..f087ef0dd --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ReadGroupCovariateUnitTest.java @@ -0,0 +1,57 @@ +package org.broadinstitute.sting.gatk.walkers.bqsr; + +import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.Test; + +import java.util.BitSet; + +/** + * @author Mauricio Carneiro + * @since 3/1/12 + */ +public class ReadGroupCovariateUnitTest { + ReadGroupCovariate covariate; + RecalibrationArgumentCollection RAC; + + @BeforeClass + public void init() { + RAC = new RecalibrationArgumentCollection(); + covariate = new ReadGroupCovariate(); + covariate.initialize(RAC); + } + + @Test(enabled = true) + public void testSingleRecord() { + final String expected = "SAMPLE.1"; + GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord("MY.ID"); + rg.setPlatformUnit(expected); + runTest(rg, expected); + } + + @Test(enabled = true) + public void 
testMissingPlatformUnit() { + final String expected = "MY.7"; + GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord(expected); + runTest(rg, expected); + } + + private void runTest(GATKSAMReadGroupRecord rg, String expected) { + GATKSAMRecord read = ReadUtils.createRandomRead(10); + read.setReadGroup(rg); + CovariateValues values = covariate.getValues(read); + verifyCovariateArray(values.getMismatches(), expected); + + } + + private void verifyCovariateArray(BitSet[] values, String expected) { + for (BitSet value : values) { + String actual = covariate.keyFromBitSet(value); + Assert.assertEquals(actual, expected); + } + } + +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationReportUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationReportUnitTest.java new file mode 100644 index 000000000..b39d21d80 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationReportUnitTest.java @@ -0,0 +1,130 @@ +package org.broadinstitute.sting.gatk.walkers.bqsr; + +import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; +import org.testng.Assert; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.PrintStream; +import java.util.*; + +/** + * @author carneiro + * @since 4/21/12 + */ +public class RecalibrationReportUnitTest { + @Test(enabled = false) + public void testOutput() { + final int length = 100; + + List quals = new ArrayList(QualityUtils.MAX_QUAL_SCORE + 1); + List counts = new ArrayList(QualityUtils.MAX_QUAL_SCORE + 1); + + for (int i = 0; i<= QualityUtils.MAX_QUAL_SCORE; i++) { + quals.add((byte) i); + counts.add(1L); + } + + final QuantizationInfo 
quantizationInfo = new QuantizationInfo(quals, counts); + final RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection(); + final LinkedHashMap> keysAndTablesMap = new LinkedHashMap>(); + + quantizationInfo.noQuantization(); + final List requiredCovariates = new LinkedList(); + final List optionalCovariates = new LinkedList(); + final List requestedCovariates = new LinkedList(); + + final ReadGroupCovariate rgCovariate = new ReadGroupCovariate(); + rgCovariate.initialize(RAC); + requiredCovariates.add(rgCovariate); + final BQSRKeyManager rgKeyManager = new BQSRKeyManager(requiredCovariates, optionalCovariates); + keysAndTablesMap.put(rgKeyManager, new HashMap()); + + final QualityScoreCovariate qsCovariate = new QualityScoreCovariate(); + qsCovariate.initialize(RAC); + requiredCovariates.add(qsCovariate); + final BQSRKeyManager qsKeyManager = new BQSRKeyManager(requiredCovariates, optionalCovariates); + keysAndTablesMap.put(qsKeyManager, new HashMap()); + + final ContextCovariate cxCovariate = new ContextCovariate(); + cxCovariate.initialize(RAC); + optionalCovariates.add(cxCovariate); + final CycleCovariate cyCovariate = new CycleCovariate(); + cyCovariate.initialize(RAC); + optionalCovariates.add(cyCovariate); + BQSRKeyManager cvKeyManager = new BQSRKeyManager(requiredCovariates, optionalCovariates); + keysAndTablesMap.put(cvKeyManager, new HashMap()); + + for (Covariate cov : requiredCovariates) + requestedCovariates.add(cov); + for (Covariate cov : optionalCovariates) + requestedCovariates.add(cov); + + final GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord("id"); + rg.setPlatform("illumina"); + final GATKSAMRecord read = ReadUtils.createRandomRead(length, false); + read.setReadGroup(rg); + final byte [] readQuals = new byte[length]; + for (int i = 0; i < length; i++) + readQuals[i] = 20; + read.setBaseQualities(readQuals); + + + final int expectedKeys = expectedNumberOfKeys(4, length, RAC.INSERTIONS_CONTEXT_SIZE, 
RAC.MISMATCHES_CONTEXT_SIZE); + int nKeys = 0; // keep track of how many keys were produced + final ReadCovariates rc = RecalDataManager.computeCovariates(read, requestedCovariates); + for (int offset = 0; offset < length; offset++) { + for (Map.Entry> entry : keysAndTablesMap.entrySet()) { + BQSRKeyManager keyManager = entry.getKey(); + Map table = entry.getValue(); + + for (BitSet key : keyManager.bitSetsFromAllKeys(rc.getMismatchesKeySet(offset), EventType.BASE_SUBSTITUTION)) { + table.put(key, RecalDatum.createRandomRecalDatum(10000, 10)); + nKeys++; + } + + for (BitSet key : keyManager.bitSetsFromAllKeys(rc.getInsertionsKeySet(offset), EventType.BASE_INSERTION)) { + table.put(key, RecalDatum.createRandomRecalDatum(100000, 10)); + nKeys++; + } + + + for (BitSet key : keyManager.bitSetsFromAllKeys(rc.getDeletionsKeySet(offset), EventType.BASE_DELETION)) { + table.put(key, RecalDatum.createRandomRecalDatum(100000, 10)); + nKeys++; + } + + } + } + Assert.assertEquals(nKeys, expectedKeys); + + RecalibrationReport report = new RecalibrationReport(quantizationInfo, keysAndTablesMap, RAC.generateReportTable(), RAC); + + File output = new File("RecalibrationReportUnitTestOutuput.grp"); + PrintStream out; + try { + out = new PrintStream(output); + } catch (FileNotFoundException e) { + throw new ReviewedStingException("couldn't create the file " + output, e); + } + report.output(out); + + RecalibrationReport loadedReport = new RecalibrationReport(output); + + Assert.assertTrue(report.equals(loadedReport)); + if (!output.delete()) + throw new ReviewedStingException("File could not be deleted " + output); + } + + private static int expectedNumberOfKeys (int nCovariates, int readLength, int indelContextSize, int mismatchesContextSize) { + int nommcs = readLength >= mismatchesContextSize ? mismatchesContextSize-1 : readLength; + int noincs = readLength >= indelContextSize ? 
2*(indelContextSize-1) : 2*readLength; + return (nCovariates * readLength * 3) - nommcs - noincs; + } + +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/ErrorRatePerCycleIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/ErrorRatePerCycleIntegrationTest.java index accb9c0cf..7c705de18 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/ErrorRatePerCycleIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/ErrorRatePerCycleIntegrationTest.java @@ -35,7 +35,7 @@ public class ErrorRatePerCycleIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( "-T ErrorRatePerCycle -R " + b37KGReference + " -I " + b37GoodBAM + " -L 20:10,000,000-10,100,000 -o %s", 1, - Arrays.asList("0cc212ecb6df300e321784039ff29f13")); + Arrays.asList("71685716c7dde64c51bbd908c06ea742")); executeTest("ErrorRatePerCycle:", spec); } } \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupPropertiesIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupPropertiesIntegrationTest.java index 1a4c8db30..0f3750abd 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupPropertiesIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupPropertiesIntegrationTest.java @@ -38,7 +38,7 @@ public class ReadGroupPropertiesIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( "-T ReadGroupProperties -R " + b37KGReference + " -I " + b37GoodBAM + " -L 20:10,000,000-11,000,000 -o %s", 1, - Arrays.asList("6b8cce223af28cbadcfe87a3b841fc56")); + Arrays.asList("3f1f97a1d2c5fb552ed4f33ea30d136d")); executeTest("ReadGroupProperties:", spec); } } \ No newline at end of file diff --git 
a/public/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjectsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjectsIntegrationTest.java index 9b79653c6..4a83c34cc 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjectsIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjectsIntegrationTest.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011, The Broad Institute + * Copyright (c) 2012, The Broad Institute * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation @@ -50,8 +50,8 @@ public class DiffObjectsIntegrationTest extends WalkerTest { @DataProvider(name = "data") public Object[][] createData() { - new TestParams(testDir + "diffTestMaster.vcf", testDir + "diffTestTest.vcf", "da3dc85a0e35a9aade5520591891b4fa"); - new TestParams(testDir + "exampleBAM.bam", testDir + "exampleBAM.simple.bam", "7dc8200730313e6753237a696296fb73"); + new TestParams(testDir + "diffTestMaster.vcf", testDir + "diffTestTest.vcf", "dba5eab2b9587c1062721b164e4fd9a6"); + new TestParams(testDir + "exampleBAM.bam", testDir + "exampleBAM.simple.bam", "de35c93450b46db5fc5516af3c55d62a"); return TestParams.getTests(TestParams.class); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ArtificialReadPileupTestProvider.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ArtificialReadPileupTestProvider.java new file mode 100644 index 000000000..b1720e509 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ArtificialReadPileupTestProvider.java @@ -0,0 +1,211 @@ +/* + * Copyright (c) 2010. 
+ * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ +package org.broadinstitute.sting.gatk.walkers.genotyper; + +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMReadGroupRecord; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.variantcontext.Allele; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder; + +import java.util.*; + + +public class ArtificialReadPileupTestProvider { + final int contigStart = 1; + final int contigStop = 10; + final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, contigStop - contigStart + 1); +// final GATKSAMReadGroupRecord artificialGATKRG = new GATKSAMReadGroupRecord("synthetic"); + final String artificialContig = "chr1"; + // final int artificialContigIndex = 0; + final String artificialReadName = "synth"; + final int artificialRefStart = 1; + final int artificialMappingQuality = 60; + Map sample2RG = new HashMap(); + List sampleRGs; + + final String refBases = "AGGATACTGT"; + List sampleNames = new ArrayList(); + private String sampleName(int i) { return sampleNames.get(i); } + private SAMReadGroupRecord sampleRG(String name) { return sample2RG.get(name); } + public final int offset = 5; + public final GenomeLocParser genomeLocParser = new GenomeLocParser(header.getSequenceDictionary()); + public final GenomeLoc loc = genomeLocParser.createGenomeLoc(artificialContig,offset,offset); + public final GenomeLoc window = 
genomeLocParser.createGenomeLoc(artificialContig,artificialRefStart,10); + public final ReferenceContext referenceContext = new ReferenceContext(genomeLocParser,loc,window,this.refBases.getBytes()); + + byte BASE_QUAL = 50; + + public ArtificialReadPileupTestProvider(final int numSamples, final String SAMPLE_PREFIX) { + sampleRGs = new ArrayList(); + + for ( int i = 0; i < numSamples; i++ ) { + sampleNames.add(String.format("%s%04d", SAMPLE_PREFIX, i)); + SAMReadGroupRecord rg = createRG(sampleName(i)); + sampleRGs.add(rg); + sample2RG.put(sampleName(i), rg); + } + + } + + public ArtificialReadPileupTestProvider(final int numSamples, final String SAMPLE_PREFIX, final byte q) { + this(numSamples,SAMPLE_PREFIX); + BASE_QUAL = q; + } + public List getSampleNames() { + return sampleNames; + } + public byte getRefByte() { + return refBases.substring(offset,offset+1).getBytes()[0]; + } + + public ReferenceContext getReferenceContext() { return referenceContext;} + public GenomeLocParser getGenomeLocParser() { return genomeLocParser; } + + public Map getAlignmentContextFromAlleles(int eventLength, String altBases, int[] numReadsPerAllele) { + // RefMetaDataTracker tracker = new RefMetaDataTracker(null,referenceContext); + + + ArrayList vcAlleles = new ArrayList(); + Allele refAllele, altAllele; + if (eventLength == 0) {// SNP case + refAllele =Allele.create(refBases.substring(offset,offset+1),true); + altAllele = Allele.create(altBases.substring(0,1), false); + + } else if (eventLength>0){ + // insertion + refAllele = Allele.create(Allele.NULL_ALLELE_STRING, true); + altAllele = Allele.create(altBases.substring(0,eventLength), false); + } + else { + // deletion + refAllele =Allele.create(refBases.substring(offset,offset+Math.abs(eventLength)),true); + altAllele = Allele.create(Allele.NULL_ALLELE_STRING, false); + } + int stop = loc.getStart(); + vcAlleles.add(refAllele); + vcAlleles.add(altAllele); + + final VariantContextBuilder builder = new 
VariantContextBuilder().source(""); + builder.loc(loc.getContig(), loc.getStart(), stop); + builder.alleles(vcAlleles); + builder.referenceBaseForIndel(referenceContext.getBase()); + builder.noGenotypes(); + + final VariantContext vc = builder.make(); + + Map contexts = new HashMap(); + + for (String sample: sampleNames) { + AlignmentContext context = new AlignmentContext(loc, generateRBPForVariant(loc,vc, altBases, numReadsPerAllele, sample)); + contexts.put(sample,context); + + } + + return contexts; + } + + private SAMReadGroupRecord createRG(String name) { + SAMReadGroupRecord rg = new SAMReadGroupRecord(name); + rg.setPlatform("ILLUMINA"); + rg.setSample(name); + return rg; + } + private ReadBackedPileup generateRBPForVariant( GenomeLoc loc, VariantContext vc, String altBases, + int[] numReadsPerAllele, String sample) { + List pileupElements = new ArrayList(); + int readStart = contigStart; + int offset = (contigStop-contigStart+1)/2; + int refAlleleLength = 0; + int readCounter = 0; + int alleleCounter = 0; + for (Allele allele: vc.getAlleles()) { + if (allele.isReference()) + refAlleleLength = allele.getBases().length; + + int alleleLength = allele.getBases().length; + + for ( int d = 0; d < numReadsPerAllele[alleleCounter]; d++ ) { + byte[] readBases = trueHaplotype(allele, offset, refAlleleLength); + byte[] readQuals = new byte[readBases.length]; + Arrays.fill(readQuals, (byte)BASE_QUAL); + + GATKSAMRecord read = new GATKSAMRecord(header); + read.setBaseQualities(readQuals); + read.setReadBases(readBases); + read.setReadName(artificialReadName+readCounter++); + + boolean isBeforeDeletion = false, isBeforeInsertion = false; + if (allele.isReference()) + read.setCigarString(readBases.length + "M"); + else { + isBeforeDeletion = alleleLengthrefAlleleLength; + if (isBeforeDeletion || isBeforeInsertion) + read.setCigarString(offset+"M"+ alleleLength + (isBeforeDeletion?"D":"I") + + (readBases.length-offset)+"M"); + else // SNP case + 
read.setCigarString(readBases.length+"M"); + } + + int eventLength = (isBeforeDeletion?refAlleleLength:(isBeforeInsertion?alleleLength:0)); + read.setReadPairedFlag(false); + read.setAlignmentStart(readStart); + read.setMappingQuality(artificialMappingQuality); + read.setReferenceName(loc.getContig()); + read.setReadNegativeStrandFlag(false); + read.setAttribute("RG", sampleRG(sample).getReadGroupId()); + + + pileupElements.add(new PileupElement(read,offset,false,isBeforeDeletion, false, isBeforeInsertion,false,false,altBases.substring(0,alleleLength),eventLength)); + } + alleleCounter++; + } + + return new ReadBackedPileupImpl(loc,pileupElements); + } + + private byte[] trueHaplotype(Allele allele, int offset, int refAlleleLength) { + // create haplotype based on a particular allele + String prefix = refBases.substring(offset); + String alleleBases = new String(allele.getBases()); + String postfix = refBases.substring(offset+refAlleleLength,refBases.length()); + + return (prefix+alleleBases+postfix).getBytes(); + + + + } + +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModelUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModelUnitTest.java index c7d196b53..964d768c4 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModelUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModelUnitTest.java @@ -1,7 +1,6 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.GenotypesContext; @@ -18,7 +17,7 @@ public class ExactAFCalculationModelUnitTest extends BaseTest { static double[] AA1, AB1, BB1; static 
double[] AA2, AB2, AC2, BB2, BC2, CC2; static final int numSamples = 3; - static double[][] priors = new double[2][2*numSamples+1]; // flat priors + static double[] priors = new double[2*numSamples+1]; // flat priors @BeforeSuite public void before() { @@ -83,26 +82,30 @@ public class ExactAFCalculationModelUnitTest extends BaseTest { @Test(dataProvider = "getGLs") public void testGLs(GetGLsTest cfg) { - final AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(2, 2*numSamples); - for ( int i = 0; i < 2; i++ ) { - for ( int j = 0; j < 2*numSamples+1; j++ ) { - result.log10AlleleFrequencyLikelihoods[i][j] = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED; - result.log10AlleleFrequencyPosteriors[i][j] = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED; - } - } + final AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(2); - ExactAFCalculationModel.linearExactMultiAllelic(cfg.GLs, cfg.numAltAlleles, priors, result, false); + ExactAFCalculationModel.linearExactMultiAllelic(cfg.GLs, cfg.numAltAlleles, priors, result); int nameIndex = 1; for ( int allele = 0; allele < cfg.numAltAlleles; allele++, nameIndex+=2 ) { int expectedAlleleCount = Integer.valueOf(cfg.name.substring(nameIndex, nameIndex+1)); - int calculatedAlleleCount = MathUtils.maxElementIndex(result.log10AlleleFrequencyPosteriors[allele]); + int calculatedAlleleCount = result.getAlleleCountsOfMAP()[allele]; - if ( result.log10AlleleFrequencyPosteriors[0][0] == AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED ) { - Assert.assertTrue(calculatedAlleleCount == expectedAlleleCount || result.log10AlleleFrequencyPosteriors[0][calculatedAlleleCount] < result.log10PosteriorOfAFzero); - } else { - Assert.assertEquals(calculatedAlleleCount, expectedAlleleCount); - } + Assert.assertEquals(calculatedAlleleCount, expectedAlleleCount); } } + + @Test + public void testLargeGLs() { + + final double[] BB = new double[]{-20000000.0, -20000000.0, 0.0}; + 
GetGLsTest cfg = new GetGLsTest("B6", 1, createGenotype("1", BB), createGenotype("2", BB), createGenotype("3", BB)); + + final AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(2); + + ExactAFCalculationModel.linearExactMultiAllelic(cfg.GLs, cfg.numAltAlleles, priors, result); + + int calculatedAlleleCount = result.getAlleleCountsOfMAP()[0]; + Assert.assertEquals(calculatedAlleleCount, 6); + } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsUnitTest.java new file mode 100644 index 000000000..e4c3b8dae --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsUnitTest.java @@ -0,0 +1,102 @@ +package org.broadinstitute.sting.gatk.walkers.genotyper; + +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMReadGroupRecord; +import org.apache.commons.lang.ArrayUtils; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.Walker; +import org.broadinstitute.sting.gatk.walkers.genotyper.IndelGenotypeLikelihoodsCalculationModel; +import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; +import 
org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.variantcontext.Allele; +import org.broadinstitute.sting.utils.variantcontext.Genotype; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder; + +import java.util.*; +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertTrue; + +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.BeforeSuite; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +/** + * Created by IntelliJ IDEA. + * User: delangel + * Date: 3/22/12 + * Time: 11:24 AM + * To change this template use File | Settings | File Templates. + */ +public class IndelGenotypeLikelihoodsUnitTest extends BaseTest { + + final String refBases = "AGGATACTGT"; + final int nSamples = 1; + final int[] numReadsPerAllele = new int[]{10,10}; + final String SAMPLE_PREFIX = "sample"; + + + final UnifiedArgumentCollection UAC = new UnifiedArgumentCollection(); + final Logger logger = Logger.getLogger(Walker.class); + final IndelGenotypeLikelihoodsCalculationModel model = new IndelGenotypeLikelihoodsCalculationModel(UAC,logger); + + ArtificialReadPileupTestProvider pileupProvider; + + @BeforeSuite + public void before() { + pileupProvider = new ArtificialReadPileupTestProvider(nSamples, SAMPLE_PREFIX); + } + + @Test + public void testBasicConsensusCounts() { + // 4 inserted bases, min cnt = 10 + String altBases = "CCTCCTGAGA"; + int eventLength = 4; + List alleles = getConsensusAlleles(eventLength,true,10,0.1, altBases); + + Assert.assertEquals(alleles.size(),2); + Assert.assertEquals(alleles.get(1).getBaseString(), altBases.substring(0,eventLength)); + + + + //altBases = "CCTCMTGAGA"; + + eventLength = 3; + 
alleles = getConsensusAlleles(eventLength,false,10,0.1, altBases); + Assert.assertEquals(alleles.size(),2); + Assert.assertEquals(alleles.get(0).getBaseString(), refBases.substring(pileupProvider.offset,pileupProvider.offset+eventLength)); + + // same with min Reads = 11 + alleles = getConsensusAlleles(eventLength,false,11,0.1, altBases); + Assert.assertEquals(alleles.size(),0); + + // increase required fraction per sample to just below threshold + alleles = getConsensusAlleles(eventLength,false,10,0.49999, altBases); + Assert.assertEquals(alleles.size(),2); + alleles = getConsensusAlleles(eventLength,false,10,0.5001, altBases); + Assert.assertEquals(alleles.size(),0); + } + + private List getConsensusAlleles(int eventLength, boolean isInsertion, int minCnt, double minFraction, String altBases) { + final ConsensusAlleleCounter counter = new ConsensusAlleleCounter(pileupProvider.genomeLocParser, true, minCnt, minFraction); + return counter.computeConsensusAlleles(pileupProvider.referenceContext, + pileupProvider.getAlignmentContextFromAlleles(isInsertion?eventLength:-eventLength,altBases,numReadsPerAllele), + AlignmentContextUtils.ReadOrientation.COMPLETE); + + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index b3bd0253c..464dfb06e 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -17,8 +17,8 @@ import java.util.Map; public class UnifiedGenotyperIntegrationTest extends WalkerTest { - private final static String baseCommand = "-T UnifiedGenotyper -R " + b36KGReference + " -nosl -NO_HEADER -glm BOTH --dbsnp " + b36dbSNP129; - private final static String baseCommandIndels = "-T UnifiedGenotyper -R " + b36KGReference + " 
-nosl -NO_HEADER -glm INDEL -mbq 20 --dbsnp " + b36dbSNP129; + private final static String baseCommand = "-T UnifiedGenotyper -R " + b36KGReference + " -nosl -NO_HEADER -glm BOTH -minIndelFrac 0.0 --dbsnp " + b36dbSNP129; + private final static String baseCommandIndels = "-T UnifiedGenotyper -R " + b36KGReference + " -nosl -NO_HEADER -glm INDEL -mbq 20 -minIndelFrac 0.0 --dbsnp " + b36dbSNP129; private final static String baseCommandIndelsb37 = "-T UnifiedGenotyper -R " + b37KGReference + " -nosl -NO_HEADER -glm INDEL -mbq 20 --dbsnp " + b37dbSNP132; // -------------------------------------------------------------------------------------------------------------- @@ -30,7 +30,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultiSamplePilot1() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1, - Arrays.asList("8f81a14fffc1a59b4b066f8595dc1232")); + Arrays.asList("9b08dc6800ba11bc6d9f6ccf392a60fe")); executeTest("test MultiSample Pilot1", spec); } @@ -54,7 +54,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testSingleSamplePilot2() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000", 1, - Arrays.asList("c5b53231f4f6d9524bc4ec8115f44f5c")); + Arrays.asList("d275e0f75368dbff012ea8655dce3444")); executeTest("test SingleSample Pilot2", spec); } @@ -62,17 +62,33 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultipleSNPAlleles() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " -nosl -NO_HEADER -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + validationDataLocation + "multiallelic.snps.bam -o %s -L " + validationDataLocation + 
"multiallelic.snps.intervals", 1, - Arrays.asList("0de4aeed6a52f08ed86a7642c812478b")); + Arrays.asList("ec907c65da5ed9b6046404b0f81422d4")); executeTest("test Multiple SNP alleles", spec); } + @Test + public void testBadRead() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T UnifiedGenotyper -R " + b37KGReference + " -nosl -NO_HEADER -glm BOTH -I " + validationDataLocation + "badRead.test.bam -o %s -L 1:22753424-22753464", 1, + Arrays.asList("7678827a2ee21870a41c09d28d26b996")); + executeTest("test bad read", spec); + } + + @Test + public void testReverseTrim() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T UnifiedGenotyper -R " + b37KGReference + " -nosl -NO_HEADER -glm INDEL -I " + validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam -o %s -L 20:10289124 -L 20:10090289", 1, + Arrays.asList("a70593bbb5042e2d0e46e3c932cae170")); + executeTest("test reverse trim", spec); + } + // -------------------------------------------------------------------------------------------------------------- // // testing compressed output // // -------------------------------------------------------------------------------------------------------------- - private final static String COMPRESSED_OUTPUT_MD5 = "a08df9aea2b3df09cf90ff8e6e3be3ea"; + private final static String COMPRESSED_OUTPUT_MD5 = "1e3c897794e5763a8720807686707b18"; @Test public void testCompressedOutput() { @@ -93,7 +109,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { // Note that we need to turn off any randomization for this to work, so no downsampling and no annotations - String md5 = "6358934c1c26345013a38261b8c45aa4"; + String md5 = "06d11ed89f02f08911e100df0f7db7a4"; WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommand + " -dt NONE -G none -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,075,000", 1, @@ -122,32 +138,43 @@ public class 
UnifiedGenotyperIntegrationTest extends WalkerTest { // -------------------------------------------------------------------------------------------------------------- @Test - public void testCallingParameters() { - HashMap e = new HashMap(); - e.put( "--min_base_quality_score 26", "258c1b33349eb3b2d395ec4d69302725" ); - - for ( Map.Entry entry : e.entrySet() ) { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 " + entry.getKey(), 1, - Arrays.asList(entry.getValue())); - executeTest(String.format("test calling parameter[%s]", entry.getKey()), spec); - } + public void testMinBaseQualityScore() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 --min_base_quality_score 26", 1, + Arrays.asList("258c1b33349eb3b2d395ec4d69302725")); + executeTest("test min_base_quality_score 26", spec); } @Test public void testSLOD() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b36KGReference + " -NO_HEADER -glm BOTH --dbsnp " + b36dbSNP129 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1, - Arrays.asList("6172d2f3d370132f4c57a26aa94c256e")); + Arrays.asList("e9d23a08472e4e27b4f25e844f5bad57")); executeTest("test SLOD", spec); } + @Test + public void testNDA() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + baseCommand + " --annotateNDA -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1, + Arrays.asList("443b2f8882393c4c65277c34cdb6060c")); + executeTest("test NDA", spec); + } + + @Test + public void testCompTrack() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T UnifiedGenotyper -R " + 
b36KGReference + " -NO_HEADER -glm BOTH -comp:FOO " + b36dbSNP129 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1, + Arrays.asList("71251d8893649ea9abd5d9aa65739ba1")); + executeTest("test using comp track", spec); + } + @Test public void testOutputParameter() { HashMap e = new HashMap(); e.put( "-sites_only", "44f3b5b40e6ad44486cddfdb7e0bfcd8" ); - e.put( "--output_mode EMIT_ALL_CONFIDENT_SITES", "553f6b4cbf380885bec9dd634cf68742" ); - e.put( "--output_mode EMIT_ALL_SITES", "6d8624e45ad9dae5803ac705b39e4ffa" ); + e.put( "--output_mode EMIT_ALL_CONFIDENT_SITES", "ecf92054c1e4bd9d6529b8002d385165" ); + e.put( "--output_mode EMIT_ALL_SITES", "e10819a2a7960254e27ed2b958b45d56" ); for ( Map.Entry entry : e.entrySet() ) { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( @@ -181,8 +208,8 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { @Test public void testHeterozyosity() { HashMap e = new HashMap(); - e.put( 0.01, "926b58038dd4989bf7eda697a847eea9" ); - e.put( 1.0 / 1850, "93f44105b43b65730a3b821e27b0fa16" ); + e.put( 0.01, "d07e5ca757fbcb1c03f652f82265c2f8" ); + e.put( 1.0 / 1850, "d1fb9186e6f39f2bcf5d0edacd8f7fe2" ); for ( Map.Entry entry : e.entrySet() ) { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( @@ -206,7 +233,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,100,000", 1, - Arrays.asList("a1b75a7e12b160b0be823228c958573f")); + Arrays.asList("623be1fd8b63a01bfe35ac864d5199fe")); executeTest(String.format("test multiple technologies"), spec); } @@ -225,7 +252,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -L 1:10,000,000-10,100,000" + " -baq CALCULATE_AS_NECESSARY", 1, - Arrays.asList("3bda1279cd6dcb47885f3e19466f11b9")); + Arrays.asList("40ea10c0238c3be2991d31ae72476884")); executeTest(String.format("test calling with BAQ"), spec); } @@ -244,7 +271,7 @@ public class 
UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("d9fc3ba94a0d46029778c7b457e7292a")); + Arrays.asList("c9b0bd900a4ec949adfbd28909581eeb")); executeTest(String.format("test indel caller in SLX"), spec); } @@ -259,7 +286,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -minIndelCnt 1" + " -L 1:10,000,000-10,100,000", 1, - Arrays.asList("b2e30ae3e5ffa6108f9f6178b1d2e679")); + Arrays.asList("6b7c8691c527facf9884c2517d943f2f")); executeTest(String.format("test indel caller in SLX with low min allele count"), spec); } @@ -272,7 +299,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("2cd182a84613fa91a6020466d2d327e2")); + Arrays.asList("d72603aa33a086d64d4dddfd2995552f")); executeTest(String.format("test indel calling, multiple technologies"), spec); } @@ -282,7 +309,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "indelAllelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("9cd08dc412a007933381e9c76c073899")); + Arrays.asList("4a59fe207949b7d043481d7c1b786573")); executeTest("test MultiSample Pilot2 indels with alleles passed in", spec); } @@ -292,7 +319,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "indelAllelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("5ef1f007d3ef77c1b8f31e5e036eff53")); + Arrays.asList("a8a9ccf30bddee94bb1d300600794ee7")); executeTest("test MultiSample Pilot2 indels with 
alleles passed in and emitting all sites", spec); } @@ -300,13 +327,13 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultiSampleIndels() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1, - Arrays.asList("52340d578a708fa709b69ce48987bc9d")); + Arrays.asList("99e278baa2367b2bb016e2f37139d12f")); List result = executeTest("test MultiSample Pilot1 CEU indels", spec1).getFirst(); WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1, - Arrays.asList("9566c7abef5ee5829a516d90445b347f")); + Arrays.asList("c43ac445130161b8250bfbdc6c67782a")); executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2); } @@ -335,4 +362,37 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { UserException.class); executeTest("testSnpEffAnnotationRequestedWithoutRodBinding", spec); } + + // -------------------------------------------------------------------------------------------------------------- + // + // testing minIndelFraction + // + // -------------------------------------------------------------------------------------------------------------- + + final static String assessMinIndelFraction = baseCommandIndelsb37 + " -I " + validationDataLocation + + "978604.bam -L 1:978,586-978,626 -o %s --sites_only -rf Sample -goodSM 7377 -goodSM 22-0022 -goodSM 134 -goodSM 344029-53 -goodSM 14030"; + + @Test + public void testMinIndelFraction0() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + assessMinIndelFraction + " -minIndelFrac 0.0", 1, + Arrays.asList("973178b97efd2daacc9e45c414275d59")); + executeTest("test minIndelFraction 0.0", spec); + } + 
+ @Test + public void testMinIndelFraction25() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + assessMinIndelFraction + " -minIndelFrac 0.25", 1, + Arrays.asList("220facd2eb0923515d1d8ab874055564")); + executeTest("test minIndelFraction 0.25", spec); + } + + @Test + public void testMinIndelFraction100() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + assessMinIndelFraction + " -minIndelFrac 1", 1, + Arrays.asList("50fe9a4c5633f6395b45d9ec1e00d56a")); + executeTest("test minIndelFraction 1.0", spec); + } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreatorIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreatorIntegrationTest.java index 26e23e016..5c0123769 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreatorIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreatorIntegrationTest.java @@ -26,7 +26,7 @@ public class RealignerTargetCreatorIntegrationTest extends WalkerTest { @Test public void testIntervals2() { - String md5 = "e0f745b79b679c225314a2abef4919ff"; + String md5 = "d073237694175c75d37bd4f40b8c64db"; WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( "-T RealignerTargetCreator --known " + b36dbSNP129 + " -R " + b36KGReference + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,200,000 -o %s", diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingIntegrationTest.java index e1d22f107..b78e76e07 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingIntegrationTest.java @@ -80,4 +80,24 
@@ public class ReadBackedPhasingIntegrationTest extends WalkerTest { executeTest("MAX 10 het sites [TEST SIX]; require PQ >= 10; cacheWindow = 20000; has inconsistent sites", spec); } + @Test + public void test7() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "CEU.trio.2010_03.genotypes.hg18.vcf", 20000, 10, 10) + + " -L chr20:332341-802503", + 1, + Arrays.asList("c37548b333b65f58d0edfc5c2a62a28a")); + executeTest("Use trio-phased VCF, but ignore its phasing [TEST SEVEN]", spec); + } + + @Test + public void test8() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "CEU.trio.2010_03.genotypes.hg18.vcf", 20000, 10, 10) + + " -L chr20:332341-802503" + " -respectPhaseInInput", + 1, + Arrays.asList("dfc7cdddd702e63d46d04f61a3ecd720")); + executeTest("Use trio-phased VCF, and respect its phasing [TEST EIGHT]", spec); + } + } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/validation/ValidationSiteSelectorIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/validation/ValidationSiteSelectorIntegrationTest.java new file mode 100644 index 000000000..d7c866a0a --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/validation/ValidationSiteSelectorIntegrationTest.java @@ -0,0 +1,90 @@ +package org.broadinstitute.sting.gatk.walkers.validation; + +import org.broadinstitute.sting.WalkerTest; +import org.testng.annotations.Test; + +import java.util.Arrays; + +/** + * Created by IntelliJ IDEA. + * User: delangel + * Date: 3/26/12 + * Time: 3:29 PM + * To change this template use File | Settings | File Templates. 
+ */ +public class ValidationSiteSelectorIntegrationTest extends WalkerTest { + public static String baseTestString(String args) { + return "-T ValidationSiteSelector -R " + b36KGReference + " -L 1 -o %s -NO_HEADER -numSites 100 " + args; + } + + private static String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf"; + private static String samplesFile = validationDataLocation + "SelectVariants.samples.txt"; + private static String samplePrefix = " -sf " + samplesFile; + private static String freqUnif = " --frequencySelectionMode UNIFORM "; + private static String freqAF = " --frequencySelectionMode KEEP_AF_SPECTRUM "; + private static String sampleNone = " -sampleMode NONE "; + private static String sampleGT = samplePrefix+" -sampleMode POLY_BASED_ON_GT "; + private static String sampleGL = samplePrefix+" -sampleMode POLY_BASED_ON_GL -samplePNonref 0.95"; + + + @Test(enabled=true) + public void testNoSampleSelectionFreqUniform() { + + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString(sampleNone + freqUnif + "--variant " + testfile), + 1, + Arrays.asList("d49baeb8000a426c172ce1d81eb37963") + ); + + executeTest("testNoSampleSelectionFreqUniform--" + testfile, spec); + } + + @Test(enabled=true) + public void testNoSampleSelectionFreqAF() { + + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString(sampleNone + freqAF + "--variant " + testfile), + 1, + Arrays.asList("0fb0d015d462c34514fc7e96beea5f56") + ); + + executeTest("testNoSampleSelectionFreqAF--" + testfile, spec); + } + + @Test(enabled=true) + public void testPolyGTFreqUniform() { + + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString(sampleGT + freqUnif + "--variant " + testfile), + 1, + Arrays.asList("0672854299d42ea8af906976a3849ae6") + ); + + executeTest("testPolyGTFreqUniform--" + testfile, spec); + } + + @Test(enabled=true) + public void testPolyGTFreqAF() { + + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString(sampleGT + freqAF + "--variant " + 
testfile), + 1, + Arrays.asList("5bdffda1a063d0bddd6b236854ec627d") + ); + + executeTest("testPolyGTFreqAF--" + testfile, spec); + } + + @Test(enabled=true) + public void testPolyGLFreqAF() { + + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString(sampleGL + freqAF + "--variant " + testfile), + 1, + Arrays.asList("35ef16aa41303606a4b94f7b88bd9aa8") + ); + + executeTest("testPolyGLFreqAF--" + testfile, spec); + } + +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java index 36c093e8f..71c014f2c 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java @@ -1,6 +1,31 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + package org.broadinstitute.sting.gatk.walkers.varianteval; import org.broadinstitute.sting.WalkerTest; +import org.broadinstitute.sting.utils.exceptions.UserException; import org.testng.annotations.Test; import java.util.Arrays; @@ -9,6 +34,8 @@ public class VariantEvalIntegrationTest extends WalkerTest { private static String variantEvalTestDataRoot = validationDataLocation + "VariantEval"; private static String fundamentalTestVCF = variantEvalTestDataRoot + "/" + "FundamentalsTest.annotated.db.subset.snps_and_indels.vcf"; private static String fundamentalTestSNPsVCF = variantEvalTestDataRoot + "/" + "FundamentalsTest.annotated.db.subset.final.vcf"; + private static String fundamentalTestSNPsSplit1of2VCF = variantEvalTestDataRoot + "/" + "FundamentalsTest.annotated.db.subset.final.split_1_of_2.vcf"; + private static String fundamentalTestSNPsSplit2of2VCF = variantEvalTestDataRoot + "/" + "FundamentalsTest.annotated.db.subset.final.split_2_of_2.vcf"; private static String fundamentalTestSNPsOneSampleVCF = variantEvalTestDataRoot + "/" + "FundamentalsTest.annotated.db.subset.final.NA12045.vcf"; private static String cmdRoot = "-T VariantEval" + @@ -30,7 +57,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("c8d8bffa5c572df9dec7364f71a1b943") + Arrays.asList("e87932ffa1d310cecee49e7829a0f056") ); executeTest("testFunctionClassWithSnpeff", spec); } @@ -50,7 +77,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("081fcaa532c7ba8f23da739389e6f7c3") + Arrays.asList("8279ee42a6785f9c2b3dda8d82674e00") ); executeTest("testStratifySamplesAndExcludeMonomorphicSites", spec); } @@ -70,7 +97,7 @@ public class 
VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("b3852f84d07c270b8a12874083c3e31b") + Arrays.asList("0bac64d5615f901d3005247c6d016549") ); executeTest("testFundamentalsCountVariantsSNPsandIndels", spec); } @@ -91,7 +118,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("cf70468b5ebaec408419da69b0a7fcb9") + Arrays.asList("b84d8b4429116c887ceb5489c8782f00") ); executeTest("testFundamentalsCountVariantsSNPsandIndelsWithNovelty", spec); } @@ -113,7 +140,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("5e3b8b85acfc41365c8208c23abf746b") + Arrays.asList("e4f37642d9113a65fbe8bc1d091c206f") ); executeTest("testFundamentalsCountVariantsSNPsandIndelsWithNoveltyAndFilter", spec); } @@ -134,7 +161,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("ccdbc50d30ece6d0d3b199c397f03ed3") + Arrays.asList("c5412ee824b4815dc8eea62a4c5462ef") ); executeTest("testFundamentalsCountVariantsSNPsandIndelsWithCpG", spec); } @@ -155,7 +182,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("95c690d5af8ed51573eb2f0503dcd9c2") + Arrays.asList("1d42e97643afd3e7f5f8c9f6416c5883") ); executeTest("testFundamentalsCountVariantsSNPsandIndelsWithFunctionalClass", spec); } @@ -176,7 +203,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("8e8547eb38b34bec0095b0500fd9641d") + Arrays.asList("8c2ba70bed2f0fdb0ca371f7038819ef") ); executeTest("testFundamentalsCountVariantsSNPsandIndelsWithDegeneracy", spec); } @@ -197,7 +224,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("158a4651a656aea7f84c79548f6fe519") + Arrays.asList("c912b4b0bf1925d042119b301c183b93") ); executeTest("testFundamentalsCountVariantsSNPsandIndelsWithSample", spec); } @@ -220,7 +247,7 @@ public class VariantEvalIntegrationTest extends 
WalkerTest { "-o %s" ), 1, - Arrays.asList("76c8a0b28d2993644120f7afa5833ab2") + Arrays.asList("dea3d2cc53265ff8ed2f0030c40f3747") ); executeTest("testFundamentalsCountVariantsSNPsandIndelsWithJexlExpression", spec); } @@ -245,7 +272,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("34682193f458b93b39efac00b4fc6723") + Arrays.asList("dede22b15936c38e29b850c805c7b706") ); executeTest("testFundamentalsCountVariantsSNPsandIndelsWithMultipleJexlExpressions", spec); } @@ -264,7 +291,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("52f6655f1532bcea24b402010d93ce73") + Arrays.asList("9a94c4c613bf69feb3d9579c353baaf2") ); executeTest("testFundamentalsCountVariantsNoCompRod", spec); } @@ -277,7 +304,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { " --eval " + validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf" + " --comp:comp_genotypes,VCF3 " + validationDataLocation + "yri.trio.gatk.ug.head.vcf"; WalkerTestSpec spec = new WalkerTestSpec(withSelect(tests, "DP < 50", "DP50") + " " + extraArgs + " -ST CpG -o %s", - 1, Arrays.asList("c49e239292704447a36e01ee9a71e729")); + 1, Arrays.asList("8d4530e9cef8531c46bbb693b84d04c7")); executeTestParallel("testSelect1", spec); } @@ -287,7 +314,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec(cmdRoot + " -ST CpG --eval:VCF3 " + validationDataLocation + vcfFile + " --comp:VCF3 " + validationDataLocation + "GenotypeConcordanceComp.vcf -noEV -EV GenotypeConcordance -o %s", 1, - Arrays.asList("9a56c20a7b9a554a7b530f2cb1dd776d")); + Arrays.asList("9bbc762f459023af0480774eb2986af4")); executeTestParallel("testVEGenotypeConcordance" + vcfFile, spec); } @@ -298,14 +325,14 @@ public class VariantEvalIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec("-T VariantEval -R "+b37KGReference+" --eval " + 
variantEvalTestDataRoot + vcfFile + " -ped "+ variantEvalTestDataRoot + pedFile +" -noEV -EV MendelianViolationEvaluator -L 1:10109-10315 -o %s -mvq 0 -noST", 1, - Arrays.asList("66e72c887124f40933d32254b2dd44a3")); + Arrays.asList("ddcabc30c88a755a78100e30e0d491d2")); executeTestParallel("testVEMendelianViolationEvaluator" + vcfFile, spec); } @Test public void testCompVsEvalAC() { String extraArgs = "-T VariantEval -R "+b36KGReference+" -o %s -ST CpG -EV GenotypeConcordance --eval:evalYRI,VCF3 " + validationDataLocation + "yri.trio.gatk.ug.very.few.lines.vcf --comp:compYRI,VCF3 " + validationDataLocation + "yri.trio.gatk.fake.genotypes.ac.test.vcf"; - WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("fa13eb59892892c07711c6ffe31bf870")); + WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("bb076f7239039191fde883c5e68483ea")); executeTestParallel("testCompVsEvalAC",spec); } @@ -323,7 +350,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { @Test public void testCompOverlap() { String extraArgs = "-T VariantEval -R " + b37KGReference + " -L " + validationDataLocation + "VariantEval/pacbio.hg19.intervals --comp:comphapmap " + comparisonDataLocation + "Validated/HapMap/3.3/genotypes_r27_nr.b37_fwd.vcf --eval " + validationDataLocation + "VariantEval/pacbio.ts.recalibrated.vcf -noEV -EV CompOverlap -sn NA12878 -noST -ST Novelty -o %s"; - WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("9002023b8aa8fc2c9aac58b8a79bca1e")); + WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("98f9c2f5fef43dbda688d32360908615")); executeTestParallel("testCompOverlap",spec); } @@ -335,7 +362,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { " --dbsnp " + b37dbSNP132 + " --eval:evalBI " + validationDataLocation + "VariantEval/ALL.20100201.chr20.bi.sites.vcf" + " -noST -ST Novelty -o %s"; - WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("e42cda858649a35eaa9d14ea2d70a956")); 
+ WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("9d24f34d94d74417e00e3b7bcf84650f")); executeTestParallel("testEvalTrackWithoutGenotypes",spec); } @@ -347,7 +374,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { " --eval:evalBI " + validationDataLocation + "VariantEval/ALL.20100201.chr20.bi.sites.vcf" + " --eval:evalBC " + validationDataLocation + "VariantEval/ALL.20100201.chr20.bc.sites.vcf" + " -noST -ST Novelty -o %s"; - WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("9561cb4c7aa36dcf30ba253385299859")); + WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("7329b0bc73c9ccaf5facd754f3410c38")); executeTestParallel("testMultipleEvalTracksWithoutGenotypes",spec); } @@ -364,13 +391,13 @@ public class VariantEvalIntegrationTest extends WalkerTest { " -noST -noEV -ST Novelty -EV CompOverlap" + " -o %s"; - WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("61052c19211e7eb61fbbb62db5e40b56")); + WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("d0218c5435c8601f2355b7d183ab032f")); executeTestParallel("testMultipleCompTracks",spec); } @Test public void testPerSampleAndSubsettedSampleHaveSameResults1() { - String md5 = "0edded1cd578db62fa296c99c34a909d"; + String md5 = "b5cd5c286d459b8edd4ca54320e560a3"; WalkerTestSpec spec = new WalkerTestSpec( buildCommandLine( @@ -412,24 +439,69 @@ public class VariantEvalIntegrationTest extends WalkerTest { @Test public void testAlleleCountStrat() { WalkerTestSpec spec = new WalkerTestSpec( - buildCommandLine( - "-T VariantEval", - "-R " + b37KGReference, - "--dbsnp " + b37dbSNP132, - "--eval " + fundamentalTestSNPsVCF, - "-noEV", - "-EV CountVariants", - "-noST", - "-ST AlleleCount", - "-L " + fundamentalTestSNPsVCF, - "-o %s" - ), - 1, - Arrays.asList("ee22604616b3e9fc48a6dcbbf73a056d") - ); + buildCommandLine( + "-T VariantEval", + "-R " + b37KGReference, + "--dbsnp " + b37dbSNP132, + "--eval " + fundamentalTestSNPsVCF, + 
"-noEV", + "-EV CountVariants", + "-noST", + "-ST AlleleCount", + "-L " + fundamentalTestSNPsVCF, + "-o %s" + ), + 1, + Arrays.asList("1198bfea6183bd43219071a84c79a386") + ); executeTest("testAlleleCountStrat", spec); } + @Test + public void testMultipleEvalTracksAlleleCountWithMerge() { + WalkerTestSpec spec = new WalkerTestSpec( + buildCommandLine( + "-T VariantEval", + "-R " + b37KGReference, + "--dbsnp " + b37dbSNP132, + "--eval " + fundamentalTestSNPsSplit1of2VCF, + "--eval " + fundamentalTestSNPsSplit2of2VCF, + "--mergeEvals", + "-noEV", + "-EV CountVariants", + "-noST", + "-ST AlleleCount", + "-L " + fundamentalTestSNPsVCF, + "-o %s" + ), + 1, + Arrays.asList("1198bfea6183bd43219071a84c79a386") + ); + executeTest("testMultipleEvalTracksAlleleCountWithMerge", spec); + } + + @Test + public void testMultipleEvalTracksAlleleCountWithoutMerge() { + WalkerTestSpec spec = new WalkerTestSpec( + buildCommandLine( + "-T VariantEval", + "-R " + b37KGReference, + "--dbsnp " + b37dbSNP132, + "--eval " + fundamentalTestSNPsSplit1of2VCF, + "--eval " + fundamentalTestSNPsSplit2of2VCF, + //"--mergeEvals", No merge with AC strat ==> error + "-noEV", + "-EV CountVariants", + "-noST", + "-ST AlleleCount", + "-L " + fundamentalTestSNPsVCF + ), + 0, + UserException.class + ); + executeTest("testMultipleEvalTracksAlleleCountWithoutMerge", spec); + } + @Test public void testIntervalStrat() { WalkerTestSpec spec = new WalkerTestSpec( @@ -446,7 +518,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("240369cd651c77e05e8a6659f4a6237e") + Arrays.asList("6decba040051daafad4ecad5a411e1e1") ); executeTest("testIntervalStrat", spec); } @@ -463,8 +535,60 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("397b0e77459b9b69d2e0dd1dac320c3c") + Arrays.asList("aad01b26198b30da5d59a05c08d863bb") ); executeTest("testModernVCFWithLargeIndels", spec); } + + @Test + public void testStandardIndelEval() { + 
WalkerTestSpec spec = new WalkerTestSpec( + buildCommandLine( + "-T VariantEval", + "-R " + b37KGReference, + "-eval " + validationDataLocation + "/NA12878.HiSeq.WGS.b37_decoy.indel.recalibrated.vcf", + "-L 20", + "-noST -ST Sample -ST OneBPIndel -ST TandemRepeat", + "-noEV -EV IndelSummary -EV IndelLengthHistogram", + "-gold " + validationDataLocation + "/Mills_and_1000G_gold_standard.indels.b37.sites.vcf", + "-D " + b37dbSNP132, + "-o %s" + ), + 1, + Arrays.asList("4fa2557663ef8fb4cdeecd667791985c") + ); + executeTest("testStandardIndelEval", spec); + } + + + @Test() + public void testIncompatibleEvalAndStrat() { + WalkerTestSpec spec = new WalkerTestSpec( + buildCommandLine( + "-T VariantEval", + "-R " + b37KGReference, + "-eval " + validationDataLocation + "/NA12878.HiSeq.WGS.b37_decoy.indel.recalibrated.vcf", + "-L 20 -noST -ST AlleleCount -noEV -EV VariantSummary" + ), + 0, + UserException.class); + executeTest("testIncompatibleEvalAndStrat", spec); + } + + public void testIncludingAC0(boolean includeAC0, final String md5) { + WalkerTestSpec spec = new WalkerTestSpec( + buildCommandLine( + "-T VariantEval", + "-R " + b37KGReference, + "-eval " + testDir + "/ac0.vcf", + "-L 20:81006 -noST -noEV -EV VariantSummary -o %s" + (includeAC0 ? 
" -keepAC0" : "") + ), + 1, + Arrays.asList(md5)); + executeTest("testIncludingAC0 keep ac 0 = " + includeAC0, spec); + } + + @Test public void testWithAC0() { testIncludingAC0(true, "0ed2c8e4b4e06973a06838bc930a132d"); } + @Test public void testWithoutAC0() { testIncludingAC0(false, "79d28ddd0ab9584776b6cbefe48331df"); } + } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalkerUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalkerUnitTest.java new file mode 100644 index 000000000..ca06ca699 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalkerUnitTest.java @@ -0,0 +1,277 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +// our package +package org.broadinstitute.sting.gatk.walkers.varianteval; + + +// the imports for unit testing. 
+ +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.varianteval.evaluators.VariantEvaluator; +import org.broadinstitute.sting.gatk.walkers.varianteval.stratifications.VariantStratifier; +import org.broadinstitute.sting.gatk.walkers.varianteval.stratifications.manager.StratificationManager; +import org.broadinstitute.sting.gatk.walkers.varianteval.util.EvaluationContext; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.variantcontext.Allele; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder; +import org.testng.Assert; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.*; + + +public class VariantEvalWalkerUnitTest extends BaseTest { + VariantEvalWalker VEwalker; + VariantContext eval; + + + @BeforeMethod + public void init() { + VEwalker = new VariantEvalWalker(); + eval = new VariantContextBuilder("x", "chr1", 1, 1, Collections.singleton(Allele.create("A", true))).make(); + } + + // -------------------------------------------------------------------------------- + // + // Test stratifications / evaluations + // + // -------------------------------------------------------------------------------- + + private class StratifiedEvalTestProvider extends TestDataProvider { + final List stratificationObjects = new ArrayList(); + final Set> evaluationObjects = new HashSet>(); + final List expectedCounts; + final int maxI; + + /** + * + * @param maxI test integers from 1 ... maxI + * @param expectedCounts the expected number of integers from 1 ... 
maxI divisible by each combination, in order, of allStates + * @param allStates all stratification tests, in order + */ + public StratifiedEvalTestProvider(int maxI, + final List expectedCounts, + final List ... allStates) { + super(StratifiedEvalTestProvider.class); + + this.maxI = maxI; + this.expectedCounts = expectedCounts; + this.evaluationObjects.add(CounterEval.class); + + String stateName = ""; + for ( List states : allStates ) { + stratificationObjects.add(new IntegerStratifier(states)); + stateName = stateName + Utils.join(",", states) + " "; + } + + setName(String.format("maxI=%d expectedCounts=%s states=%s", maxI, Utils.join(",", expectedCounts), stateName)); + } + } + + /** + * Test stratifier -> holds a list of integers, and the states are if the integer value of evalName is divisable + * by that number + */ + public static class IntegerStratifier extends VariantStratifier { + final List integers; + + private IntegerStratifier(final List integers) { + this.integers = integers; + initialize(); + } + + @Override + public void initialize() { + states.addAll(integers); + } + + @Override + public List getRelevantStates(final ReferenceContext ref, final RefMetaDataTracker tracker, final VariantContext comp, final String compName, final VariantContext eval, final String evalName, final String sampleName) { + int i = Integer.valueOf(evalName); // a terrible hack, but we can now provide accessible states + List states = new ArrayList(); + for ( int state : integers ) + if ( i % state == 0 ) + states.add(state); + return states; + } + } + + /** + * Test evaluator -> just counts the number of calls to update1 + */ + public static class CounterEval extends VariantEvaluator { + public int count = 0; + + @Override public int getComparisonOrder() { return 1; } + + @Override + public void update1(final VariantContext eval, final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) { + count++; + } + + @Override + public boolean 
supportsCombine() { + return true; + } + + @Override + public void combine(final VariantEvaluator other) { + this.count += ((CounterEval)other).count; + } + } + + private void initialize(StratifiedEvalTestProvider cfg) { + VEwalker.createStratificationStates(cfg.stratificationObjects, cfg.evaluationObjects); + + final RefMetaDataTracker tracker = new RefMetaDataTracker(); + final ReferenceContext ref = null; + final VariantContext comp = null; + final String compName = null, sampleName = null; + + // increment eval counts for each stratification of divisors of i from from 1...maxI + for ( int i = 1; i <= cfg.maxI; i++ ) { + final String evalName = String.valueOf(i); // terrible hack to stratify by divisor + for ( EvaluationContext nec : VEwalker.getEvaluationContexts(tracker, ref, eval, evalName, comp, compName, sampleName) ) { + synchronized (nec) { + nec.apply(tracker, ref, null, comp, eval); + } + } + } + } + + @DataProvider(name = "StratifiedEvalTestProvider") + public Object[][] makeStratifiedEvalTestProvider() { + + new StratifiedEvalTestProvider(4, // test 1, 2, 3, 4 + Arrays.asList(4, 2), // 4 divisible by 1, 2 by 2 + Arrays.asList(1, 2)); + + new StratifiedEvalTestProvider(6, // test 1, 2, 3, 4, 5, 6 + Arrays.asList(6, 3, 2), // 6 divisible by 1, 3 by 2, 2 by 3 + Arrays.asList(1, 2, 3)); + + // test that some states can be empty -- does this work in VE? 
+ new StratifiedEvalTestProvider(6, + Arrays.asList(3, 2), + Arrays.asList(2, 3)); + + // test a single stratification + new StratifiedEvalTestProvider(6, + Arrays.asList(3), + Arrays.asList(2)); + + // test a meaningless state + new StratifiedEvalTestProvider(4, // test 1, 2, 3, 4 + Arrays.asList(4, 2), // 4 divisible by 1, 2 by 2 + Arrays.asList(1, 2), Arrays.asList(1)); + + // test a adding a state that divides space in half + new StratifiedEvalTestProvider(4, + Arrays.asList(2, 2), + Arrays.asList(1, 2), Arrays.asList(2)); + + // test pairs of strats + new StratifiedEvalTestProvider(12, + Arrays.asList(4, 3, 2, 3), + Arrays.asList(1, 2), Arrays.asList(3, 4)); + + return StratifiedEvalTestProvider.getTests(StratifiedEvalTestProvider.class); + } + + /** + * Ensures that counting and stratifications all are working properly by iterating + * over integers 1...cfg.N and stratify according to cfg, and that the counts in + * each bin are as expected. + * + * @param cfg + */ + @Test(dataProvider = "StratifiedEvalTestProvider") + public void testBasicOperation(StratifiedEvalTestProvider cfg) { + initialize(cfg); + checkStratificationCountsAreExpected(VEwalker.stratManager, cfg.expectedCounts); + } + + private final void checkStratificationCountsAreExpected(final StratificationManager manager, + final List expectedCounts) { + for ( int key = 0; key < manager.size(); key++ ) { + final String stratStateString = manager.getStratsAndStatesStringForKey(key); + final EvaluationContext nec = manager.get(key); + + for ( final VariantEvaluator ve : nec.getVariantEvaluators() ) { + // test for count here + final CounterEval counterEval = (CounterEval)ve; + final int expected = expectedCounts.get(key); + Assert.assertEquals(counterEval.count, expected, "Count seen of " + counterEval.count + " not expected " + expected + " at " + stratStateString); + } + } + } + + /** + * A derived test on testBasicOperation that checks that combining stratifications + * works as expected by 
ensuring the results are the same when the remapped + * strats are the identity map (A -> A, B -> B, etc) + */ + @Test(dataProvider = "StratifiedEvalTestProvider", dependsOnMethods = {"testBasicOperation"}) + public void testIdentityCombine(StratifiedEvalTestProvider cfg) { + for ( int i = 0; i < cfg.stratificationObjects.size(); i++ ) { + initialize(cfg); + final VariantStratifier toReplace = cfg.stratificationObjects.get(i); + final VariantStratifier newStrat = cfg.stratificationObjects.get(i); + final Map remappedStates = Utils.makeIdentityFunctionMap(newStrat.getAllStates()); + StratificationManager combined = + VEwalker.stratManager.combineStrats(toReplace, newStrat, EvaluationContext.COMBINER, remappedStates); + checkStratificationCountsAreExpected(combined, cfg.expectedCounts); + } + } + +// /** +// * A derived test on testBasicOperation that checks that combining stratifications +// * works as expected. We look into cfg, and if there are multiple states we create +// * dynamically create a combinations of the stratifications, and ensure that the +// * combined results are as we expected. 
+// */ +// @Test(dataProvider = "StratifiedEvalTestProvider", dependsOnMethods = {"testBasicOperation"}) +// public void testCombinedEachStrat(StratifiedEvalTestProvider cfg) { +// for ( int i = 0; i < cfg.stratificationObjects.size(); i++ ) { +// initialize(cfg); +// final VariantStratifier toReplace = cfg.stratificationObjects.get(i); +// +// // TODO -- replace this code with something that combines values in strat +// final VariantStratifier newStrat = cfg.stratificationObjects.get(i); +// final Map remappedStates = Utils.makeIdentityFunctionMap(newStrat.getAllStates()); +// final List expected = cfg.expectedCounts; +// +// StratificationManager combined = +// VEwalker.stratManager.combineStrats(toReplace, newStrat, EvaluationContext.COMBINER, remappedStates); +// checkStratificationCountsAreExpected(combined, expected); +// } +// } +} \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/manager/StratificationManagerUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/manager/StratificationManagerUnitTest.java new file mode 100644 index 000000000..2b6f5c712 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/manager/StratificationManagerUnitTest.java @@ -0,0 +1,252 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of 
the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +// our package +package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications.manager; + + +// the imports for unit testing. + + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.collections.Pair; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.FileNotFoundException; +import java.util.*; + + +public class StratificationManagerUnitTest extends BaseTest { + @BeforeClass + public void init() throws FileNotFoundException { + } + + // -------------------------------------------------------------------------------- + // + // Basic tests Provider + // + // -------------------------------------------------------------------------------- + + private class StratificationStatesTestProvider extends TestDataProvider { + final List> allStates = new ArrayList>(); + final List asSetOfStates = new ArrayList(); + final int nStates; + + public StratificationStatesTestProvider(final List ... 
allStates) { + super(StratificationStatesTestProvider.class); + + for ( List states : allStates ) { + this.allStates.add(new ArrayList(states)); + } + + for ( List states : this.allStates ) { + asSetOfStates.add(new IntegerStratifier(states)); + } + this.nStates = Utils.nCombinations(allStates); + + setName(getName()); + } + + private String getName() { + StringBuilder b = new StringBuilder(); + int c = 1; + for ( List state : allStates ) + b.append(String.format("%d = [%s] ", c++, Utils.join(",", state))); + return b.toString(); + } + + public List getStateSpaceList() { + return asSetOfStates; + } + + public ArrayList values() { + final ArrayList l = new ArrayList(); + for ( int i = 0; i < nStates; i++ ) + l.add(i); + return l; + } + + public Queue> getAllCombinations() { + return getAllCombinations(new LinkedList>(allStates)); + } + + private Queue> getAllCombinations(Queue> states) { + if ( states.isEmpty() ) + return new LinkedList>(); + else { + List head = states.poll(); + Queue> substates = getAllCombinations(states); + Queue> newStates = new LinkedList>(); + for ( final Object e : head) { + if ( substates.isEmpty() ) { + newStates.add(new LinkedList(Collections.singleton(e))); + } else { + for ( final List state : substates ) { + List newState = new LinkedList(); + newState.add(e); + newState.addAll(state); + newStates.add(newState); + } + } + } + return newStates; + } + } + } + + private class IntegerStratifier implements Stratifier { + final List integers; + + private IntegerStratifier(final List integers) { + this.integers = integers; + } + + @Override + public List getAllStates() { + return integers; + } + } + + @DataProvider(name = "StratificationStatesTestProvider") + public Object[][] makeStratificationStatesTestProvider() { + new StratificationStatesTestProvider(Arrays.asList(0)); + new StratificationStatesTestProvider(Arrays.asList(0, 1)); + new StratificationStatesTestProvider(Arrays.asList(0, 1), Arrays.asList(2, 3)); + new 
StratificationStatesTestProvider(Arrays.asList(0, 1), Arrays.asList(2, 3), Arrays.asList(4, 5)); + new StratificationStatesTestProvider(Arrays.asList(0, 1), Arrays.asList(2, 3, 4), Arrays.asList(5, 6)); + new StratificationStatesTestProvider(Arrays.asList(0, 1), Arrays.asList(2, 3, 4, 5), Arrays.asList(6)); + new StratificationStatesTestProvider(Arrays.asList(0, 1), Arrays.asList(2, 3, 4, 5), Arrays.asList(6, 7)); + new StratificationStatesTestProvider(Arrays.asList(0, 1), Arrays.asList(2, 3), Arrays.asList(4, 5), Arrays.asList(6, 7)); + return StratificationStatesTestProvider.getTests(StratificationStatesTestProvider.class); + } + + private final StratificationManager createManager(StratificationStatesTestProvider cfg) { + final StratificationManager manager = new StratificationManager(cfg.getStateSpaceList()); + List values = cfg.values(); + for ( int i = 0; i < cfg.nStates; i++ ) + manager.set(i, values.get(i)); + + Assert.assertEquals(manager.values(), values, "Values not equal"); + + return manager; + } + + @Test(dataProvider = "StratificationStatesTestProvider") + public void testLeafCount(StratificationStatesTestProvider cfg) { + final StratificationManager stratificationManager = createManager(cfg); + + Assert.assertEquals(stratificationManager.size(), cfg.nStates); + + int nLeafs = 0; + for ( final StratNode node : stratificationManager.getRoot() ) { + if ( node.isLeaf() ) + nLeafs++; + } + Assert.assertEquals(nLeafs, cfg.nStates, "Unexpected number of leaves"); + } + + @Test(dataProvider = "StratificationStatesTestProvider") + public void testKeys(StratificationStatesTestProvider cfg) { + final StratificationManager stratificationManager = createManager(cfg); + final Set seenKeys = new HashSet(cfg.nStates); + for ( final StratNode node : stratificationManager.getRoot() ) { + if ( node.isLeaf() ) { + Assert.assertFalse(seenKeys.contains(node.getKey()), "Already seen the key"); + seenKeys.add(node.getKey()); + } + } + } + + @Test(dataProvider = 
"StratificationStatesTestProvider") + public void testFindSingleKeys(StratificationStatesTestProvider cfg) { + final StratificationManager stratificationManager = createManager(cfg); + final Set seenKeys = new HashSet(cfg.nStates); + for ( List state : cfg.getAllCombinations() ) { + final int key = stratificationManager.getKey(state); + Assert.assertFalse(seenKeys.contains(key), "Already saw state mapping to this key"); + Assert.assertTrue(stratificationManager.containsKey(state)); + seenKeys.add(key); + + // test value + Assert.assertEquals(stratificationManager.get(key), cfg.values().get(key)); + Assert.assertEquals(stratificationManager.get(state), cfg.values().get(key)); + + state.set(0, 12345); // not present + Assert.assertEquals(stratificationManager.getKey(state), -1); + Assert.assertFalse(stratificationManager.containsKey(state)); + } + } + + @Test(dataProvider = "StratificationStatesTestProvider") + public void testFindMultipleKeys(StratificationStatesTestProvider cfg) { + final StratificationManager stratificationManager = createManager(cfg); + final List> states = new ArrayList>(cfg.allStates); + final Set keys = stratificationManager.getKeys(states); + Assert.assertEquals(keys.size(), cfg.nStates, "Find all states didn't find all of the expected unique keys"); + + final Queue> combinations = cfg.getAllCombinations(); + while ( ! 
combinations.isEmpty() ) { + List first = combinations.poll(); + List second = combinations.peek(); + if ( second != null ) { + List> combined = StratificationManager.combineStates(first, second); + int nExpectedKeys = Utils.nCombinations(combined); + + final int key1 = stratificationManager.getKey(first); + final int key2 = stratificationManager.getKey(second); + final Set keysCombined = stratificationManager.getKeys(combined); + + Assert.assertTrue(keysCombined.contains(key1), "couldn't find key in data set"); + Assert.assertTrue(keysCombined.contains(key2), "couldn't find key in data set"); + + Assert.assertEquals(keysCombined.size(), nExpectedKeys); + } + } + } + + @Test(dataProvider = "StratificationStatesTestProvider") + public void testMapSet(StratificationStatesTestProvider cfg) { + final StratificationManager stratificationManager = createManager(cfg); + stratificationManager.set(0, -1); + Assert.assertEquals((int)stratificationManager.get(0), -1); + } + + @Test(dataProvider = "StratificationStatesTestProvider") + public void testStratifierByKey(StratificationStatesTestProvider cfg) { + final StratificationManager manager = createManager(cfg); + for ( int key = 0; key < cfg.nStates; key++ ) { + List> stratsAndStates = manager.getStratsAndStatesForKey(key); + final List strats = manager.getStatesForKey(key); + Assert.assertEquals((int)manager.get(strats), key, "Key -> strats -> key failed to return same key"); + + for ( int i = 0; i < strats.size(); i++ ) { + Assert.assertEquals(stratsAndStates.get(i).getSecond(), strats.get(i), "Strats and StratsAndStates differ"); + } + } + } +} \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java index c81891ac6..879a5bfa3 100755 --- 
a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java @@ -27,7 +27,7 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { VRTest lowPass = new VRTest("phase1.projectConsensus.chr20.raw.snps.vcf", "0ddd1e0e483d2eaf56004615cea23ec7", // tranches - "58780f63182e139fdbe17f6c18b5b774", // recal file + "f8e21a1987960b950db1f0d98be45352", // recal file "f67d844b6252a55452cf4167b77530b1"); // cut VCF @DataProvider(name = "VRTest") @@ -73,9 +73,9 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { } VRTest indel = new VRTest("combined.phase1.chr20.raw.indels.sites.vcf", - "6d7ee4cb651c8b666e4a4523363caaff", // tranches - "4759b111a5aa53975d46e0f22c7983bf", // recal file - "5d7e07d8813db96ba3f3dfe4737f83d1"); // cut VCF + "da4458d05f6396f5c4ab96f274e5ccdc", // tranches + "cf380d9b0ae04c8918be8425f82035b4", // recal file + "b00e5e5a6807df8ed1682317948e8a6d"); // cut VCF @DataProvider(name = "VRIndelTest") public Object[][] createData2() { @@ -118,5 +118,21 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { Arrays.asList(params.cutVCFMD5)); executeTest("testApplyRecalibrationIndel-"+params.inVCF, spec); } + + @Test + public void testApplyRecalibrationSnpAndIndelTogether() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-R " + b37KGReference + + " -T ApplyRecalibration" + + " -L 20:1000100-1000500" + + " -mode BOTH" + + " -NO_HEADER" + + " -input " + validationDataLocation + "VQSR.mixedTest.input" + + " -o %s" + + " -tranchesFile " + validationDataLocation + "VQSR.mixedTest.tranches" + + " -recalFile " + validationDataLocation + "VQSR.mixedTest.recal", + Arrays.asList("08060b7f5c9cf3bb1692b50c58fd5a4b")); + executeTest("testApplyRecalibrationSnpAndIndelTogether", spec); + } 
} diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java index 900e3d489..973588cf0 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java @@ -163,4 +163,16 @@ public class SelectVariantsIntegrationTest extends WalkerTest { executeTest("testParallelization (4 threads)--" + testfile, spec); } + + @Test + public void testSelectFromMultiAllelic() { + String testfile = validationDataLocation + "multi-allelic.bi-allelicInGIH.vcf"; + String samplesFile = validationDataLocation + "GIH.samples.list"; + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants -R " + b37KGReference + " -o %s -NO_HEADER -sf " + samplesFile + " --excludeNonVariants --variant " + testfile, + 1, + Arrays.asList("3fb50cc1c955491048108956d7087c35") + ); + executeTest("test select from multi allelic with excludeNonVariants --" + testfile, spec); + } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VCFStreamingIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VCFStreamingIntegrationTest.java index 16b6c97d0..a3dae8432 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VCFStreamingIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VCFStreamingIntegrationTest.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011, The Broad Institute + * Copyright (c) 2012, The Broad Institute * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation @@ -98,7 +98,7 @@ public class VCFStreamingIntegrationTest extends WalkerTest { " -EV CompOverlap -noEV -noST" + " -o %s", 1, 
- Arrays.asList("addf5f4596ddacef40808f6d3d281111") + Arrays.asList("3212b375b8c440abe436be42ec7e1524") ); executeTest("testVCFStreamingChain", selectTestSpec); diff --git a/public/java/test/org/broadinstitute/sting/utils/BitSetUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/BitSetUtilsUnitTest.java new file mode 100644 index 000000000..94e2dbf54 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/BitSetUtilsUnitTest.java @@ -0,0 +1,75 @@ +package org.broadinstitute.sting.utils; + +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.Test; + +import java.util.Random; + +/** + * @author Mauricio Carneiro + * @since 3/5/12 + */ + +public class BitSetUtilsUnitTest { + private static int RANDOM_NUMBERS_TO_TRY = 87380; + private static Random random; + + @BeforeClass + public void init() { + random = GenomeAnalysisEngine.getRandomGenerator(); + } + + @Test(enabled = true) + public void testLongBitSet() { + long[] numbers = {0L, 1L, 428L, 65536L, 239847L, 4611686018427387903L, Long.MAX_VALUE, Long.MIN_VALUE, -1L, -2L, -7L, -128L, -65536L, -100000L}; + for (long n : numbers) + Assert.assertEquals(BitSetUtils.longFrom(BitSetUtils.bitSetFrom(n)), n); + + for (int i = 0; i < RANDOM_NUMBERS_TO_TRY; i++) { + long n = random.nextLong(); + Assert.assertEquals(BitSetUtils.longFrom(BitSetUtils.bitSetFrom(n)), n); // Because class Random uses a seed with only 48 bits, this algorithm will not return all possible long values. 
+ } + } + + @Test(enabled = true) + public void testShortBitSet() { + short[] numbers = {0, 1, 428, 25934, 23847, 16168, Short.MAX_VALUE, Short.MIN_VALUE, -1, -2, -7, -128, -12312, -31432}; + for (long n : numbers) + Assert.assertEquals(BitSetUtils.shortFrom(BitSetUtils.bitSetFrom(n)), n); + + for (int i = 0; i < RANDOM_NUMBERS_TO_TRY; i++) { + short n = (short) random.nextInt(); + Assert.assertEquals(BitSetUtils.shortFrom(BitSetUtils.bitSetFrom(n)), n); + } + } + + @Test(enabled = true) + public void testDNAAndBitSetConversion() { + String[] dna = {"AGGTGTTGT", "CCCCCCCCCCCCCC", "GGGGGGGGGGGGGG", "TTTTTTTTTTTTTT", "GTAGACCGATCTCAGCTAGT", "AACGTCAATGCAGTCAAGTCAGACGTGGGTT", "TTTTTTTTTTTTTTTTTTTTTTTTTTTTTT", "TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT"}; + + // Test all contexts of size 1-8. + for (long n = 0; n < RANDOM_NUMBERS_TO_TRY; n++) + Assert.assertEquals(BitSetUtils.longFrom(BitSetUtils.bitSetFrom(BitSetUtils.dnaFrom(BitSetUtils.bitSetFrom(n)))), n); + + // Test the special cases listed in the dna array + for (String d : dna) + Assert.assertEquals(BitSetUtils.dnaFrom(BitSetUtils.bitSetFrom(d)), d); + } + + @Test(enabled = true) + public void testNumberOfBitsToRepresent() { + Assert.assertEquals(BitSetUtils.numberOfBitsToRepresent(0), 0); // Make sure 0 elements need 0 bits to be represented + Assert.assertEquals(BitSetUtils.numberOfBitsToRepresent(1), 1); // Make sure 1 element needs 1 bit to be represented + Assert.assertEquals(BitSetUtils.numberOfBitsToRepresent(3), 2); // Make sure 3 elements need 2 bit to be represented + + for (int i = 1; i < 63; i++) { // Can't test i == 63 because n1 is a negative number + long n1 = 1L << i; + long n2 = Math.abs(random.nextLong()) % n1; + long n3 = n1 | n2; + Assert.assertEquals(BitSetUtils.numberOfBitsToRepresent(n3), (n3 == n1) ? 
i : i + 1); + Assert.assertEquals(BitSetUtils.numberOfBitsToRepresent(n1), i); + } + } +} diff --git a/public/java/test/org/broadinstitute/sting/utils/HaplotypeUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/HaplotypeUnitTest.java index 25bd7a2eb..87852f9ca 100644 --- a/public/java/test/org/broadinstitute/sting/utils/HaplotypeUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/HaplotypeUnitTest.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2010 The Broad Institute + * Copyright (c) 2012 The Broad Institute * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation @@ -99,8 +99,7 @@ public class HaplotypeUnitTest extends BaseTest { h1CigarList.add(new CigarElement(10, CigarOperator.I)); h1CigarList.add(new CigarElement(8, CigarOperator.M)); h1CigarList.add(new CigarElement(3, CigarOperator.D)); - h1CigarList.add(new CigarElement(7, CigarOperator.M)); - h1CigarList.add(new CigarElement(4, CigarOperator.M)); + h1CigarList.add(new CigarElement(7 + 4, CigarOperator.M)); final Cigar h1Cigar = new Cigar(h1CigarList); String h1bases = "AACTTTCG" + "CCGGCCGGCC" + "ATCGATCG" + "AGGGGGA" + "AGGC"; basicInsertTest("-", "ACTT", 1, h1Cigar, bases, h1bases); @@ -119,8 +118,7 @@ public class HaplotypeUnitTest extends BaseTest { h1CigarList.add(new CigarElement(10, CigarOperator.I)); h1CigarList.add(new CigarElement(8, CigarOperator.M)); h1CigarList.add(new CigarElement(3, CigarOperator.D)); - h1CigarList.add(new CigarElement(7, CigarOperator.M)); - h1CigarList.add(new CigarElement(4, CigarOperator.M)); + h1CigarList.add(new CigarElement(7 + 4, CigarOperator.M)); final Cigar h1Cigar = new Cigar(h1CigarList); String h1bases = "A" + "CGGCCGGCC" + "ATCGATCG" + "AGGGGGA" + "AGGC"; basicInsertTest("ACTT", "-", 1, h1Cigar, bases, h1bases); @@ -139,8 +137,7 @@ public class HaplotypeUnitTest extends BaseTest { h1CigarList.add(new CigarElement(10, CigarOperator.I)); h1CigarList.add(new 
CigarElement(8, CigarOperator.M)); h1CigarList.add(new CigarElement(3, CigarOperator.D)); - h1CigarList.add(new CigarElement(7, CigarOperator.M)); - h1CigarList.add(new CigarElement(4, CigarOperator.M)); + h1CigarList.add(new CigarElement(7 + 4, CigarOperator.M)); final Cigar h1Cigar = new Cigar(h1CigarList); String h1bases = "AGCG" + "CCGGCCGGCC" + "ATCGATCG" + "AGGGGGA" + "AGGC"; basicInsertTest("T", "G", 1, h1Cigar, bases, h1bases); @@ -155,9 +152,10 @@ public class HaplotypeUnitTest extends BaseTest { final Haplotype h = new Haplotype(hap.getBytes()); final Allele h1refAllele = Allele.create(ref, true); final Allele h1altAllele = Allele.create(alt, false); - final Haplotype h1 = new Haplotype( h.insertAllele(h1refAllele, h1altAllele, loc - INDEL_PADDING_BASE, 0, cigar) ); + h.setAlignmentStartHapwrtRef(0); + h.setCigar(cigar); + final Haplotype h1 = new Haplotype( h.insertAllele(h1refAllele, h1altAllele, loc - INDEL_PADDING_BASE) ); final Haplotype h1expected = new Haplotype(newHap.getBytes()); Assert.assertEquals(h1, h1expected); - } } diff --git a/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java index 1ba6c74d4..04b0199d8 100755 --- a/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java @@ -25,7 +25,6 @@ package org.broadinstitute.sting.utils; - import org.broadinstitute.sting.BaseTest; import org.testng.Assert; import org.testng.annotations.BeforeClass; @@ -131,7 +130,8 @@ public class MathUtilsUnitTest extends BaseTest { int[] numbers = {1, 2, 4, 5, 3, 128, 25678, -24}; MathUtils.RunningAverage r = new MathUtils.RunningAverage(); - for (int i = 0; i < numbers.length; i++) r.add((double) numbers[i]); + for (int i = 0; i < numbers.length; i++) + r.add((double) numbers[i]); Assert.assertEquals((long) numbers.length, r.observationCount()); Assert.assertTrue(r.mean() - 
3224.625 < 2e-10); @@ -223,37 +223,14 @@ public class MathUtilsUnitTest extends BaseTest { return set.isEmpty(); } - @Test(enabled = true) - public void testIntAndBitSetConversion() { - Assert.assertEquals(MathUtils.intFrom(MathUtils.bitSetFrom(428)), 428); - Assert.assertEquals(MathUtils.intFrom(MathUtils.bitSetFrom(239847)), 239847); - Assert.assertEquals(MathUtils.intFrom(MathUtils.bitSetFrom(12726)), 12726); - Assert.assertEquals(MathUtils.intFrom(MathUtils.bitSetFrom(0)), 0); - Assert.assertEquals(MathUtils.intFrom(MathUtils.bitSetFrom(1)), 1); - Assert.assertEquals(MathUtils.intFrom(MathUtils.bitSetFrom(65536)), 65536); - Assert.assertEquals(MathUtils.intFrom(MathUtils.bitSetFrom(Long.MAX_VALUE)), Long.MAX_VALUE); - } - - @Test(enabled = true) - public void testDNAAndBitSetConversion() { - Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("ACGT")), "ACGT"); - Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("AGGTGTTGT")), "AGGTGTTGT"); - Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("A")), "A"); - Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("C")), "C"); - Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("G")), "G"); - Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("T")), "T"); - Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("CC")), "CC"); - Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("AA")), "AA"); - Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("AAAA")), "AAAA"); - Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("CCCCCCCCCCCCCC")), "CCCCCCCCCCCCCC"); - Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("GGGGGGGGGGGGGG")), "GGGGGGGGGGGGGG"); - Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("TTTTTTTTTTTTTT")), "TTTTTTTTTTTTTT"); - Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("GTAGACCGATCTCAGCTAGT")), "GTAGACCGATCTCAGCTAGT"); - 
Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("AACGTCAATGCAGTCAAGTCAGACGTGGGTT")), "AACGTCAATGCAGTCAAGTCAGACGTGGGTT"); // testing max precision (length == 31) - } - @Test public void testApproximateLog10SumLog10() { + + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {0.0}), 0.0, 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-5.15}), -5.15, 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {130.0}), 130.0, 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-0.145}), -0.145, 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(0.0, 0.0), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0)), 1e-3); Assert.assertEquals(MathUtils.approximateLog10SumLog10(-1.0, 0.0), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, 0.0)), 1e-3); Assert.assertEquals(MathUtils.approximateLog10SumLog10(0.0, -1.0), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, -1.0)), 1e-3); @@ -266,55 +243,94 @@ public class MathUtilsUnitTest extends BaseTest { Assert.assertEquals(MathUtils.approximateLog10SumLog10(-29.1, -27.6), Math.log10(Math.pow(10.0, -29.1) + Math.pow(10.0, -27.6)), 1e-3); Assert.assertEquals(MathUtils.approximateLog10SumLog10(-0.12345, -0.23456), Math.log10(Math.pow(10.0, -0.12345) + Math.pow(10.0, -0.23456)), 1e-3); Assert.assertEquals(MathUtils.approximateLog10SumLog10(-15.7654, -17.0101), Math.log10(Math.pow(10.0, -15.7654) + Math.pow(10.0, -17.0101)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-0.12345, Double.NEGATIVE_INFINITY), -0.12345, 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-15.7654, Double.NEGATIVE_INFINITY), -15.7654, 1e-3); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{0.0, 0.0}), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0)), 1e-3); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{-1.0, 0.0}), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, 
0.0)), 1e-3); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{0.0, -1.0}), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, -1.0)), 1e-3); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{-2.2, -3.5}), Math.log10(Math.pow(10.0, -2.2) + Math.pow(10.0, -3.5)), 1e-3); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{-1.0, -7.1}), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, -7.1)), 1e-3); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{5.0, 6.2}), Math.log10(Math.pow(10.0, 5.0) + Math.pow(10.0, 6.2)), 1e-3); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{38.1, 16.2}), Math.log10(Math.pow(10.0, 38.1) + Math.pow(10.0, 16.2)), 1e-3); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{-38.1, 6.2}), Math.log10(Math.pow(10.0, -38.1) + Math.pow(10.0, 6.2)), 1e-3); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{-19.1, -37.1}), Math.log10(Math.pow(10.0, -19.1) + Math.pow(10.0, -37.1)), 1e-3); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{-29.1, -27.6}), Math.log10(Math.pow(10.0, -29.1) + Math.pow(10.0, -27.6)), 1e-3); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{-0.12345, -0.23456}), Math.log10(Math.pow(10.0, -0.12345) + Math.pow(10.0, -0.23456)), 1e-3); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{-15.7654, -17.0101}), Math.log10(Math.pow(10.0, -15.7654) + Math.pow(10.0, -17.0101)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {0.0, 0.0}), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-1.0, 0.0}), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, 0.0)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {0.0, -1.0}), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, -1.0)), 1e-3); + 
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-2.2, -3.5}), Math.log10(Math.pow(10.0, -2.2) + Math.pow(10.0, -3.5)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-1.0, -7.1}), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, -7.1)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {5.0, 6.2}), Math.log10(Math.pow(10.0, 5.0) + Math.pow(10.0, 6.2)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {38.1, 16.2}), Math.log10(Math.pow(10.0, 38.1) + Math.pow(10.0, 16.2)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-38.1, 6.2}), Math.log10(Math.pow(10.0, -38.1) + Math.pow(10.0, 6.2)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-19.1, -37.1}), Math.log10(Math.pow(10.0, -19.1) + Math.pow(10.0, -37.1)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-29.1, -27.6}), Math.log10(Math.pow(10.0, -29.1) + Math.pow(10.0, -27.6)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-0.12345, -0.23456}), Math.log10(Math.pow(10.0, -0.12345) + Math.pow(10.0, -0.23456)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-15.7654, -17.0101}), Math.log10(Math.pow(10.0, -15.7654) + Math.pow(10.0, -17.0101)), 1e-3); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{0.0, 0.0, 0.0}), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0)), 1e-3); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{-1.0, 0.0, 0.0}), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0)), 1e-3); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{0.0, -1.0, -2.5}), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, -1.0) + Math.pow(10.0, -2.5)), 1e-3); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{-2.2, -3.5, -1.1}), 
Math.log10(Math.pow(10.0, -2.2) + Math.pow(10.0, -3.5) + Math.pow(10.0, -1.1)), 1e-3); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{-1.0, -7.1, 0.5}), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, -7.1) + Math.pow(10.0, 0.5)), 1e-3); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{5.0, 6.2, 1.3}), Math.log10(Math.pow(10.0, 5.0) + Math.pow(10.0, 6.2) + Math.pow(10.0, 1.3)), 1e-3); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{38.1, 16.2, 18.1}), Math.log10(Math.pow(10.0, 38.1) + Math.pow(10.0, 16.2) + Math.pow(10.0, 18.1)), 1e-3); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{-38.1, 6.2, 26.6}), Math.log10(Math.pow(10.0, -38.1) + Math.pow(10.0, 6.2) + Math.pow(10.0, 26.6)), 1e-3); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{-19.1, -37.1, -45.1}), Math.log10(Math.pow(10.0, -19.1) + Math.pow(10.0, -37.1) + Math.pow(10.0, -45.1)), 1e-3); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{-29.1, -27.6, -26.2}), Math.log10(Math.pow(10.0, -29.1) + Math.pow(10.0, -27.6) + Math.pow(10.0, -26.2)), 1e-3); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{-0.12345, -0.23456, -0.34567}), Math.log10(Math.pow(10.0, -0.12345) + Math.pow(10.0, -0.23456) + Math.pow(10.0, -0.34567)), 1e-3); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{-15.7654, -17.0101, -17.9341}), Math.log10(Math.pow(10.0, -15.7654) + Math.pow(10.0, -17.0101) + Math.pow(10.0, -17.9341)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {0.0, 0.0, 0.0}), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-1.0, 0.0, 0.0}), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {0.0, -1.0, -2.5}), 
Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, -1.0) + Math.pow(10.0, -2.5)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-2.2, -3.5, -1.1}), Math.log10(Math.pow(10.0, -2.2) + Math.pow(10.0, -3.5) + Math.pow(10.0, -1.1)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-1.0, -7.1, 0.5}), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, -7.1) + Math.pow(10.0, 0.5)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {5.0, 6.2, 1.3}), Math.log10(Math.pow(10.0, 5.0) + Math.pow(10.0, 6.2) + Math.pow(10.0, 1.3)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {38.1, 16.2, 18.1}), Math.log10(Math.pow(10.0, 38.1) + Math.pow(10.0, 16.2) + Math.pow(10.0, 18.1)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-38.1, 6.2, 26.6}), Math.log10(Math.pow(10.0, -38.1) + Math.pow(10.0, 6.2) + Math.pow(10.0, 26.6)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-19.1, -37.1, -45.1}), Math.log10(Math.pow(10.0, -19.1) + Math.pow(10.0, -37.1) + Math.pow(10.0, -45.1)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-29.1, -27.6, -26.2}), Math.log10(Math.pow(10.0, -29.1) + Math.pow(10.0, -27.6) + Math.pow(10.0, -26.2)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-0.12345, -0.23456, -0.34567}), Math.log10(Math.pow(10.0, -0.12345) + Math.pow(10.0, -0.23456) + Math.pow(10.0, -0.34567)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-15.7654, -17.0101, -17.9341}), Math.log10(Math.pow(10.0, -15.7654) + Math.pow(10.0, -17.0101) + Math.pow(10.0, -17.9341)), 1e-3); + + Assert.assertEquals(MathUtils.approximateLog10SumLog10(0.0, 0.0, 0.0), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-1.0, 0.0, 0.0), Math.log10(Math.pow(10.0, 
-1.0) + Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(0.0, -1.0, -2.5), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, -1.0) + Math.pow(10.0, -2.5)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-2.2, -3.5, -1.1), Math.log10(Math.pow(10.0, -2.2) + Math.pow(10.0, -3.5) + Math.pow(10.0, -1.1)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-1.0, -7.1, 0.5), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, -7.1) + Math.pow(10.0, 0.5)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(5.0, 6.2, 1.3), Math.log10(Math.pow(10.0, 5.0) + Math.pow(10.0, 6.2) + Math.pow(10.0, 1.3)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(38.1, 16.2, 18.1), Math.log10(Math.pow(10.0, 38.1) + Math.pow(10.0, 16.2) + Math.pow(10.0, 18.1)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-38.1, 6.2, 26.6), Math.log10(Math.pow(10.0, -38.1) + Math.pow(10.0, 6.2) + Math.pow(10.0, 26.6)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-19.1, -37.1, -45.1), Math.log10(Math.pow(10.0, -19.1) + Math.pow(10.0, -37.1) + Math.pow(10.0, -45.1)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-29.1, -27.6, -26.2), Math.log10(Math.pow(10.0, -29.1) + Math.pow(10.0, -27.6) + Math.pow(10.0, -26.2)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-0.12345, -0.23456, -0.34567), Math.log10(Math.pow(10.0, -0.12345) + Math.pow(10.0, -0.23456) + Math.pow(10.0, -0.34567)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-15.7654, -17.0101, -17.9341), Math.log10(Math.pow(10.0, -15.7654) + Math.pow(10.0, -17.0101) + Math.pow(10.0, -17.9341)), 1e-3); } @Test public void testNormalizeFromLog10() { - Assert.assertTrue(compareDoubleArrays(MathUtils.normalizeFromLog10(new double[]{0.0, 0.0, -1.0, -1.1, -7.8}, false, true), new double[]{0.0, 0.0, -1.0, -1.1, -7.8})); - 
Assert.assertTrue(compareDoubleArrays(MathUtils.normalizeFromLog10(new double[]{-1.0, -1.0, -1.0, -1.1, -7.8}, false, true), new double[]{0.0, 0.0, 0.0, -0.1, -6.8})); - Assert.assertTrue(compareDoubleArrays(MathUtils.normalizeFromLog10(new double[]{-10.0, -7.8, -10.5, -1.1, -10.0}, false, true), new double[]{-8.9, -6.7, -9.4, 0.0, -8.9})); + Assert.assertTrue(compareDoubleArrays(MathUtils.normalizeFromLog10(new double[] {0.0, 0.0, -1.0, -1.1, -7.8}, false, true), new double[] {0.0, 0.0, -1.0, -1.1, -7.8})); + Assert.assertTrue(compareDoubleArrays(MathUtils.normalizeFromLog10(new double[] {-1.0, -1.0, -1.0, -1.1, -7.8}, false, true), new double[] {0.0, 0.0, 0.0, -0.1, -6.8})); + Assert.assertTrue(compareDoubleArrays(MathUtils.normalizeFromLog10(new double[] {-10.0, -7.8, -10.5, -1.1, -10.0}, false, true), new double[] {-8.9, -6.7, -9.4, 0.0, -8.9})); - Assert.assertTrue(compareDoubleArrays(MathUtils.normalizeFromLog10(new double[]{-1.0, -1.0, -1.0, -1.0}), new double[]{0.25, 0.25, 0.25, 0.25})); - Assert.assertTrue(compareDoubleArrays(MathUtils.normalizeFromLog10(new double[]{-1.0, -3.0, -1.0, -1.0}), new double[]{0.1 * 1.0 / 0.301, 0.001 * 1.0 / 0.301, 0.1 * 1.0 / 0.301, 0.1 * 1.0 / 0.301})); - Assert.assertTrue(compareDoubleArrays(MathUtils.normalizeFromLog10(new double[]{-1.0, -3.0, -1.0, -2.0}), new double[]{0.1 * 1.0 / 0.211, 0.001 * 1.0 / 0.211, 0.1 * 1.0 / 0.211, 0.01 * 1.0 / 0.211})); + Assert.assertTrue(compareDoubleArrays(MathUtils.normalizeFromLog10(new double[] {-1.0, -1.0, -1.0, -1.0}), new double[] {0.25, 0.25, 0.25, 0.25})); + Assert.assertTrue(compareDoubleArrays(MathUtils.normalizeFromLog10(new double[] {-1.0, -3.0, -1.0, -1.0}), new double[] {0.1 * 1.0 / 0.301, 0.001 * 1.0 / 0.301, 0.1 * 1.0 / 0.301, 0.1 * 1.0 / 0.301})); + Assert.assertTrue(compareDoubleArrays(MathUtils.normalizeFromLog10(new double[] {-1.0, -3.0, -1.0, -2.0}), new double[] {0.1 * 1.0 / 0.211, 0.001 * 1.0 / 0.211, 0.1 * 1.0 / 0.211, 0.01 * 1.0 / 0.211})); + } + + @Test + public 
void testLog10sumLog10() { + final double log3 = 0.477121254719662; + Assert.assertEquals(MathUtils.compareDoubles(MathUtils.log10sumLog10(new double[]{0.0, 0.0, 0.0}), log3), 0); + Assert.assertEquals(MathUtils.compareDoubles(MathUtils.log10sumLog10(new double[] {0.0, 0.0, 0.0}, 0), log3), 0); + Assert.assertEquals(MathUtils.compareDoubles(MathUtils.log10sumLog10(new double[]{0.0, 0.0, 0.0}, 0, 3), log3), 0); + + final double log2 = 0.301029995663981; + Assert.assertEquals(MathUtils.compareDoubles(MathUtils.log10sumLog10(new double[] {0.0, 0.0, 0.0}, 0, 2), log2), 0); + Assert.assertEquals(MathUtils.compareDoubles(MathUtils.log10sumLog10(new double[] {0.0, 0.0, 0.0}, 0, 1), 0.0), 0); + } + + @Test + public void testDotProduct() { + Assert.assertEquals(MathUtils.dotProduct(new Double[]{-5.0,-3.0,2.0}, new Double[]{6.0,7.0,8.0}),-35.0,1e-3); + Assert.assertEquals(MathUtils.dotProduct(new Double[]{-5.0}, new Double[]{6.0}),-30.0,1e-3); + } + + @Test + public void testLogDotProduct() { + Assert.assertEquals(MathUtils.logDotProduct(new double[]{-5.0,-3.0,2.0}, new double[]{6.0,7.0,8.0}),10.0,1e-3); + Assert.assertEquals(MathUtils.logDotProduct(new double[]{-5.0}, new double[]{6.0}),1.0,1e-3); } /** * Private function used by testNormalizeFromLog10() */ private boolean compareDoubleArrays(double[] b1, double[] b2) { - if( b1.length != b2.length ) { + if (b1.length != b2.length) { return false; // sanity check } - for( int i=0; i < b1.length; i++ ){ - if ( MathUtils.compareDoubles(b1[i], b2[i]) != 0 ) + for (int i = 0; i < b1.length; i++) { + if (MathUtils.compareDoubles(b1[i], b2[i]) != 0) return false; } return true; diff --git a/public/java/test/org/broadinstitute/sting/utils/PairHMMUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/PairHMMUnitTest.java new file mode 100644 index 000000000..22bcb1bbf --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/PairHMMUnitTest.java @@ -0,0 +1,367 @@ +/* + * Copyright (c) 2012, The Broad Institute + 
* + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +// our package +package org.broadinstitute.sting.utils; + + +// the imports for unit testing. 
+ + +import org.broadinstitute.sting.BaseTest; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.*; + + +public class PairHMMUnitTest extends BaseTest { + final static boolean EXTENSIVE_TESTING = true; + PairHMM hmm = new PairHMM( false ); // reference implementation + PairHMM bandedHMM = new PairHMM( true ); // algorithm with banding + + // -------------------------------------------------------------------------------- + // + // Provider + // + // -------------------------------------------------------------------------------- + + private class BasicLikelihoodTestProvider extends TestDataProvider { + final String ref, read; + final byte[] refBasesWithContext, readBasesWithContext; + final int baseQual, insQual, delQual, gcp; + final int expectedQual; + final static String CONTEXT = "ACGTAATGACGATTGCA"; + final static String LEFT_FLANK = "GATTTATCATCGAGTCTGC"; + final static String RIGHT_FLANK = "CATGGATCGTTATCAGCTATCTCGAGGGATTCACTTAACAGTTTTA"; + + public BasicLikelihoodTestProvider(final String ref, final String read, final int baseQual, final int insQual, final int delQual, final int expectedQual, final int gcp) { + this(ref, read, baseQual, insQual, delQual, expectedQual, gcp, false, false); + } + + public BasicLikelihoodTestProvider(final String ref, final String read, final int baseQual, final int insQual, final int delQual, final int expectedQual, final int gcp, final boolean left, final boolean right) { + super(BasicLikelihoodTestProvider.class, String.format("ref=%s read=%s b/i/d/c quals = %d/%d/%d/%d l/r flank = %b/%b e[qual]=%d", ref, read, baseQual, insQual, delQual, gcp, left, right, expectedQual)); + this.baseQual = baseQual; + this.delQual = delQual; + this.insQual = insQual; + this.gcp = gcp; + this.read = read; + this.ref = ref; + this.expectedQual = expectedQual; + + refBasesWithContext = asBytes(ref, left, right); + readBasesWithContext = asBytes(read, false, false); + 
} + + public double expectedLogL() { + return expectedQual / -10.0; + } + + public double tolerance() { + return 0.1; // TODO FIXME arbitrary + } + + public double calcLogL() { + + double logL = hmm.computeReadLikelihoodGivenHaplotype( + refBasesWithContext, readBasesWithContext, + qualAsBytes(baseQual, false), qualAsBytes(insQual, true), qualAsBytes(delQual, true), + qualAsBytes(gcp, false)); + + return logL; + } + + private final byte[] asBytes(final String bases, final boolean left, final boolean right) { + return ( (left ? LEFT_FLANK : "") + CONTEXT + bases + CONTEXT + (right ? RIGHT_FLANK : "")).getBytes(); + } + + private byte[] qualAsBytes(final int phredQual, final boolean doGOP) { + final byte phredQuals[] = new byte[readBasesWithContext.length]; + // initialize everything to MASSIVE_QUAL so it cannot be moved by HMM + Arrays.fill(phredQuals, (byte)100); + + // update just the bases corresponding to the provided micro read with the quality scores + if( doGOP ) { + phredQuals[0 + CONTEXT.length()] = (byte)phredQual; + } else { + for ( int i = 0; i < read.length(); i++) + phredQuals[i + CONTEXT.length()] = (byte)phredQual; + } + + return phredQuals; + } + } + + final Random random = new Random(87865573); + private class BandedLikelihoodTestProvider extends TestDataProvider { + final String ref, read; + final byte[] refBasesWithContext, readBasesWithContext; + final int baseQual, insQual, delQual, gcp; + final int expectedQual; + final static String LEFT_CONTEXT = "ACGTAATGACGCTACATGTCGCCAACCGTC"; + final static String RIGHT_CONTEXT = "TACGGCTTCATATAGGGCAATGTGTGTGGCAAAA"; + final static String LEFT_FLANK = "GATTTATCATCGAGTCTGTT"; + final static String RIGHT_FLANK = "CATGGATCGTTATCAGCTATCTCGAGGGATTCACTTAACAGTTTCCGTA"; + final byte[] baseQuals, insQuals, delQuals, gcps; + + public BandedLikelihoodTestProvider(final String ref, final String read, final int baseQual, final int insQual, final int delQual, final int expectedQual, final int gcp) { + this(ref, read, 
baseQual, insQual, delQual, expectedQual, gcp, false, false); + } + + public BandedLikelihoodTestProvider(final String ref, final String read, final int baseQual, final int insQual, final int delQual, final int expectedQual, final int gcp, final boolean left, final boolean right) { + super(BandedLikelihoodTestProvider.class, String.format("BANDED: ref=%s read=%s b/i/d/c quals = %d/%d/%d/%d l/r flank = %b/%b e[qual]=%d", ref, read, baseQual, insQual, delQual, gcp, left, right, expectedQual)); + this.baseQual = baseQual; + this.delQual = delQual; + this.insQual = insQual; + this.gcp = gcp; + this.read = read; + this.ref = ref; + this.expectedQual = expectedQual; + + refBasesWithContext = asBytes(ref, left, right); + readBasesWithContext = asBytes(read, false, false); + baseQuals = qualAsBytes(baseQual); + insQuals = qualAsBytes(insQual); + delQuals = qualAsBytes(delQual); + gcps = qualAsBytes(gcp, false); + } + + public double expectedLogL() { + double logL = hmm.computeReadLikelihoodGivenHaplotype( + refBasesWithContext, readBasesWithContext, + baseQuals, insQuals, delQuals, gcps); + + return logL; + } + + public double tolerance() { + return 0.2; // TODO FIXME arbitrary + } + + public double calcLogL() { + + double logL = bandedHMM.computeReadLikelihoodGivenHaplotype( + refBasesWithContext, readBasesWithContext, + baseQuals, insQuals, delQuals, gcps); + + return logL; + } + + private final byte[] asBytes(final String bases, final boolean left, final boolean right) { + return ( (left ? LEFT_FLANK : "") + LEFT_CONTEXT + bases + RIGHT_CONTEXT + (right ? 
RIGHT_FLANK : "")).getBytes(); + } + + private byte[] qualAsBytes(final int phredQual) { + return qualAsBytes(phredQual, true); + } + + private byte[] qualAsBytes(final int phredQual, final boolean addRandom) { + final byte phredQuals[] = new byte[readBasesWithContext.length]; + Arrays.fill(phredQuals, (byte)phredQual); + if(addRandom) { + for( int iii = 0; iii < phredQuals.length; iii++) { + phredQuals[iii] = (byte) ((int) phredQuals[iii] + (random.nextInt(7) - 3)); + } + } + return phredQuals; + } + } + + @DataProvider(name = "BasicLikelihoodTestProvider") + public Object[][] makeBasicLikelihoodTests() { + // context on either side is ACGTTGCA REF ACGTTGCA + // test all combinations + final List baseQuals = EXTENSIVE_TESTING ? Arrays.asList(10, 20, 30, 40, 50) : Arrays.asList(30); + final List indelQuals = EXTENSIVE_TESTING ? Arrays.asList(20, 30, 40, 50) : Arrays.asList(40); + final List gcps = EXTENSIVE_TESTING ? Arrays.asList(10, 20, 30) : Arrays.asList(10); + final List sizes = EXTENSIVE_TESTING ? Arrays.asList(2,3,4,5,7,8,9,10,20) : Arrays.asList(2); + + for ( final int baseQual : baseQuals ) { + for ( final int indelQual : indelQuals ) { + for ( final int gcp : gcps ) { + + // test substitutions + for ( final byte refBase : BaseUtils.BASES ) { + for ( final byte readBase : BaseUtils.BASES ) { + final String ref = new String(new byte[]{refBase}); + final String read = new String(new byte[]{readBase}); + final int expected = refBase == readBase ? 0 : baseQual; + new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp); + } + } + + // test insertions and deletions + for ( final int size : sizes ) { + for ( final byte base : BaseUtils.BASES ) { + final int expected = indelQual + (size - 2) * gcp; + + for ( boolean insertionP : Arrays.asList(true, false)) { + final String small = Utils.dupString((char)base, 1); + final String big = Utils.dupString((char)base, size); + + final String ref = insertionP ? 
small : big; + final String read = insertionP ? big : small; + + new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp); + new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp, true, false); + new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp, false, true); + new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp, true, true); + } + } + } + } + } + } + + return BasicLikelihoodTestProvider.getTests(BasicLikelihoodTestProvider.class); + } + + @Test(dataProvider = "BasicLikelihoodTestProvider", enabled = true) + public void testBasicLikelihoods(BasicLikelihoodTestProvider cfg) { + double calculatedLogL = cfg.calcLogL(); + double expectedLogL = cfg.expectedLogL(); + logger.warn(String.format("Test: logL calc=%.2f expected=%.2f for %s", calculatedLogL, expectedLogL, cfg.toString())); + Assert.assertEquals(calculatedLogL, expectedLogL, cfg.tolerance()); + } + + @DataProvider(name = "BandedLikelihoodTestProvider") + public Object[][] makeBandedLikelihoodTests() { + // context on either side is ACGTTGCA REF ACGTTGCA + // test all combinations + final List baseQuals = EXTENSIVE_TESTING ? Arrays.asList(25, 30, 40, 50) : Arrays.asList(30); + final List indelQuals = EXTENSIVE_TESTING ? Arrays.asList(30, 40, 50) : Arrays.asList(40); + final List gcps = EXTENSIVE_TESTING ? Arrays.asList(10, 12) : Arrays.asList(10); + final List sizes = EXTENSIVE_TESTING ? Arrays.asList(2,3,4,5,7,8,9,10,20) : Arrays.asList(2); + + for ( final int baseQual : baseQuals ) { + for ( final int indelQual : indelQuals ) { + for ( final int gcp : gcps ) { + + // test substitutions + for ( final byte refBase : BaseUtils.BASES ) { + for ( final byte readBase : BaseUtils.BASES ) { + final String ref = new String(new byte[]{refBase}); + final String read = new String(new byte[]{readBase}); + final int expected = refBase == readBase ? 
0 : baseQual; + new BandedLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp); + } + } + + // test insertions and deletions + for ( final int size : sizes ) { + for ( final byte base : BaseUtils.BASES ) { + final int expected = indelQual + (size - 2) * gcp; + + for ( boolean insertionP : Arrays.asList(true, false)) { + final String small = Utils.dupString((char)base, 1); + final String big = Utils.dupString((char)base, size); + + final String ref = insertionP ? small : big; + final String read = insertionP ? big : small; + + new BandedLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp); + new BandedLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp, true, false); + new BandedLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp, false, true); + new BandedLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp, true, true); + } + } + } + } + } + } + + return BandedLikelihoodTestProvider.getTests(BandedLikelihoodTestProvider.class); + } + + @Test(dataProvider = "BandedLikelihoodTestProvider", enabled = true) + public void testBandedLikelihoods(BandedLikelihoodTestProvider cfg) { + double calculatedLogL = cfg.calcLogL(); + double expectedLogL = cfg.expectedLogL(); + logger.warn(String.format("Test: logL calc=%.2f expected=%.2f for %s", calculatedLogL, expectedLogL, cfg.toString())); + Assert.assertEquals(calculatedLogL, expectedLogL, cfg.tolerance()); + } + + @Test + public void testMismatchInEveryPositionInTheReadWithCenteredHaplotype() { + byte[] haplotype1 = "TTCTCTTCTGTTGTGGCTGGTT".getBytes(); + + final int offset = 2; + byte[] gop = new byte[haplotype1.length - 2 * offset]; + Arrays.fill(gop, (byte) 80); + byte[] gcp = new byte[haplotype1.length - 2 * offset]; + Arrays.fill(gcp, (byte) 80); + + for( int k = 0; k < haplotype1.length - 2 * offset; k++ ) { + byte[] quals = new byte[haplotype1.length - 2 * offset]; + 
Arrays.fill(quals, (byte) 90); + // one read mismatches the haplotype + quals[k] = 20; + + byte[] mread = Arrays.copyOfRange(haplotype1,offset,haplotype1.length-offset); + // change single base at position k to C. If it's a C, change to T + mread[k] = ( mread[k] == (byte)'C' ? (byte)'T' : (byte)'C'); + double res1 = hmm.computeReadLikelihoodGivenHaplotype( + haplotype1, mread, + quals, gop, gop, + gcp); + + + System.out.format("H:%s\nR: %s\n Pos:%d Result:%4.2f\n",new String(haplotype1), new String(mread), k,res1); + + Assert.assertEquals(res1, -2.0, 1e-2); + } + } + + @Test + public void testMismatchInEveryPositionInTheRead() { + byte[] haplotype1 = "TTCTCTTCTGTTGTGGCTGGTT".getBytes(); + + final int offset = 2; + byte[] gop = new byte[haplotype1.length - offset]; + Arrays.fill(gop, (byte) 80); + byte[] gcp = new byte[haplotype1.length - offset]; + Arrays.fill(gcp, (byte) 80); + + for( int k = 0; k < haplotype1.length - offset; k++ ) { + byte[] quals = new byte[haplotype1.length - offset]; + Arrays.fill(quals, (byte) 90); + // one read mismatches the haplotype + quals[k] = 20; + + byte[] mread = Arrays.copyOfRange(haplotype1,offset,haplotype1.length); + // change single base at position k to C. If it's a C, change to T + mread[k] = ( mread[k] == (byte)'C' ? 
(byte)'T' : (byte)'C'); + double res1 = hmm.computeReadLikelihoodGivenHaplotype( + haplotype1, mread, + quals, gop, gop, + gcp); + + + System.out.format("H:%s\nR: %s\n Pos:%d Result:%4.2f\n",new String(haplotype1), new String(mread), k,res1); + + Assert.assertEquals(res1, -2.0, 1e-2); + } + } +} \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/utils/QualityUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/QualityUtilsUnitTest.java new file mode 100644 index 000000000..18a214950 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/QualityUtilsUnitTest.java @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils; + +/** + * Created by IntelliJ IDEA. 
+ * User: rpoplin + * Date: 3/21/12 + */ + +import org.broadinstitute.sting.BaseTest; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.Test; + +import java.util.*; + +/** + * Basic unit test for QualityUtils class + */ +public class QualityUtilsUnitTest extends BaseTest { + @BeforeClass + public void init() { + } + + @Test + public void testQualCaches() { + Assert.assertEquals(QualityUtils.qualToErrorProb((byte) 20), 0.01, 1e-6); + Assert.assertEquals(QualityUtils.qualToErrorProbLog10((byte) 20), -2.0, 1e-6); + Assert.assertEquals(QualityUtils.qualToProb((byte) 20), 0.99, 1e-6); + Assert.assertEquals(QualityUtils.qualToProbLog10((byte) 20), -0.0043648054, 1e-6); + + Assert.assertEquals(QualityUtils.qualToErrorProb((byte) 30), 0.001, 1e-6); + Assert.assertEquals(QualityUtils.qualToErrorProbLog10((byte) 30), -3.0, 1e-6); + Assert.assertEquals(QualityUtils.qualToProb((byte) 30), 0.999, 1e-6); + Assert.assertEquals(QualityUtils.qualToProbLog10((byte) 30), -0.000434511774, 1e-6); + + Assert.assertEquals(QualityUtils.qualToErrorProb((byte) 40), 0.0001, 1e-6); + Assert.assertEquals(QualityUtils.qualToErrorProbLog10((byte) 40), -4.0, 1e-6); + Assert.assertEquals(QualityUtils.qualToProb((byte) 40), 0.9999, 1e-6); + Assert.assertEquals(QualityUtils.qualToProbLog10((byte) 40), -4.34316198e-5, 1e-6); + } +} \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/utils/R/RUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/R/RUtilsUnitTest.java new file mode 100644 index 000000000..23bf074e2 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/R/RUtilsUnitTest.java @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation 
the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.R; + +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +public class RUtilsUnitTest { + @DataProvider(name = "stringLists") + public Object[][] getStringLists() { + return new Object[][] { + new Object[] { null, "NA" }, + new Object[] { Collections.EMPTY_LIST, "c()" }, + new Object[] { Arrays.asList("1", "2", "3"), "c('1','2','3')" } + }; + } + + @Test(dataProvider = "stringLists") + public void testToStringList(List actual, String expected) { + Assert.assertEquals(RUtils.toStringList(actual), expected); + } + + @DataProvider(name = "numberLists") + public Object[][] getNumberLists() { + return new Object[][] { + new Object[] { null, "NA" }, + new Object[] { Collections.EMPTY_LIST, "c()" }, + new Object[] { Arrays.asList(1, 2, 3), "c(1,2,3)" }, + new Object[] { Arrays.asList(1D, 2D, 3D), "c(1.0,2.0,3.0)" } + }; + } + + @Test(dataProvider = "numberLists") + public void testToNumberList(List actual, String expected) { + 
Assert.assertEquals(RUtils.toNumberList(actual), expected); + } +} diff --git a/public/java/test/org/broadinstitute/sting/utils/activeregion/ActivityProfileUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/activeregion/ActivityProfileUnitTest.java new file mode 100644 index 000000000..282f19d8a --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/activeregion/ActivityProfileUnitTest.java @@ -0,0 +1,144 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +// our package +package org.broadinstitute.sting.utils.activeregion; + + +// the imports for unit testing. 
+ + +import net.sf.picard.reference.ReferenceSequenceFile; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.FileNotFoundException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + + +public class ActivityProfileUnitTest extends BaseTest { + private GenomeLocParser genomeLocParser; + private GenomeLoc startLoc; + + @BeforeClass + public void init() throws FileNotFoundException { + // sequence + ReferenceSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(hg18Reference)); + genomeLocParser = new GenomeLocParser(seq); + startLoc = genomeLocParser.createGenomeLoc("chr1", 1, 1, 100); + } + + // -------------------------------------------------------------------------------- + // + // Basic tests Provider + // + // -------------------------------------------------------------------------------- + + private class BasicActivityProfileTestProvider extends TestDataProvider { + List probs; + List expectedRegions; + int extension = 0; + GenomeLoc regionStart = startLoc; + + public BasicActivityProfileTestProvider(final List probs, final List expectedRegions) { + super(BasicActivityProfileTestProvider.class); + this.probs = probs; + this.expectedRegions = expectedRegions; + setName(getName()); + } + + public BasicActivityProfileTestProvider(final List probs, boolean startActive, int ... 
startsAndStops) { + super(BasicActivityProfileTestProvider.class); + this.probs = probs; + this.expectedRegions = toRegions(startActive, startsAndStops); + setName(getName()); + } + + private String getName() { + return String.format("probs=%s expectedRegions=%s", Utils.join(",", probs), Utils.join(",", expectedRegions)); + } + + private List toRegions(boolean isActive, int[] startsAndStops) { + List l = new ArrayList(); + for ( int i = 0; i < startsAndStops.length - 1; i++) { + int start = regionStart.getStart() + startsAndStops[i]; + int end = regionStart.getStart() + startsAndStops[i+1] - 1; + GenomeLoc activeLoc = genomeLocParser.createGenomeLoc(regionStart.getContig(), start, end); + ActiveRegion r = new ActiveRegion(activeLoc, isActive, genomeLocParser, extension); + l.add(r); + isActive = ! isActive; + } + return l; + } + } + + @DataProvider(name = "BasicActivityProfileTestProvider") + public Object[][] makeQualIntervalTestProvider() { + new BasicActivityProfileTestProvider(Arrays.asList(1.0), true, 0, 1); + new BasicActivityProfileTestProvider(Arrays.asList(1.0, 0.0), true, 0, 1, 2); + new BasicActivityProfileTestProvider(Arrays.asList(0.0, 1.0), false, 0, 1, 2); + new BasicActivityProfileTestProvider(Arrays.asList(1.0, 0.0, 1.0), true, 0, 1, 2, 3); + new BasicActivityProfileTestProvider(Arrays.asList(1.0, 1.0, 1.0), true, 0, 3); + + return BasicActivityProfileTestProvider.getTests(BasicActivityProfileTestProvider.class); + } + + @Test(dataProvider = "BasicActivityProfileTestProvider") + public void testBasicActivityProfile(BasicActivityProfileTestProvider cfg) { + ActivityProfile profile = new ActivityProfile(genomeLocParser, false); + + Assert.assertEquals(profile.parser, genomeLocParser); + + for ( int i = 0; i < cfg.probs.size(); i++ ) { + double p = cfg.probs.get(i); + GenomeLoc loc = genomeLocParser.createGenomeLoc(cfg.regionStart.getContig(), cfg.regionStart.getStart() + i, cfg.regionStart.getStart() + i); + profile.add(loc, p); + } + 
Assert.assertEquals(profile.regionStartLoc, genomeLocParser.createGenomeLoc(cfg.regionStart.getContig(), cfg.regionStart.getStart(), cfg.regionStart.getStart() )); + + Assert.assertEquals(profile.size(), cfg.probs.size()); + Assert.assertEquals(profile.isActiveList, cfg.probs); + + assertRegionsAreEqual(profile.createActiveRegions(0, 100), cfg.expectedRegions); + } + + private void assertRegionsAreEqual(List actual, List expected) { + Assert.assertEquals(actual.size(), expected.size()); + for ( int i = 0; i < actual.size(); i++ ) { + Assert.assertTrue(actual.get(i).equalExceptReads(expected.get(i))); + } + } + + // todo -- test extensions +} \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFCodecUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFCodecUnitTest.java index 7681ed7d1..e0fb1b876 100644 --- a/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFCodecUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFCodecUnitTest.java @@ -85,7 +85,7 @@ public class VCFCodecUnitTest extends BaseTest { @Test(dataProvider = "AlleleClippingTestProvider") public void TestAlleleClipping(AlleleClippingTestProvider cfg) { - int result = AbstractVCFCodec.computeReverseClipping(cfg.alleles, cfg.ref, 0, 1); + int result = AbstractVCFCodec.computeReverseClipping(cfg.alleles, cfg.ref.getBytes(), 0, false, 1); Assert.assertEquals(result, cfg.expectedClip); } } \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalIntegrationTest.java b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalIntegrationTest.java index a8364419d..756966e97 100644 --- a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalIntegrationTest.java @@ -229,7 +229,7 @@ public class IntervalIntegrationTest extends WalkerTest { 
@Test(enabled = true) public void testEmptyVCF() { - String md5 = ""; + String md5 = "897316929176464ebc9ad085f31e7284"; WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T CountLoci" + " -I " + validationDataLocation + "OV-0930.normal.chunk.bam" + @@ -238,12 +238,12 @@ public class IntervalIntegrationTest extends WalkerTest { " -L " + validationDataLocation + "intervalTest.empty.vcf", 1, // just one output file Arrays.asList(md5)); - executeTest("testEmptyVCFError", spec); + executeTest("testEmptyVCFWarning", spec); } @Test(enabled = true) public void testIncludeExcludeIsTheSame() { - String md5 = ""; + String md5 = "897316929176464ebc9ad085f31e7284"; WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T CountLoci" + " -I " + validationDataLocation + "OV-0930.normal.chunk.bam" + @@ -256,5 +256,17 @@ public class IntervalIntegrationTest extends WalkerTest { executeTest("testIncludeExcludeIsTheSame", spec); } - + @Test(enabled = true) + public void testSymbolicAlleles() { + String md5 = "52745056d2fd5904857bbd4984c08098"; + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T CountLoci" + + " -I " + validationDataLocation + "NA12878.chrom1.SLX.SRP000032.2009_06.bam" + + " -R " + b36KGReference + + " -o %s" + + " -L " + validationDataLocation + "symbolic_alleles_1.vcf", + 1, // just one output file + Arrays.asList(md5)); + executeTest("testSymbolicAlleles", spec); + } } diff --git a/public/java/test/org/broadinstitute/sting/utils/recalibration/BaseRecalibrationUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/recalibration/BaseRecalibrationUnitTest.java new file mode 100644 index 000000000..0026a2b6a --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/recalibration/BaseRecalibrationUnitTest.java @@ -0,0 +1,290 @@ +package org.broadinstitute.sting.utils.recalibration; + +import org.broadinstitute.sting.gatk.walkers.bqsr.*; +import org.broadinstitute.sting.utils.QualityUtils; +import 
org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.collections.NestedHashMap; +import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.Test; + +import java.util.*; + +/** + * Unit tests for on-the-fly recalibration. + * + * @author Mauricio Carneiro + * @since 3/16/12 + */ +public class BaseRecalibrationUnitTest { + + private org.broadinstitute.sting.gatk.walkers.recalibration.RecalDataManager dataManager; + private LinkedHashMap> keysAndTablesMap; + + private ReadGroupCovariate rgCovariate; + private QualityScoreCovariate qsCovariate; + private ContextCovariate cxCovariate; + private CycleCovariate cyCovariate; + + private GATKSAMRecord read = ReadUtils.createRandomRead(10000); + private BaseRecalibration baseRecalibration; + private ReadCovariates readCovariates; + + + @BeforeClass + public void init() { + GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord("rg"); + rg.setPlatform("illumina"); + read.setReadGroup(rg); + + byte[] quals = new byte[read.getReadLength()]; + for (int i = 0; i < read.getReadLength(); i++) + quals[i] = 20; + read.setBaseQualities(quals); + + RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection(); + List requiredCovariates = new ArrayList(); + List optionalCovariates = new ArrayList(); + ArrayList requestedCovariates = new ArrayList(); + + dataManager = new org.broadinstitute.sting.gatk.walkers.recalibration.RecalDataManager(true, 4); + keysAndTablesMap = new LinkedHashMap>(); + + rgCovariate = new ReadGroupCovariate(); + rgCovariate.initialize(RAC); + requiredCovariates.add(rgCovariate); + BQSRKeyManager rgKeyManager = new BQSRKeyManager(requiredCovariates, optionalCovariates); + keysAndTablesMap.put(rgKeyManager, new HashMap()); + + qsCovariate = new 
QualityScoreCovariate(); + qsCovariate.initialize(RAC); + requiredCovariates.add(qsCovariate); + BQSRKeyManager qsKeyManager = new BQSRKeyManager(requiredCovariates, optionalCovariates); + keysAndTablesMap.put(qsKeyManager, new HashMap()); + + cxCovariate = new ContextCovariate(); + cxCovariate.initialize(RAC); + optionalCovariates.add(cxCovariate); + cyCovariate = new CycleCovariate(); + cyCovariate.initialize(RAC); + optionalCovariates.add(cyCovariate); + BQSRKeyManager cvKeyManager = new BQSRKeyManager(requiredCovariates, optionalCovariates); + keysAndTablesMap.put(cvKeyManager, new HashMap()); + + + for (Covariate cov : requiredCovariates) + requestedCovariates.add(cov); + for (Covariate cov : optionalCovariates) + requestedCovariates.add(cov); + + readCovariates = RecalDataManager.computeCovariates(read, requestedCovariates); + + for (int i=0; i> mapEntry : keysAndTablesMap.entrySet()) { + List keys = mapEntry.getKey().bitSetsFromAllKeys(bitKeys, EventType.BASE_SUBSTITUTION); + for (BitSet key : keys) + updateCovariateWithKeySet(mapEntry.getValue(), key, newDatum); + } + } + dataManager.generateEmpiricalQualities(1, QualityUtils.MAX_RECALIBRATED_Q_SCORE); + + List quantizedQuals = new ArrayList(); + List qualCounts = new ArrayList(); + for (byte i = 0; i <= QualityUtils.MAX_QUAL_SCORE; i++) { + quantizedQuals.add(i); + qualCounts.add(1L); + } + QuantizationInfo quantizationInfo = new QuantizationInfo(quantizedQuals, qualCounts); + quantizationInfo.noQuantization(); + baseRecalibration = new BaseRecalibration(quantizationInfo, keysAndTablesMap, requestedCovariates); + + } + + + @Test(enabled=false) + public void testGoldStandardComparison() { + debugTables(); + for (int i = 0; i < read.getReadLength(); i++) { + BitSet [] bitKey = readCovariates.getKeySet(i, EventType.BASE_SUBSTITUTION); + Object [] objKey = buildObjectKey(bitKey); + byte v2 = baseRecalibration.performSequentialQualityCalculation(bitKey, EventType.BASE_SUBSTITUTION); + byte v1 = 
goldStandardSequentialCalculation(objKey); + Assert.assertEquals(v2, v1); + } + } + + private Object[] buildObjectKey(BitSet[] bitKey) { + Object[] key = new Object[bitKey.length]; + key[0] = rgCovariate.keyFromBitSet(bitKey[0]); + key[1] = qsCovariate.keyFromBitSet(bitKey[1]); + key[2] = cxCovariate.keyFromBitSet(bitKey[2]); + key[3] = cyCovariate.keyFromBitSet(bitKey[3]); + return key; + } + + private void debugTables() { + System.out.println("\nV1 Table\n"); + System.out.println("ReadGroup Table:"); + NestedHashMap nestedTable = dataManager.getCollapsedTable(0); + printNestedHashMap(nestedTable.data, ""); + System.out.println("\nQualityScore Table:"); + nestedTable = dataManager.getCollapsedTable(1); + printNestedHashMap(nestedTable.data, ""); + System.out.println("\nCovariates Table:"); + nestedTable = dataManager.getCollapsedTable(2); + printNestedHashMap(nestedTable.data, ""); + nestedTable = dataManager.getCollapsedTable(3); + printNestedHashMap(nestedTable.data, ""); + + + int i = 0; + System.out.println("\nV2 Table\n"); + for (Map.Entry> mapEntry : keysAndTablesMap.entrySet()) { + BQSRKeyManager keyManager = mapEntry.getKey(); + Map table = mapEntry.getValue(); + switch(i++) { + case 0 : + System.out.println("ReadGroup Table:"); + break; + case 1 : + System.out.println("QualityScore Table:"); + break; + case 2 : + System.out.println("Covariates Table:"); + break; + } + for (Map.Entry entry : table.entrySet()) { + BitSet key = entry.getKey(); + RecalDatum datum = entry.getValue(); + List keySet = keyManager.keySetFrom(key); + System.out.println(String.format("%s => %s", Utils.join(",", keySet), datum) + "," + datum.getEstimatedQReported()); + } + System.out.println(); + } + + + } + + private static void printNestedHashMap(Map table, String output) { + for (Object key : table.keySet()) { + String ret; + if (output.isEmpty()) + ret = "" + key; + else + ret = output + "," + key; + + Object next = table.get(key); + if (next instanceof 
org.broadinstitute.sting.gatk.walkers.recalibration.RecalDatum) + System.out.println(ret + " => " + next); + else + printNestedHashMap((Map) next, "" + ret); + } + } + + private void updateCovariateWithKeySet(final Map recalTable, final BitSet hashKey, final RecalDatum datum) { + RecalDatum previousDatum = recalTable.get(hashKey); // using the list of covariate values as a key, pick out the RecalDatum from the data HashMap + if (previousDatum == null) // key doesn't exist yet in the map so make a new bucket and add it + recalTable.put(hashKey, datum.copy()); + else + previousDatum.combine(datum); // add one to the number of observations and potentially one to the number of mismatches + } + + /** + * Implements a serial recalibration of the reads using the combinational table. + * First, we perform a positional recalibration, and then a subsequent dinuc correction. + * + * Given the full recalibration table, we perform the following preprocessing steps: + * + * - calculate the global quality score shift across all data [DeltaQ] + * - calculate for each of cycle and dinuc the shift of the quality scores relative to the global shift + * -- i.e., DeltaQ(dinuc) = Sum(pos) Sum(Qual) Qempirical(pos, qual, dinuc) - Qreported(pos, qual, dinuc) / Npos * Nqual + * - The final shift equation is: + * + * Qrecal = Qreported + DeltaQ + DeltaQ(pos) + DeltaQ(dinuc) + DeltaQ( ... any other covariate ... ) + * + * @param key The list of Comparables that were calculated from the covariates + * @return A recalibrated quality score as a byte + */ + private byte goldStandardSequentialCalculation(final Object... 
key) { + + final byte qualFromRead = (byte) Integer.parseInt(key[1].toString()); + final Object[] readGroupCollapsedKey = new Object[1]; + final Object[] qualityScoreCollapsedKey = new Object[2]; + final Object[] covariateCollapsedKey = new Object[3]; + + // The global quality shift (over the read group only) + readGroupCollapsedKey[0] = key[0]; + final org.broadinstitute.sting.gatk.walkers.recalibration.RecalDatum globalRecalDatum = ((org.broadinstitute.sting.gatk.walkers.recalibration.RecalDatum) dataManager.getCollapsedTable(0).get(readGroupCollapsedKey)); + double globalDeltaQ = 0.0; + if (globalRecalDatum != null) { + final double globalDeltaQEmpirical = globalRecalDatum.getEmpiricalQuality(); + final double aggregrateQReported = globalRecalDatum.getEstimatedQReported(); + globalDeltaQ = globalDeltaQEmpirical - aggregrateQReported; + } + + // The shift in quality between reported and empirical + qualityScoreCollapsedKey[0] = key[0]; + qualityScoreCollapsedKey[1] = key[1]; + final org.broadinstitute.sting.gatk.walkers.recalibration.RecalDatum qReportedRecalDatum = ((org.broadinstitute.sting.gatk.walkers.recalibration.RecalDatum) dataManager.getCollapsedTable(1).get(qualityScoreCollapsedKey)); + double deltaQReported = 0.0; + if (qReportedRecalDatum != null) { + final double deltaQReportedEmpirical = qReportedRecalDatum.getEmpiricalQuality(); + deltaQReported = deltaQReportedEmpirical - qualFromRead - globalDeltaQ; + } + + // The shift in quality due to each covariate by itself in turn + double deltaQCovariates = 0.0; + double deltaQCovariateEmpirical; + covariateCollapsedKey[0] = key[0]; + covariateCollapsedKey[1] = key[1]; + for (int iii = 2; iii < key.length; iii++) { + covariateCollapsedKey[2] = key[iii]; // The given covariate + final org.broadinstitute.sting.gatk.walkers.recalibration.RecalDatum covariateRecalDatum = ((org.broadinstitute.sting.gatk.walkers.recalibration.RecalDatum) dataManager.getCollapsedTable(iii).get(covariateCollapsedKey)); + if 
(covariateRecalDatum != null) { + deltaQCovariateEmpirical = covariateRecalDatum.getEmpiricalQuality(); + deltaQCovariates += (deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported)); + } + } + + final double newQuality = qualFromRead + globalDeltaQ + deltaQReported + deltaQCovariates; + return QualityUtils.boundQual((int) Math.round(newQuality), QualityUtils.MAX_RECALIBRATED_Q_SCORE); + + // Verbose printouts used to validate with old recalibrator + //if(key.contains(null)) { + // System.out.println( key + String.format(" => %d + %.2f + %.2f + %.2f + %.2f = %d", + // qualFromRead, globalDeltaQ, deltaQReported, deltaQPos, deltaQDinuc, newQualityByte)); + //} + //else { + // System.out.println( String.format("%s %s %s %s => %d + %.2f + %.2f + %.2f + %.2f = %d", + // key.get(0).toString(), key.get(3).toString(), key.get(2).toString(), key.get(1).toString(), qualFromRead, globalDeltaQ, deltaQReported, deltaQPos, deltaQDinuc, newQualityByte) ); + //} + + //return newQualityByte; + } + + public static double calcEmpiricalQual(final int observations, final int errors) { + final int smoothing = 1; + final double doubleMismatches = (double) (errors + smoothing); + final double doubleObservations = (double) ( observations + smoothing ); + double empiricalQual = -10 * Math.log10(doubleMismatches / doubleObservations); + return Math.min(QualityUtils.MAX_RECALIBRATED_Q_SCORE, empiricalQual); + } +} diff --git a/public/java/test/org/broadinstitute/sting/utils/sam/AlignmentUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/sam/AlignmentUtilsUnitTest.java new file mode 100644 index 000000000..5a8582fb2 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/sam/AlignmentUtilsUnitTest.java @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the 
Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.sam; + +import junit.framework.Assert; +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMRecord; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +public class AlignmentUtilsUnitTest { + private SAMFileHeader header; + + /** Basic aligned and mapped read. */ + private SAMRecord readMapped; + + /** Read with no contig specified in the read, -L UNMAPPED */ + private SAMRecord readNoReference; + + /** This read has a start position, but is flagged that it's not mapped. */ + private SAMRecord readUnmappedFlag; + + /** This read says it's aligned, but to a contig not in the header. */ + private SAMRecord readUnknownContig; + + /** This read says it's aligned, but actually has an unknown start. 
*/ + private SAMRecord readUnknownStart; + + @BeforeClass + public void init() { + header = ArtificialSAMUtils.createArtificialSamHeader(3, 1, ArtificialSAMUtils.DEFAULT_READ_LENGTH * 2); + + readMapped = createMappedRead("mapped", 1); + + readNoReference = createUnmappedRead("unmappedNoReference"); + + readUnmappedFlag = createMappedRead("unmappedFlagged", 2); + readUnmappedFlag.setReadUnmappedFlag(true); + + readUnknownContig = createMappedRead("unknownContig", 3); + readUnknownContig.setReferenceName("unknownContig"); + + readUnknownStart = createMappedRead("unknownStart", 1); + readUnknownStart.setAlignmentStart(SAMRecord.NO_ALIGNMENT_START); + } + + /** + * Test for -L UNMAPPED + */ + @DataProvider(name = "genomeLocUnmappedReadTests") + public Object[][] getGenomeLocUnmappedReadTests() { + return new Object[][] { + new Object[] {readNoReference, true}, + new Object[] {readMapped, false}, + new Object[] {readUnmappedFlag, false}, + new Object[] {readUnknownContig, false}, + new Object[] {readUnknownStart, false} + }; + } + @Test(dataProvider = "genomeLocUnmappedReadTests") + public void testIsReadGenomeLocUnmapped(SAMRecord read, boolean expected) { + Assert.assertEquals(AlignmentUtils.isReadGenomeLocUnmapped(read), expected); + } + + /** + * Test for read being truly unmapped + */ + @DataProvider(name = "unmappedReadTests") + public Object[][] getUnmappedReadTests() { + return new Object[][] { + new Object[] {readNoReference, true}, + new Object[] {readMapped, false}, + new Object[] {readUnmappedFlag, true}, + new Object[] {readUnknownContig, false}, + new Object[] {readUnknownStart, true} + }; + } + @Test(dataProvider = "unmappedReadTests") + public void testIsReadUnmapped(SAMRecord read, boolean expected) { + Assert.assertEquals(AlignmentUtils.isReadUnmapped(read), expected); + } + + private SAMRecord createUnmappedRead(String name) { + return ArtificialSAMUtils.createArtificialRead( + header, + name, + SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX, + 
SAMRecord.NO_ALIGNMENT_START, + ArtificialSAMUtils.DEFAULT_READ_LENGTH); + } + + private SAMRecord createMappedRead(String name, int start) { + return ArtificialSAMUtils.createArtificialRead( + header, + name, + 0, + start, + ArtificialSAMUtils.DEFAULT_READ_LENGTH); + } +} diff --git a/public/java/test/org/broadinstitute/sting/utils/sam/GATKSAMRecordUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/sam/GATKSAMRecordUnitTest.java index 520fb7040..5946e38ea 100755 --- a/public/java/test/org/broadinstitute/sting/utils/sam/GATKSAMRecordUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/sam/GATKSAMRecordUnitTest.java @@ -42,8 +42,8 @@ public class GATKSAMRecordUnitTest extends BaseTest { @Test public void testReducedReadPileupElement() { - PileupElement readp = new PileupElement(read, 0, false, false, false, false); - PileupElement reducedreadp = new PileupElement(reducedRead, 0, false, false, false, false); + PileupElement readp = new PileupElement(read, 0, false, false, false, false, false, false); + PileupElement reducedreadp = new PileupElement(reducedRead, 0, false, false, false, false, false, false); Assert.assertFalse(readp.getRead().isReducedRead()); diff --git a/public/java/test/org/broadinstitute/sting/utils/text/ListFileUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/text/ListFileUtilsUnitTest.java index f0b1de6fe..f21b4bced 100644 --- a/public/java/test/org/broadinstitute/sting/utils/text/ListFileUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/text/ListFileUtilsUnitTest.java @@ -28,17 +28,14 @@ import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.commandline.ParsingEngine; import org.broadinstitute.sting.commandline.Tags; import org.broadinstitute.sting.gatk.CommandLineGATK; -import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection; import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; import org.testng.Assert; -import 
org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.io.File; import java.io.PrintWriter; -import java.util.ArrayList; -import java.util.List; - +import java.util.*; /** * Tests selected functionality in the CommandLineExecutable class @@ -74,6 +71,76 @@ public class ListFileUtilsUnitTest extends BaseTest { performBAMListFileUnpackingTest(tempListFile, expectedBAMFileListAfterUnpacking); } + @Test + public void testUnpackSet() throws Exception { + Set expected = new HashSet(Arrays.asList("public/testdata/exampleBAM.bam")); + Set actual; + + actual = ListFileUtils.unpackSet(Arrays.asList("public/testdata/exampleBAM.bam")); + Assert.assertEquals(actual, expected); + + File tempListFile = createTempListFile("testUnpackSet", + "#", + "public/testdata/exampleBAM.bam", + "#public/testdata/foo.bam", + " # public/testdata/bar.bam" + ); + actual = ListFileUtils.unpackSet(Arrays.asList(tempListFile.getAbsolutePath())); + Assert.assertEquals(actual, expected); + } + + @DataProvider(name="includeMatchingTests") + public Object[][] getIncludeMatchingTests() { + return new Object[][] { + new Object[] { asSet("a", "ab", "abc"), Arrays.asList("a"), true, asSet("a") }, + new Object[] { asSet("a", "ab", "abc"), Arrays.asList("a"), false, asSet("a", "ab", "abc") }, + new Object[] { asSet("a", "ab", "abc"), Arrays.asList("b"), true, Collections.EMPTY_SET }, + new Object[] { asSet("a", "ab", "abc"), Arrays.asList("b"), false, asSet("ab", "abc") }, + new Object[] { asSet("a", "ab", "abc"), Arrays.asList("a", "b"), true, asSet("a") }, + new Object[] { asSet("a", "ab", "abc"), Arrays.asList("a", "b"), false, asSet("a", "ab", "abc") }, + new Object[] { asSet("a", "ab", "abc"), Arrays.asList("a", "ab"), true, asSet("a", "ab") }, + new Object[] { asSet("a", "ab", "abc"), Arrays.asList("a", "ab"), false, asSet("a", "ab", "abc") }, + new Object[] { asSet("a", "ab", "abc"), Arrays.asList(".*b.*"), true, 
Collections.EMPTY_SET }, + new Object[] { asSet("a", "ab", "abc"), Arrays.asList(".*b.*"), false, asSet("ab", "abc") }, + new Object[] { asSet("a", "ab", "abc"), Arrays.asList(".*"), true, Collections.EMPTY_SET }, + new Object[] { asSet("a", "ab", "abc"), Arrays.asList(".*"), false, asSet("a", "ab", "abc") } + }; + } + + @Test(dataProvider = "includeMatchingTests") + public void testIncludeMatching(Set values, Collection filters, boolean exactMatch, Set expected) { + Set actual = ListFileUtils.includeMatching(values, ListFileUtils.IDENTITY_STRING_CONVERTER, filters, exactMatch); + Assert.assertEquals(actual, expected); + } + + @DataProvider(name="excludeMatchingTests") + public Object[][] getExcludeMatchingTests() { + return new Object[][] { + new Object[] { asSet("a", "ab", "abc"), Arrays.asList("a"), true, asSet("ab", "abc") }, + new Object[] { asSet("a", "ab", "abc"), Arrays.asList("a"), false, Collections.EMPTY_SET }, + new Object[] { asSet("a", "ab", "abc"), Arrays.asList("b"), true, asSet("a", "ab", "abc") }, + new Object[] { asSet("a", "ab", "abc"), Arrays.asList("b"), false, asSet("a") }, + new Object[] { asSet("a", "ab", "abc"), Arrays.asList("a", "b"), true, asSet("ab", "abc") }, + new Object[] { asSet("a", "ab", "abc"), Arrays.asList("a", "b"), false, Collections.EMPTY_SET }, + new Object[] { asSet("a", "ab", "abc"), Arrays.asList("a", "ab"), true, asSet("abc") }, + new Object[] { asSet("a", "ab", "abc"), Arrays.asList("a", "ab"), false, Collections.EMPTY_SET }, + new Object[] { asSet("a", "ab", "abc"), Arrays.asList(".*b.*"), true, asSet("a", "ab", "abc") }, + new Object[] { asSet("a", "ab", "abc"), Arrays.asList(".*b.*"), false, asSet("a") }, + new Object[] { asSet("a", "ab", "abc"), Arrays.asList(".*"), true, asSet("a", "ab", "abc") }, + new Object[] { asSet("a", "ab", "abc"), Arrays.asList(".*"), false, Collections.EMPTY_SET } + }; + } + + @Test(dataProvider = "excludeMatchingTests") + public void testExcludeMatching(Set values, Collection filters, 
boolean exactMatch, Set expected) { + Set actual = ListFileUtils.excludeMatching(values, ListFileUtils.IDENTITY_STRING_CONVERTER, filters, exactMatch); + Assert.assertEquals(actual, expected); + } + + private static Set asSet(T... args){ + return new HashSet(Arrays.asList(args)); + } + private File createTempListFile( String tempFilePrefix, String... lines ) throws Exception { File tempListFile = File.createTempFile(tempFilePrefix, ".list"); tempListFile.deleteOnExit(); diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoodsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoodsUnitTest.java index 638fd2531..531626540 100755 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoodsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoodsUnitTest.java @@ -30,6 +30,7 @@ package org.broadinstitute.sting.utils.variantcontext; import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.exceptions.UserException; import org.testng.Assert; import org.testng.annotations.Test; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; @@ -66,7 +67,7 @@ public class GenotypeLikelihoodsUnitTest { Assert.assertEquals(gl.getAsString(), vPLString); } - @Test (expectedExceptions = NumberFormatException.class) + @Test (expectedExceptions = UserException.MalformedVCF.class) public void testErrorBadFormat() { GenotypeLikelihoods gl = new GenotypeLikelihoods("adf,b,c"); gl.getAsVector(); @@ -95,6 +96,17 @@ public class GenotypeLikelihoodsUnitTest { } + @Test + public void testCalculateNumLikelihoods() { + + for (int nAlleles=2; nAlleles<=5; nAlleles++) + // simplest case: diploid + Assert.assertEquals(GenotypeLikelihoods.calculateNumLikelihoods(nAlleles, 2), nAlleles*(nAlleles+1)/2); + + // some special cases: ploidy = 20, #alleles = 4 + 
Assert.assertEquals(GenotypeLikelihoods.calculateNumLikelihoods(4, 20), 1771); + } + @Test public void testGetLog10GQ(){ GenotypeLikelihoods gl = new GenotypeLikelihoods(vPLString); diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUnitTest.java index 0e75eee14..0a7427df7 100755 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUnitTest.java @@ -236,6 +236,16 @@ public class VariantContextUnitTest extends BaseTest { Assert.assertEquals(vc.getSampleNames().size(), 0); } + @Test + public void testMatchingAlleles() { + List alleles = Arrays.asList(ATCref, del); + VariantContext vc = new VariantContextBuilder("test", delLoc, delLocStart, delLocStop, alleles).referenceBaseForIndel((byte)'A').make(); + VariantContext vc2 = new VariantContextBuilder("test2", delLoc, delLocStart+12, delLocStop+12, alleles).referenceBaseForIndel((byte)'A').make(); + + Assert.assertTrue(vc.hasSameAllelesAs(vc2)); + Assert.assertTrue(vc.hasSameAlternateAllelesAs(vc2)); + } + @Test public void testCreatingInsertionVariantContext() { List alleles = Arrays.asList(delRef, ATC); @@ -458,6 +468,28 @@ public class VariantContextUnitTest extends BaseTest { } @Test + public void testGetGenotypeCounts() { + List alleles = Arrays.asList(Aref, T); + Genotype g1 = new Genotype("AA", Arrays.asList(Aref, Aref)); + Genotype g2 = new Genotype("AT", Arrays.asList(Aref, T)); + Genotype g3 = new Genotype("TT", Arrays.asList(T, T)); + Genotype g4 = new Genotype("A.", Arrays.asList(Aref, Allele.NO_CALL)); + Genotype g5 = new Genotype("..", Arrays.asList(Allele.NO_CALL, Allele.NO_CALL)); + + // we need to create a new VariantContext each time + VariantContext vc = new VariantContextBuilder("foo", snpLoc, snpLocStart, snpLocStop, 
alleles).genotypes(g1,g2,g3,g4,g5).make(); + Assert.assertEquals(1, vc.getHetCount()); + vc = new VariantContextBuilder("foo", snpLoc, snpLocStart, snpLocStop, alleles).genotypes(g1,g2,g3,g4,g5).make(); + Assert.assertEquals(1, vc.getHomRefCount()); + vc = new VariantContextBuilder("foo", snpLoc, snpLocStart, snpLocStop, alleles).genotypes(g1,g2,g3,g4,g5).make(); + Assert.assertEquals(1, vc.getHomVarCount()); + vc = new VariantContextBuilder("foo", snpLoc, snpLocStart, snpLocStop, alleles).genotypes(g1,g2,g3,g4,g5).make(); + Assert.assertEquals(1, vc.getMixedCount()); + vc = new VariantContextBuilder("foo", snpLoc, snpLocStart, snpLocStop, alleles).genotypes(g1,g2,g3,g4,g5).make(); + Assert.assertEquals(1, vc.getNoCallCount()); + } + + @Test public void testVCFfromGenotypes() { List alleles = Arrays.asList(Aref, T, del); Genotype g1 = new Genotype("AA", Arrays.asList(Aref, Aref)); diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUtilsUnitTest.java index ccf560f83..107241beb 100644 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUtilsUnitTest.java @@ -589,4 +589,76 @@ public class VariantContextUtilsUnitTest extends BaseTest { return priority; } + + + // -------------------------------------------------------------------------------- + // + // Test repeats + // + // -------------------------------------------------------------------------------- + + private class RepeatDetectorTest extends TestDataProvider { + String ref; + boolean isTrueRepeat; + VariantContext vc; + + private RepeatDetectorTest(boolean isTrueRepeat, String ref, String refAlleleString, String ... 
altAlleleStrings) { + super(RepeatDetectorTest.class); + this.ref = "N" + ref; // add a dummy base for the event here + this.isTrueRepeat = isTrueRepeat; + + List alleles = new LinkedList(); + final Allele refAllele = Allele.create(refAlleleString, true); + alleles.add(refAllele); + for ( final String altString: altAlleleStrings) { + final Allele alt = Allele.create(altString, false); + alleles.add(alt); + } + + VariantContextBuilder builder = new VariantContextBuilder("test", "chr1", 1, 1 + refAllele.length(), alleles); + this.vc = builder.make(); + } + + public String toString() { + return String.format("%s refBases=%s trueRepeat=%b vc=%s", super.toString(), ref, isTrueRepeat, vc); + } + } + + @DataProvider(name = "RepeatDetectorTest") + public Object[][] makeRepeatDetectorTest() { + new RepeatDetectorTest(true, "AAC", "-", "A"); + new RepeatDetectorTest(true, "AAC", "A", "-"); + new RepeatDetectorTest(false, "AAC", "AA", "-"); + new RepeatDetectorTest(false, "AAC", "-", "C"); + new RepeatDetectorTest(false, "AAC", "A", "C"); + + // running out of ref bases => false + new RepeatDetectorTest(false, "AAC", "-", "CAGTA"); + + // complex repeats + new RepeatDetectorTest(true, "ATATATC", "-", "AT"); + new RepeatDetectorTest(true, "ATATATC", "-", "ATA"); + new RepeatDetectorTest(true, "ATATATC", "-", "ATAT"); + new RepeatDetectorTest(true, "ATATATC", "AT", "-"); + new RepeatDetectorTest(false, "ATATATC", "ATA", "-"); + new RepeatDetectorTest(false, "ATATATC", "ATAT", "-"); + + // multi-allelic + new RepeatDetectorTest(true, "ATATATC", "-", "AT", "ATAT"); + new RepeatDetectorTest(true, "ATATATC", "-", "AT", "ATA"); + new RepeatDetectorTest(true, "ATATATC", "AT", "-", "ATAT"); + new RepeatDetectorTest(true, "ATATATC", "AT", "-", "ATA"); // two As + new RepeatDetectorTest(false, "ATATATC", "AT", "-", "ATC"); // false + new RepeatDetectorTest(false, "ATATATC", "AT", "-", "CC"); // false + new RepeatDetectorTest(false, "ATATATC", "AT", "ATAT", "CC"); // false + + return 
RepeatDetectorTest.getTests(RepeatDetectorTest.class); + } + + @Test(dataProvider = "RepeatDetectorTest") + public void testRepeatDetectorTest(RepeatDetectorTest cfg) { + + // test alleles are equal + Assert.assertEquals(VariantContextUtils.isTandemRepeat(cfg.vc, cfg.ref.getBytes()), cfg.isTrueRepeat); + } } diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala index 22ac52453..3dc953361 100755 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala @@ -81,7 +81,7 @@ class GATKResourcesBundle extends QScript { def initializeTestDataFiles() = { // - // Standard evaluation files for indels + // Standard evaluation files for indel // b37 = new Reference("b37", new File("/Users/depristo/Desktop/broadLocal/localData/human_g1k_v37.fasta")) hg18 = new Reference("hg18", new File("/Users/depristo/Desktop/broadLocal/localData/Homo_sapiens_assembly18.fasta")) @@ -122,8 +122,8 @@ class GATKResourcesBundle extends QScript { // // standard VCF files. 
Will be lifted to each reference // - addResource(new Resource("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/dbSNP/dbsnp_132_b37.leftAligned.vcf", - "dbsnp_132", b37, true, false)) + addResource(new Resource("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/dbSNP/dbsnp_135_b37.leftAligned.vcf", + "dbsnp_135", b37, true, false)) addResource(new Resource("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/Omni25_genotypes_1525_samples.b37.vcf", "1000G_omni2.5", b37, true, true)) @@ -131,8 +131,8 @@ class GATKResourcesBundle extends QScript { addResource(new Resource("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/genotypes_r27_nr.b37_fwd.vcf", "hapmap_3.3", b37, true, true)) - addResource(new Resource("/humgen/1kg/processing/official_release/phase1/ALL.wgs.VQSR_consensus_biallelic.20101123.indels.sites.vcf", - "1000G_biallelic.indels", b37, true, false)) + addResource(new Resource("/humgen/1kg/DCC/ftp/technical/working/20120312_phase1_v2_indel_cleaned_sites_list/ALL.wgs.phase1_release_v2.20101123.official_indel_calls.20120312.sites.vcf", + "1000G_phase1.indels", b37, true, false)) addResource(new Resource("/humgen/gsa-hpprojects/GATK/data/Comparisons/Unvalidated/GoldStandardIndel/gold.standard.indel.MillsAnd1000G.b37.vcf", "Mills_and_1000G_gold_standard.indels", b37, true, true)) @@ -242,6 +242,7 @@ class GATKResourcesBundle extends QScript { def createDownloadsFromBundle(in: File, out: File) { Console.printf("Visiting %s%n", in) + // todo -- ignore some of the other files too (e.g. *.out); will test next time we make a bundle if (! 
in.getName.startsWith(".")) { if ( in.isDirectory ) { out.mkdirs @@ -320,7 +321,7 @@ class GATKResourcesBundle extends QScript { } class LiftOverPerl(@Input val in: File, @Output val out: File, @Input val chain: File, oldRef: Reference, newRef: Reference) extends CommandLineFunction { - this.memoryLimit = 8 + this.memoryLimit = 12 def commandLine = ("%s -vcf %s -chain %s -out %s " + "-gatk ./ -newRef %s -oldRef %s -tmp %s").format(liftOverPerl, in.getAbsolutePath, chain, out.getAbsolutePath, newRef.file.replace(".fasta", ""), diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleReadFilter.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleReadFilter.scala new file mode 100644 index 000000000..89f2f55fb --- /dev/null +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleReadFilter.scala @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.queue.qscripts.examples + +import org.broadinstitute.sting.queue.QScript +import org.broadinstitute.sting.queue.extensions.gatk._ + +/** + * Script used for testing output to /dev/null + */ +class ExampleReadFilter extends QScript { + @Input(doc="The reference file for the bam files.", shortName="R") + var referenceFile: File = _ + + @Input(doc="Bam file to genotype.", shortName="I") + var bamFile: File = _ + + def script() { + val genotyper = new UnifiedGenotyper with BadMate + genotyper.reference_sequence = referenceFile + genotyper.memoryLimit = 2 + genotyper.input_file :+= bamFile + add(genotyper) + } +} diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervals.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervals.scala index 085e0b008..2f604a809 100755 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervals.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervals.scala @@ -49,7 +49,6 @@ case class GATKIntervals(reference: File, intervals: Seq[String]) { else IntervalUtils.parseIntervalArguments(parser, intervals) Collections.sort(parsedLocs) - Collections.unmodifiableList(parsedLocs) val mergedLocs = IntervalUtils.mergeIntervalLocations(parsedLocs, IntervalMergingRule.OVERLAPPING_ONLY) Collections.unmodifiableList(mergedLocs) } diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/VcfGatherFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/VcfGatherFunction.scala index 70046c913..8ac711f25 100644 --- 
a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/VcfGatherFunction.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/VcfGatherFunction.scala @@ -32,6 +32,8 @@ import org.broadinstitute.sting.gatk.io.stubs.VCFWriterArgumentTypeDescriptor * Merges a vcf text file. */ class VcfGatherFunction extends CombineVariants with GatherFunction { + this.assumeIdenticalSamples = true + this.suppressCommandLineHeader = true private lazy val originalGATK = this.originalFunction.asInstanceOf[CommandLineGATK] @@ -43,7 +45,6 @@ class VcfGatherFunction extends CombineVariants with GatherFunction { this.variant = this.gatherParts.zipWithIndex map { case (input, index) => new TaggedFile(input, "input"+index) } this.out = this.originalOutput - this.assumeIdenticalSamples = true // NO_HEADER and sites_only from VCFWriterArgumentTypeDescriptor // are added by the GATKExtensionsGenerator to the subclass of CommandLineGATK diff --git a/public/scala/test/org/broadinstitute/sting/queue/pipeline/PipelineTest.scala b/public/scala/test/org/broadinstitute/sting/queue/pipeline/PipelineTest.scala index f0feb207b..9d51b01a0 100644 --- a/public/scala/test/org/broadinstitute/sting/queue/pipeline/PipelineTest.scala +++ b/public/scala/test/org/broadinstitute/sting/queue/pipeline/PipelineTest.scala @@ -136,7 +136,7 @@ object PipelineTest extends BaseTest with Logging { println(" value (min,target,max) table key metric") for (validation <- evalSpec.validations) { val table = report.getTable(validation.table) - val key = table.getPrimaryKey(validation.key) + val key = table.getPrimaryKeyByData(validation.table +: validation.key.split('.') : _*) val value = String.valueOf(table.get(key, validation.metric)) val inRange = if (value == null) false else validation.inRange(value) val flag = if (!inRange) "*" else " " diff --git a/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleReadFilterPipelineTest.scala 
b/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleReadFilterPipelineTest.scala new file mode 100644 index 000000000..7e5e9a93e --- /dev/null +++ b/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleReadFilterPipelineTest.scala @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.queue.pipeline.examples + +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +import org.testng.annotations.Test +import org.broadinstitute.sting.queue.pipeline.{PipelineTest, PipelineTestSpec} +import org.broadinstitute.sting.BaseTest + +class ExampleReadFilterPipelineTest { + @Test + def testExampleReadFilter() { + val spec = new PipelineTestSpec + spec.name = "examplereadfilter" + spec.args = Array( + " -S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleReadFilter.scala", + " -R " + BaseTest.testDir + "exampleFASTA.fasta", + " -I " + BaseTest.testDir + "exampleBAM.bam").mkString + PipelineTest.executeTest(spec) + } +} diff --git a/public/testdata/ac0.vcf b/public/testdata/ac0.vcf new file mode 100644 index 000000000..0f50d7a72 --- /dev/null +++ b/public/testdata/ac0.vcf @@ -0,0 +1,116 @@ +##fileformat=VCFv4.1 +##ALT= +##FORMAT= +##FORMAT= +##FORMAT= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##SelectVariants="analysis_type=SelectVariants input_file=[] read_buffer_size=null phone_home=STANDARD gatk_key=null read_filter=[] intervals=[20:81006] excludeIntervals=null interval_set_rule=UNION interval_merging=ALL reference_sequence=/humgen/gsa-hpprojects/GATK/bundle/current/b37/human_g1k_v37.fasta rodBind=[] nonDeterministicRandomSeed=false downsampling_type=BY_SAMPLE downsample_to_fraction=null downsample_to_coverage=1000 baq=OFF baqGapOpenPenalty=40.0 
performanceLog=null useOriginalQualities=false BQSR=null defaultBaseQualities=-1 validation_strictness=SILENT unsafe=null num_threads=1 num_cpu_threads=null num_io_threads=null num_bam_file_handles=null read_group_black_list=null pedigree=[] pedigreeString=[] pedigreeValidationType=STRICT allow_intervals_with_unindexed_bam=false logging_level=INFO log_to_file=null help=false variant=(RodBinding name=variant source=/humgen/1kg/releases/main_project_phaseI/ALL.chr20.phase1_release_v3.20101123.snps_indels_svs.genotypes.vcf.gz) discordance=(RodBinding name= source=UNBOUND) concordance=(RodBinding name= source=UNBOUND) out=org.broadinstitute.sting.gatk.io.stubs.VCFWriterStub NO_HEADER=org.broadinstitute.sting.gatk.io.stubs.VCFWriterStub sites_only=org.broadinstitute.sting.gatk.io.stubs.VCFWriterStub sample_name=[] sample_expressions=null sample_file=null exclude_sample_name=[] exclude_sample_file=[] select_expressions=[] excludeNonVariants=false excludeFiltered=false restrictAllelesTo=ALL keepOriginalAC=false mendelianViolation=false mendelianViolationQualThreshold=0.0 select_random_number=0 select_random_fraction=0.0 remove_fraction_genotypes=0.0 selectTypeToInclude=[] keepIDs=null outMVFile=null filter_mismatching_base_and_quals=false" +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= 
+##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##reference=file:///humgen/gsa-hpprojects/GATK/bundle/current/b37/human_g1k_v37.fasta +##source=SelectVariants +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT HG00096 HG00097 HG00099 HG00100 HG00101 HG00102 HG00103 HG00104 HG00106 HG00108 HG00109 HG00110 HG00111 HG00112 HG00113 HG00114 HG00116 HG00117 HG00118 HG00119 HG00120 HG00121 HG00122 HG00123 HG00124 HG00125 HG00126 HG00127 HG00128 HG00129 HG00130 HG00131 HG00133 HG00134 HG00135 HG00136 HG00137 HG00138 HG00139 HG00140 HG00141 HG00142 HG00143 HG00146 HG00148 HG00149 HG00150 HG00151 HG00152 HG00154 HG00155 HG00156 HG00158 HG00159 HG00160 HG00171 HG00173 HG00174 HG00176 HG00177 HG00178 HG00179 HG00180 HG00182 HG00183 HG00185 HG00186 HG00187 HG00188 HG00189 HG00190 HG00231 HG00232 HG00233 HG00234 HG00235 HG00236 HG00237 HG00238 HG00239 HG00240 HG00242 HG00243 HG00244 HG00245 HG00246 HG00247 HG00249 HG00250 HG00251 HG00252 HG00253 HG00254 HG00255 HG00256 HG00257 HG00258 HG00259 HG00260 HG00261 HG00262 HG00263 HG00264 HG00265 HG00266 HG00267 HG00268 HG00269 HG00270 HG00271 HG00272 HG00273 HG00274 HG00275 HG00276 HG00277 HG00278 HG00280 HG00281 HG00282 HG00284 HG00285 HG00306 HG00309 HG00310 HG00311 HG00312 HG00313 HG00315 HG00318 HG00319 HG00320 HG00321 HG00323 HG00324 HG00325 HG00326 HG00327 HG00328 HG00329 HG00330 HG00331 HG00332 HG00334 HG00335 HG00336 HG00337 HG00338 HG00339 HG00341 HG00342 HG00343 HG00344 HG00345 HG00346 HG00349 HG00350 HG00351 HG00353 HG00355 HG00356 HG00357 HG00358 HG00359 HG00360 HG00361 HG00362 HG00364 HG00366 HG00367 HG00369 HG00372 HG00373 HG00375 HG00376 HG00377 HG00378 HG00381 HG00382 HG00383 HG00384 HG00403 HG00404 HG00406 HG00407 HG00418 HG00419 HG00421 HG00422 HG00427 HG00428 HG00436 HG00437 HG00442 HG00443 HG00445 HG00446 HG00448 HG00449 HG00451 HG00452 HG00457 HG00458 HG00463 HG00464 HG00472 
HG00473 HG00475 HG00476 HG00478 HG00479 HG00500 HG00501 HG00512 HG00513 HG00524 HG00525 HG00530 HG00531 HG00533 HG00534 HG00536 HG00537 HG00542 HG00543 HG00553 HG00554 HG00556 HG00557 HG00559 HG00560 HG00565 HG00566 HG00577 HG00578 HG00580 HG00581 HG00583 HG00584 HG00589 HG00590 HG00592 HG00593 HG00595 HG00596 HG00607 HG00608 HG00610 HG00611 HG00613 HG00614 HG00619 HG00620 HG00625 HG00626 HG00628 HG00629 HG00634 HG00635 HG00637 HG00638 HG00640 HG00641 HG00650 HG00651 HG00653 HG00654 HG00656 HG00657 HG00662 HG00663 HG00671 HG00672 HG00683 HG00684 HG00689 HG00690 HG00692 HG00693 HG00698 HG00699 HG00701 HG00702 HG00704 HG00705 HG00707 HG00708 HG00731 HG00732 HG00734 HG00736 HG00737 HG00740 HG01047 HG01048 HG01051 HG01052 HG01055 HG01060 HG01061 HG01066 HG01067 HG01069 HG01070 HG01072 HG01073 HG01075 HG01079 HG01080 HG01082 HG01083 HG01085 HG01095 HG01097 HG01098 HG01101 HG01102 HG01104 HG01105 HG01107 HG01108 HG01112 HG01113 HG01124 HG01125 HG01133 HG01134 HG01136 HG01137 HG01140 HG01148 HG01149 HG01167 HG01168 HG01170 HG01171 HG01173 HG01174 HG01176 HG01183 HG01187 HG01188 HG01190 HG01191 HG01197 HG01198 HG01204 HG01250 HG01251 HG01257 HG01259 HG01271 HG01272 HG01274 HG01275 HG01277 HG01278 HG01334 HG01342 HG01344 HG01345 HG01350 HG01351 HG01353 HG01354 HG01356 HG01357 HG01359 HG01360 HG01365 HG01366 HG01374 HG01375 HG01377 HG01378 HG01383 HG01384 HG01389 HG01390 HG01437 HG01440 HG01441 HG01455 HG01456 HG01461 HG01462 HG01465 HG01488 HG01489 HG01491 HG01492 HG01494 HG01495 HG01497 HG01498 HG01515 HG01516 HG01518 HG01519 HG01521 HG01522 HG01550 HG01551 HG01617 HG01618 HG01619 HG01620 HG01623 HG01624 HG01625 HG01626 NA06984 NA06986 NA06989 NA06994 NA07000 NA07037 NA07048 NA07051 NA07056 NA07347 NA07357 NA10847 NA10851 NA11829 NA11830 NA11831 NA11843 NA11892 NA11893 NA11894 NA11919 NA11920 NA11930 NA11931 NA11932 NA11933 NA11992 NA11993 NA11994 NA11995 NA12003 NA12004 NA12006 NA12043 NA12044 NA12045 NA12046 NA12058 NA12144 NA12154 NA12155 NA12249 NA12272 NA12273 NA12275 
NA12282 NA12283 NA12286 NA12287 NA12340 NA12341 NA12342 NA12347 NA12348 NA12383 NA12399 NA12400 NA12413 NA12489 NA12546 NA12716 NA12717 NA12718 NA12748 NA12749 NA12750 NA12751 NA12761 NA12763 NA12775 NA12777 NA12778 NA12812 NA12814 NA12815 NA12827 NA12829 NA12830 NA12842 NA12843 NA12872 NA12873 NA12874 NA12889 NA12890 NA18486 NA18487 NA18489 NA18498 NA18499 NA18501 NA18502 NA18504 NA18505 NA18507 NA18508 NA18510 NA18511 NA18516 NA18517 NA18519 NA18520 NA18522 NA18523 NA18525 NA18526 NA18527 NA18528 NA18530 NA18532 NA18534 NA18535 NA18536 NA18537 NA18538 NA18539 NA18541 NA18542 NA18543 NA18544 NA18545 NA18546 NA18547 NA18548 NA18549 NA18550 NA18552 NA18553 NA18555 NA18557 NA18558 NA18559 NA18560 NA18561 NA18562 NA18563 NA18564 NA18565 NA18566 NA18567 NA18570 NA18571 NA18572 NA18573 NA18574 NA18576 NA18577 NA18579 NA18582 NA18592 NA18593 NA18595 NA18596 NA18597 NA18599 NA18602 NA18603 NA18605 NA18606 NA18608 NA18609 NA18610 NA18611 NA18612 NA18613 NA18614 NA18615 NA18616 NA18617 NA18618 NA18619 NA18620 NA18621 NA18622 NA18623 NA18624 NA18626 NA18627 NA18628 NA18630 NA18631 NA18632 NA18633 NA18634 NA18635 NA18636 NA18637 NA18638 NA18639 NA18640 NA18641 NA18642 NA18643 NA18645 NA18647 NA18740 NA18745 NA18747 NA18748 NA18749 NA18757 NA18853 NA18856 NA18858 NA18861 NA18867 NA18868 NA18870 NA18871 NA18873 NA18874 NA18907 NA18908 NA18909 NA18910 NA18912 NA18916 NA18917 NA18923 NA18924 NA18933 NA18934 NA18939 NA18940 NA18941 NA18942 NA18943 NA18944 NA18945 NA18946 NA18947 NA18948 NA18949 NA18950 NA18951 NA18952 NA18953 NA18954 NA18956 NA18957 NA18959 NA18960 NA18961 NA18962 NA18963 NA18964 NA18965 NA18966 NA18968 NA18971 NA18973 NA18974 NA18975 NA18976 NA18977 NA18978 NA18980 NA18981 NA18982 NA18983 NA18984 NA18985 NA18986 NA18987 NA18988 NA18989 NA18990 NA18992 NA18994 NA18995 NA18998 NA18999 NA19000 NA19002 NA19003 NA19004 NA19005 NA19007 NA19009 NA19010 NA19012 NA19020 NA19028 NA19035 NA19036 NA19038 NA19041 NA19044 NA19046 NA19054 NA19055 NA19056 NA19057 NA19058 NA19059 
NA19060 NA19062 NA19063 NA19064 NA19065 NA19066 NA19067 NA19068 NA19070 NA19072 NA19074 NA19075 NA19076 NA19077 NA19078 NA19079 NA19080 NA19081 NA19082 NA19083 NA19084 NA19085 NA19087 NA19088 NA19093 NA19095 NA19096 NA19098 NA19099 NA19102 NA19107 NA19108 NA19113 NA19114 NA19116 NA19117 NA19118 NA19119 NA19121 NA19129 NA19130 NA19131 NA19137 NA19138 NA19146 NA19147 NA19149 NA19150 NA19152 NA19160 NA19171 NA19172 NA19175 NA19185 NA19189 NA19190 NA19197 NA19198 NA19200 NA19204 NA19207 NA19209 NA19213 NA19222 NA19223 NA19225 NA19235 NA19236 NA19247 NA19248 NA19256 NA19257 NA19307 NA19308 NA19309 NA19310 NA19311 NA19312 NA19313 NA19315 NA19316 NA19317 NA19318 NA19319 NA19321 NA19324 NA19327 NA19328 NA19331 NA19332 NA19334 NA19338 NA19346 NA19347 NA19350 NA19351 NA19352 NA19355 NA19359 NA19360 NA19371 NA19372 NA19373 NA19374 NA19375 NA19376 NA19377 NA19379 NA19380 NA19381 NA19382 NA19383 NA19384 NA19385 NA19390 NA19391 NA19393 NA19394 NA19395 NA19396 NA19397 NA19398 NA19399 NA19401 NA19403 NA19404 NA19428 NA19429 NA19430 NA19431 NA19434 NA19435 NA19436 NA19437 NA19438 NA19439 NA19440 NA19443 NA19444 NA19445 NA19446 NA19448 NA19449 NA19451 NA19452 NA19453 NA19455 NA19456 NA19457 NA19461 NA19462 NA19463 NA19466 NA19467 NA19468 NA19469 NA19470 NA19471 NA19472 NA19473 NA19474 NA19625 NA19648 NA19651 NA19652 NA19654 NA19655 NA19657 NA19660 NA19661 NA19663 NA19664 NA19672 NA19675 NA19676 NA19678 NA19679 NA19681 NA19682 NA19684 NA19685 NA19700 NA19701 NA19703 NA19704 NA19707 NA19711 NA19712 NA19713 NA19716 NA19717 NA19719 NA19720 NA19722 NA19723 NA19725 NA19726 NA19728 NA19729 NA19731 NA19732 NA19734 NA19735 NA19737 NA19738 NA19740 NA19741 NA19746 NA19747 NA19749 NA19750 NA19752 NA19753 NA19755 NA19756 NA19758 NA19759 NA19761 NA19762 NA19764 NA19770 NA19771 NA19773 NA19774 NA19776 NA19777 NA19779 NA19780 NA19782 NA19783 NA19785 NA19786 NA19788 NA19789 NA19794 NA19795 NA19818 NA19819 NA19834 NA19835 NA19900 NA19901 NA19904 NA19908 NA19909 NA19914 NA19916 NA19917 NA19920 NA19921 
NA19922 NA19923 NA19982 NA19984 NA19985 NA20126 NA20127 NA20276 NA20278 NA20281 NA20282 NA20287 NA20289 NA20291 NA20294 NA20296 NA20298 NA20299 NA20314 NA20317 NA20322 NA20332 NA20334 NA20336 NA20339 NA20340 NA20341 NA20342 NA20344 NA20346 NA20348 NA20351 NA20356 NA20357 NA20359 NA20363 NA20412 NA20414 NA20502 NA20503 NA20504 NA20505 NA20506 NA20507 NA20508 NA20509 NA20510 NA20512 NA20513 NA20515 NA20516 NA20517 NA20518 NA20519 NA20520 NA20521 NA20522 NA20524 NA20525 NA20527 NA20528 NA20529 NA20530 NA20531 NA20532 NA20533 NA20534 NA20535 NA20536 NA20537 NA20538 NA20539 NA20540 NA20541 NA20542 NA20543 NA20544 NA20581 NA20582 NA20585 NA20586 NA20588 NA20589 NA20752 NA20753 NA20754 NA20755 NA20756 NA20757 NA20758 NA20759 NA20760 NA20761 NA20765 NA20766 NA20768 NA20769 NA20770 NA20771 NA20772 NA20773 NA20774 NA20775 NA20778 NA20783 NA20785 NA20786 NA20787 NA20790 NA20792 NA20795 NA20796 NA20797 NA20798 NA20799 NA20800 NA20801 NA20802 NA20803 NA20804 NA20805 NA20806 NA20807 NA20808 NA20809 NA20810 NA20811 NA20812 NA20813 NA20814 NA20815 NA20816 NA20818 NA20819 NA20826 NA20828 +20 81006 rs140766395 T C 100 PASS AA=T;AC=0;AF=0.00000;AN=2184;AVGPOST=0.9995;DP=0;ERATE=0.0003;LDAF=0.0002;RSQ=0.0796;SNPSOURCE=LOWCOV;THETA=0.0005;VT=SNP GT:DS:GL 0|0:0.000:-0.03,-1.22,-5.00 0|0:0.000:-0.00,-3.66,-5.00 0|0:0.000:-0.00,-2.38,-5.00 0|0:0.000:-0.01,-1.73,-5.00 0|0:0.000:-0.04,-1.09,-5.00 0|0:0.000:-0.04,-1.06,-5.00 0|0:0.000:-0.10,-0.69,-4.70 0|0:0.000:-0.01,-1.86,-5.00 0|0:0.000:-0.02,-1.43,-5.00 0|0:0.000:-0.01,-1.76,-5.00 0|0:0.000:-0.00,-3.04,-5.00 0|0:0.000:-0.00,-2.24,-5.00 0|0:0.000:-0.01,-1.50,-5.00 0|0:0.000:-0.00,-2.15,-5.00 0|0:0.000:-0.00,-3.52,-5.00 0|0:0.000:-0.48,-0.48,-0.48 0|0:0.000:-0.00,-1.99,-5.00 0|0:0.000:-0.06,-0.92,-5.00 0|0:0.000:-0.00,-2.40,-5.00 0|0:0.000:-0.01,-1.70,-5.00 0|0:0.000:-0.00,-2.83,-5.00 0|0:0.000:-0.19,-0.47,-2.17 0|0:0.000:-0.01,-1.72,-5.00 0|0:0.000:-0.00,-3.08,-5.00 0|0:0.000:-0.03,-1.20,-5.00 0|0:0.000:-0.01,-1.52,-5.00 
0|0:0.000:-0.02,-1.26,-5.00 0|0:0.000:-0.05,-0.94,-5.00 0|0:0.000:0.00,-5.00,-5.00 0|0:0.000:-0.00,-3.15,-5.00 0|0:0.000:-0.00,-2.36,-5.00 0|0:0.000:-0.02,-1.46,-5.00 0|0:0.000:-0.01,-1.71,-5.00 0|0:0.000:-0.04,-1.11,-5.00 0|0:0.000:-0.00,-3.03,-5.00 0|0:0.000:-0.02,-1.27,-5.00 0|0:0.000:-0.05,-0.96,-5.00 0|0:0.000:-0.10,-0.69,-4.22 0|0:0.000:-0.02,-1.38,-5.00 0|0:0.000:-0.10,-0.70,-3.92 0|0:0.000:-0.03,-1.11,-5.00 0|0:0.000:-0.18,-0.48,-2.44 0|0:0.000:-0.06,-0.91,-5.00 0|0:0.000:-0.101264,-0.682104,-4.22185 0|0:0.000:-0.00,-2.06,-5.00 0|0:0.000:-0.02,-1.27,-5.00 0|0:0.000:-0.06,-0.90,-5.00 0|0:0.000:-0.09,-0.73,-4.70 0|0:0.000:-0.05,-0.96,-5.00 0|0:0.000:-0.00,-1.97,-5.00 0|0:0.000:-0.477139,-0.477113,-0.477113 0|0:0.000:-0.00,-2.65,-5.00 0|0:0.000:-0.06,-0.90,-5.00 0|0:0.000:-0.00,-3.27,-5.00 0|0:0.000:-0.03,-1.18,-5.00 0|0:0.000:-0.48,-0.48,-0.48 0|0:0.000:-0.02,-1.26,-5.00 0|0:0.000:-0.06,-0.92,-5.00 0|0:0.000:-0.17,-0.49,-2.16 0|0:0.000:-0.00,-2.71,-5.00 0|0:0.000:-0.05,-0.97,-5.00 0|0:0.000:-0.05,-0.97,-5.00 0|0:0.000:-0.18,-0.48,-2.57 0|0:0.000:-0.05,-0.93,-5.00 0|0:0.000:-0.18,-0.48,-2.41 0|0:0.000:-0.18,-0.48,-2.17 0|0:0.000:-0.05,-0.94,-5.00 0|0:0.000:-0.05,-0.93,-5.00 0|0:0.000:-0.18,-0.48,-1.98 0|0:0.000:-0.03,-1.22,-5.00 0|0:0.000:-0.02,-1.46,-5.00 0|0:0.000:-0.00,-2.63,-5.00 0|0:0.000:-0.05,-0.95,-5.00 0|0:0.000:-0.01,-1.77,-5.00 0|0:0.000:-0.00,-3.74,-5.00 0|0:0.000:-0.00,-4.00,-5.00 0|0:0.000:-0.02,-1.26,-5.00 0|0:0.000:-0.00,-2.10,-5.00 0|0:0.000:-0.01,-1.62,-5.00 0|0:0.000:-0.06,-0.92,-5.00 0|0:0.000:-0.00,-2.35,-5.00 0|0:0.000:-0.03,-1.19,-5.00 0|0:0.000:-0.06,-0.92,-5.00 0|0:0.000:-0.03,-1.18,-5.00 0|0:0.000:-0.0152657,-1.46168,-5 0|0:0.000:-0.10,-0.69,-3.92 0|0:0.000:-0.05,-0.95,-5.00 0|0:0.000:-0.15,-0.54,-2.85 0|0:0.000:-0.01,-1.72,-5.00 0|0:0.000:-0.10,-0.69,-4.00 0|0:0.000:-0.01,-1.48,-5.00 0|0:0.000:-0.01,-1.76,-5.00 0|0:0.000:-0.00,-2.03,-5.00 0|0:0.000:-0.00,-4.40,-5.00 0|0:0.000:-0.00,-2.34,-5.00 0|0:0.000:-0.00,-2.28,-5.00 
0|0:0.000:-0.00,-2.25,-5.00 0|0:0.000:-0.12,-0.61,-2.91 0|0:0.000:-0.02,-1.46,-5.00 0|0:0.000:-0.00,-2.34,-5.00 0|0:0.000:-0.00,-2.31,-5.00 0|0:0.000:-0.06,-0.91,-4.70 0|0:0.000:-0.03,-1.16,-5.00 0|0:0.000:-0.03,-1.17,-5.00 0|0:0.000:-0.00,-2.44,-5.00 0|0:0.000:-0.01,-1.92,-5.00 0|0:0.000:-0.19,-0.46,-2.48 0|0:0.000:-0.00,-2.49,-5.00 0|0:0.000:-0.01,-1.85,-5.00 0|0:0.000:-0.01,-1.75,-5.00 0|0:0.000:-0.48,-0.48,-0.48 0|0:0.000:-0.03,-1.17,-5.00 0|0:0.000:-0.00,-3.85,-5.00 0|0:0.000:-0.09,-0.74,-4.70 0|0:0.000:-0.16,-0.51,-2.77 0|0:0.000:-0.03,-1.22,-5.00 0|0:0.000:-0.02,-1.36,-5.00 0|0:0.000:-0.01,-1.49,-5.00 0|0:0.000:-0.02,-1.46,-5.00 0|0:0.000:-0.01,-1.49,-5.00 0|0:0.000:-0.05,-0.93,-5.00 0|0:0.000:-0.00,-2.09,-5.00 0|0:0.000:-0.10,-0.67,-4.22 0|0:0.000:-0.00,-2.94,-5.00 0|0:0.000:-0.05,-0.94,-5.00 0|0:0.000:-0.05,-0.93,-5.00 0|0:0.000:-0.03,-1.23,-5.00 0|0:0.000:-0.05,-0.94,-5.00 0|0:0.000:-0.03,-1.22,-5.00 0|0:0.000:-0.00,-2.38,-5.00 0|0:0.000:-0.05,-0.98,-5.00 0|0:0.000:-0.03,-1.22,-5.00 0|0:0.000:-0.06,-0.92,-4.70 0|0:0.000:-0.00,-2.09,-5.00 0|0:0.000:-0.01,-1.74,-5.00 0|0:0.000:-0.03,-1.19,-5.00 0|0:0.000:-0.03,-1.21,-5.00 0|0:0.000:-0.00,-2.39,-5.00 0|0:0.000:-0.01,-1.77,-5.00 0|0:0.000:-0.01,-1.49,-5.00 0|0:0.000:-0.00,-2.07,-5.00 0|0:0.000:-0.05,-0.93,-5.00 0|0:0.000:-0.11,-0.66,-4.10 0|0:0.000:-0.03,-1.21,-5.00 0|0:0.000:-0.01,-1.85,-5.00 0|0:0.000:-0.01,-1.80,-5.00 0|0:0.000:-0.10,-0.67,-3.62 0|0:0.000:-0.10,-0.68,-3.74 0|0:0.000:-0.00,-2.09,-5.00 0|0:0.000:-0.03,-1.15,-5.00 0|0:0.000:-0.00,-3.36,-5.00 0|0:0.000:-0.01,-1.49,-5.00 0|0:0.000:-0.10,-0.68,-3.80 0|0:0.000:-0.02,-1.28,-5.00 0|0:0.000:-0.01,-1.50,-5.00 0|0:0.000:-0.01,-1.87,-5.00 0|0:0.000:-0.02,-1.32,-5.00 0|0:0.000:-0.00,-2.60,-5.00 0|0:0.000:-0.01,-1.52,-5.00 0|0:0.000:-0.00,-2.68,-5.00 0|0:0.000:-0.33,-0.27,-5.00 0|0:0.000:-0.10,-0.69,-4.10 0|0:0.000:-0.00,-3.66,-5.00 0|0:0.000:-0.00,-2.12,-5.00 0|0:0.000:-0.00,-2.63,-5.00 0|0:0.000:-0.03,-1.14,-5.00 0|0:0.000:-0.00028673,-3.18046,-5 
0|0:0.000:-0.00,-3.70,-5.00 0|0:0.000:-0.18,-0.48,-2.18 0|0:0.000:-0.01,-1.54,-5.00 0|0:0.000:-0.10,-0.68,-4.70 0|0:0.000:-0.10,-0.71,-4.10 0|0:0.000:-0.48,-0.48,-0.48 0|0:0.000:-0.05,-0.95,-5.00 0|0:0.000:-0.00,-1.95,-5.00 0|0:0.000:-0.18,-0.48,-1.85 0|0:0.000:-0.48,-0.48,-0.48 0|0:0.000:-0.01,-1.68,-5.00 0|0:0.000:-0.00,-1.94,-5.00 0|0:0.000:-0.02,-1.41,-5.00 0|0:0.000:-0.00,-3.28,-5.00 0|0:0.050:-0.17,-0.49,-2.07 0|0:0.000:-0.01,-1.50,-5.00 0|0:0.000:-0.06,-0.87,-5.00 0|0:0.000:-0.02,-1.35,-5.00 0|0:0.000:-0.01,-1.49,-5.00 0|0:0.000:-0.00,-2.09,-5.00 0|0:0.000:-0.10,-0.69,-4.00 0|0:0.000:-0.01,-1.79,-5.00 0|0:0.000:-0.00,-2.02,-5.00 0|0:0.000:-0.00,-3.21,-5.00 0|0:0.000:-0.00,-2.07,-5.00 0|0:0.000:-0.18,-0.46,-2.72 0|0:0.000:-0.10,-0.69,-4.40 0|0:0.000:-0.01,-1.50,-5.00 0|0:0.000:-0.04,-1.09,-5.00 0|0:0.000:-0.04,-1.09,-5.00 0|0:0.000:-0.05,-0.93,-5.00 0|0:0.000:-0.03,-1.22,-5.00 0|0:0.000:-0.11,-0.65,-3.92 0|0:0.000:-0.19,-0.45,-2.14 0|0:0.000:-0.11,-0.64,-4.00 0|0:0.000:-0.19,-0.47,-2.29 0|0:0.000:-0.03,-1.19,-5.00 0|0:0.000:-0.03,-1.21,-5.00 0|0:0.000:-0.03,-1.17,-5.00 0|0:0.000:-0.00,-2.32,-5.00 0|0:0.000:-0.10,-0.69,-3.70 0|0:0.000:-0.01,-1.47,-5.00 0|0:0.000:-0.03,-1.18,-5.00 0|0:0.000:-0.01,-1.80,-5.00 0|0:0.000:-0.03,-1.22,-5.00 0|0:0.000:-0.01,-1.50,-5.00 0|0:0.000:-0.01,-1.51,-5.00 0|0:0.000:-0.03,-1.20,-5.00 0|0:0.000:-0.03,-1.21,-5.00 0|0:0.000:-0.03,-1.19,-5.00 0|0:0.000:-0.00,-2.07,-5.00 0|0:0.000:-0.05,-0.96,-5.00 0|0:0.000:-0.03,-1.17,-5.00 0|0:0.000:-0.05,-0.94,-5.00 0|0:0.000:-0.01,-1.48,-5.00 0|0:0.000:-0.01,-1.47,-5.00 0|0:0.000:-0.03,-1.25,-5.00 0|0:0.000:-0.10,-0.70,-4.22 0|0:0.000:-0.18,-0.48,-2.11 0|0:0.000:-0.01,-1.47,-5.00 0|0:0.000:-0.03,-1.20,-5.00 0|0:0.000:-0.03,-1.15,-5.00 0|0:0.150:-5.00,-2.47,-0.00 0|0:0.000:-0.01,-1.56,-5.00 0|0:0.000:-0.00,-2.18,-5.00 0|0:0.000:-0.00,-1.95,-5.00 0|0:0.000:-0.11,-0.66,-3.62 0|0:0.000:-0.00,-2.63,-5.00 0|0:0.000:-0.03,-1.22,-5.00 0|0:0.000:-0.02,-1.45,-5.00 0|0:0.000:-0.00,-3.25,-5.00 
0|0:0.000:-0.48,-0.48,-0.48 0|0:0.000:-0.10,-0.70,-4.70 0|0:0.000:-0.01,-1.53,-5.00 0|0:0.000:-0.06,-0.92,-5.00 0|0:0.000:-0.02,-1.31,-5.00 0|0:0.000:-0.05,-0.96,-5.00 0|0:0.000:-0.01,-1.83,-5.00 0|0:0.000:-0.00,-2.10,-5.00 0|0:0.000:-0.18,-0.48,-2.43 0|0:0.000:-0.00,-2.42,-5.00 0|0:0.000:-0.02,-1.26,-5.00 0|0:0.000:-0.10,-0.69,-4.70 0|0:0.000:-0.05,-0.95,-5.00 0|0:0.050:-0.18,-0.48,-2.51 0|0:0.000:-0.01,-1.52,-5.00 0|0:0.000:-0.03,-1.23,-5.00 0|0:0.000:-0.01,-1.83,-5.00 0|0:0.000:-0.02,-1.26,-5.00 0|0:0.000:-0.01,-1.51,-5.00 0|0:0.000:-0.02,-1.26,-5.00 0|0:0.000:-0.01,-1.84,-5.00 0|0:0.000:-0.05,-0.95,-5.00 0|0:0.000:-0.00,-2.62,-5.00 0|0:0.000:-0.00,-2.05,-5.00 0|0:0.000:-0.03,-1.22,-5.00 0|0:0.000:-0.03,-1.22,-5.00 0|0:0.000:-0.10,-0.70,-4.70 0|0:0.000:-0.01,-1.50,-5.00 0|0:0.000:-0.10,-0.68,-4.70 0|0:0.000:-0.00,-2.37,-5.00 0|0:0.000:-0.00,-2.07,-5.00 0|0:0.000:-0.01,-1.56,-5.00 0|0:0.000:-0.03,-1.22,-5.00 0|0:0.000:-0.00,-2.61,-5.00 0|0:0.000:-0.01,-1.49,-5.00 0|0:0.000:-0.05,-0.95,-5.00 0|0:0.000:-0.03,-1.22,-5.00 0|0:0.000:-0.01,-1.50,-5.00 0|0:0.000:-0.01,-1.50,-5.00 0|0:0.000:-0.05,-0.95,-5.00 0|0:0.000:-0.01,-1.51,-5.00 0|0:0.000:-0.01,-1.78,-5.00 0|0:0.000:-0.05,-0.95,-5.00 0|0:0.000:-0.01,-1.48,-5.00 0|0:0.000:-0.00,-2.05,-5.00 0|0:0.000:-0.01,-1.77,-5.00 0|0:0.000:-0.01,-1.80,-5.00 0|0:0.000:-0.00,-2.08,-5.00 0|0:0.000:-0.02,-1.47,-5.00 0|0:0.000:-0.03,-1.20,-5.00 0|0:0.000:-0.01,-1.67,-5.00 0|0:0.000:-0.00,-2.02,-5.00 0|0:0.000:-0.01,-1.77,-5.00 0|0:0.000:-0.00,-2.86,-5.00 0|0:0.000:-0.19,-0.46,-1.93 0|0:0.000:-0.01,-1.67,-5.00 0|0:0.000:-0.00,-2.52,-5.00 0|0:0.000:-0.05,-0.93,-5.00 0|0:0.000:-0.05,-0.93,-5.00 0|0:0.000:-0.01,-1.48,-5.00 0|0:0.000:-0.03,-1.19,-5.00 0|0:0.000:-0.01,-1.77,-5.00 0|0:0.000:-0.06,-0.92,-5.00 0|0:0.000:-0.03,-1.21,-5.00 0|0:0.000:-0.00,-4.00,-5.00 0|0:0.000:-0.00,-2.91,-5.00 0|0:0.000:-0.01,-1.48,-5.00 0|0:0.000:-0.01,-1.76,-5.00 0|0:0.000:-0.05,-0.95,-5.00 0|0:0.000:-0.00,-2.00,-5.00 0|0:0.000:-0.01,-1.48,-5.00 
0|0:0.000:-0.01,-1.77,-5.00 0|0:0.000:-0.01,-1.50,-5.00 0|0:0.000:-0.00,-3.85,-5.00 0|0:0.000:-0.01,-1.71,-5.00 0|0:0.000:-0.05,-0.93,-5.00 0|0:0.000:-0.01,-1.49,-5.00 0|0:0.000:-2.6068e-05,-4.22185,-5 0|0:0.000:-0.00,-2.68,-5.00 0|0:0.000:-0.00,-2.74,-5.00 0|0:0.000:-0.03,-1.15,-5.00 0|0:0.000:-0.05,-0.93,-5.00 0|0:0.000:-0.00,-2.04,-5.00 0|0:0.000:-0.01,-1.75,-5.00 0|0:0.000:-0.00,-2.08,-5.00 0|0:0.000:-0.00,-2.86,-5.00 0|0:0.000:-0.00,-2.74,-5.00 0|0:0.000:-0.00,-4.40,-5.00 0|0:0.000:-0.10,-0.68,-4.70 0|0:0.000:-0.01,-1.92,-5.00 0|0:0.000:-0.00,-2.97,-5.00 0|0:0.000:-0.00,-4.22,-5.00 0|0:0.000:-0.05,-0.99,-5.00 0|0:0.000:-0.00,-2.07,-5.00 0|0:0.000:-0.024798,-1.25571,-5 0|0:0.000:-0.02,-1.46,-5.00 0|0:0.000:-0.10,-0.68,-4.10 0|0:0.000:-0.03,-1.20,-5.00 0|0:0.000:-0.01,-1.78,-5.00 0|0:0.000:-0.01,-1.76,-5.00 0|0:0.000:-0.00,-2.58,-5.00 0|0:0.000:-0.00800352,-1.7385,-5 0|0:0.000:-0.00,-2.87,-5.00 0|0:0.000:-0.00,-3.30,-5.00 0|0:0.000:-0.00,-2.88,-5.00 0|0:0.000:-0.00787083,-1.74569,-5 0|0:0.000:-0.0568618,-0.911085,-5 0|0:0.000:-0.0291698,-1.18735,-5 0|0:0.000:-0.000295433,-3.16749,-5 0|0:0.000:-0.00,-2.66,-5.00 0|0:0.000:-0.00,-2.37,-5.00 0|0:0.000:-0.00,-1.98,-5.00 0|0:0.000:-0.00,-2.21,-5.00 0|0:0.000:-0.00,-3.92,-5.00 0|0:0.000:-0.04,-1.08,-5.00 0|0:0.000:-0.00,-2.87,-5.00 0|0:0.000:-0.00,-3.44,-5.00 0|0:0.000:-0.00,-2.05,-5.00 0|0:0.000:-0.01,-1.81,-5.00 0|0:0.000:-0.03,-1.18,-5.00 0|0:0.000:-0.02,-1.41,-5.00 0|0:0.000:-0.00,-3.42,-5.00 0|0:0.000:-0.00,-4.22,-5.00 0|0:0.000:-0.00,-2.07,-5.00 0|0:0.000:-0.05,-0.94,-5.00 0|0:0.000:-0.03,-1.22,-5.00 0|0:0.000:-0.00,-1.95,-5.00 0|0:0.000:-0.00,-2.00,-5.00 0|0:0.000:-0.01,-1.77,-5.00 0|0:0.000:-0.01,-1.52,-5.00 0|0:0.000:-0.01,-1.78,-5.00 0|0:0.000:-0.03,-1.21,-5.00 0|0:0.000:-0.01,-1.81,-5.00 0|0:0.000:-0.03,-1.22,-5.00 0|0:0.000:-0.01,-1.77,-5.00 0|0:0.000:-0.00,-3.04,-5.00 0|0:0.000:-0.03,-1.25,-5.00 0|0:0.000:-0.00,-3.05,-5.00 0|0:0.000:-0.00,-2.43,-5.00 0|0:0.000:-0.10,-0.69,-4.70 0|0:0.000:-0.01,-1.79,-5.00 
0|0:0.000:-0.00,-2.24,-5.00 0|0:0.000:-0.01,-1.80,-5.00 0|0:0.000:-0.00,-3.08,-5.00 0|0:0.000:-0.05,-0.95,-5.00 0|0:0.000:-0.18,-0.48,-2.43 0|0:0.000:-0.03,-1.16,-5.00 0|0:0.000:-0.02,-1.42,-5.00 0|0:0.000:-0.00,-2.34,-5.00 0|0:0.000:-0.03,-1.18,-5.00 0|0:0.000:-0.11,-0.66,-4.40 0|0:0.000:-0.03,-1.15,-5.00 0|0:0.050:-0.05,-0.93,-5.00 0|0:0.000:-0.05,-0.94,-5.00 0|0:0.000:-0.00,-2.29,-5.00 0|0:0.000:-0.01,-1.79,-5.00 0|0:0.000:-0.01,-1.76,-5.00 0|0:0.000:-0.013282,-1.52115,-5 0|0:0.000:-0.0980515,-0.69452,-4.39794 0|0:0.000:-0.0154637,-1.45618,-5 0|0:0.000:-0.00693429,-1.80024,-5 0|0:0.000:-0.000443213,-2.9914,-5 0|0:0.000:-0.0151488,-1.46496,-5 0|0:0.000:-0.00,-2.07,-5.00 0|0:0.000:-0.00,-2.08,-5.00 0|0:0.000:-0.0058589,-1.8729,-5 0|0:0.000:-0.000312815,-3.14267,-5 0|0:0.000:-0.0100965,-1.63865,-5 0|0:0.000:-0.00330448,-2.12033,-5 0|0:0.000:-0.00561244,-1.89144,-5 0|0:0.000:-0.00639626,-1.83505,-5 0|0:0.000:-0.0113875,-1.58704,-5 0|0:0.000:-0.00412797,-2.02411,-5 0|0:0.000:-0.03,-1.18,-5.00 0|0:0.000:-0.01,-1.74,-5.00 0|0:0.000:-0.01,-1.73,-5.00 0|0:0.000:-0.11,-0.66,-3.44 0|0:0.000:-0.17,-0.50,-2.19 0|0:0.000:-0.01,-1.72,-5.00 0|0:0.000:-0.04,-1.08,-5.00 0|0:0.000:-0.00,-1.98,-5.00 0|0:0.000:-0.00,-1.96,-5.00 0|0:0.000:-0.02,-1.41,-5.00 0|0:0.000:-0.01,-1.50,-5.00 0|0:0.000:-0.00,-2.81,-5.00 0|0:0.000:-0.02,-1.45,-5.00 0|0:0.000:-0.02,-1.41,-5.00 0|0:0.000:-0.02,-1.45,-5.00 0|0:0.000:-0.00,-2.35,-5.00 0|0:0.000:-0.01,-1.75,-5.00 0|0:0.000:-0.05,-0.93,-5.00 0|0:0.000:-0.48,-0.48,-0.48 0|0:0.000:-0.18,-0.48,-1.90 0|0:0.000:-0.02,-1.43,-5.00 0|0:0.000:-0.03,-1.15,-5.00 0|0:0.000:-0.02,-1.41,-5.00 0|0:0.000:-0.03,-1.23,-5.00 0|0:0.000:-0.06,-0.88,-4.70 0|0:0.000:-0.11,-0.64,-4.00 0|0:0.000:-0.03,-1.21,-5.00 0|0:0.000:-0.05,-0.93,-5.00 0|0:0.000:-0.00,-4.10,-5.00 0|0:0.000:-0.07,-0.82,-3.59 0|0:0.000:-0.01,-1.60,-5.00 0|0:0.000:-0.02,-1.44,-5.00 0|0:0.000:-0.18,-0.48,-2.08 0|0:0.000:-0.48,-0.48,-0.48 0|0:0.000:-0.06,-0.86,-3.92 0|0:0.000:-0.01,-1.69,-5.00 
0|0:0.000:-0.100376,-0.685669,-3.85387 0|0:0.000:-0.00,-3.13,-5.00 0|0:0.000:-0.05,-0.97,-5.00 0|0:0.000:-0.03,-1.13,-5.00 0|0:0.000:-0.01,-1.91,-5.00 0|0:0.000:-0.10,-0.68,-4.40 0|0:0.000:-0.03,-1.21,-5.00 0|0:0.000:-0.18,-0.46,-2.96 0|0:0.000:-0.06,-0.90,-5.00 0|0:0.000:-0.0337832,-1.12587,-5 0|0:0.000:-0.19,-0.46,-2.05 0|0:0.000:-0.06,-0.86,-5.00 0|0:0.000:-0.10,-0.68,-4.40 0|0:0.000:-0.01,-1.73,-5.00 0|0:0.000:-0.00,-2.00,-5.00 0|0:0.000:0.00,-5.00,-5.00 0|0:0.000:-0.18,-0.47,-2.22 0|0:0.000:-0.18,-0.47,-2.19 0|0:0.000:-0.01,-1.47,-5.00 0|0:0.000:-0.02,-1.45,-5.00 0|0:0.000:-0.103397,-0.674033,-4.39794 0|0:0.000:-0.10,-0.67,-4.22 0|0:0.000:-0.17,-0.49,-2.09 0|0:0.000:-0.48,-0.48,-0.48 0|0:0.000:-0.00,-2.51,-5.00 0|0:0.000:-0.06,-0.90,-5.00 0|0:0.000:-0.48,-0.48,-0.48 0|0:0.000:-0.06,-0.86,-5.00 0|0:0.000:-0.05,-0.93,-4.70 0|0:0.000:-0.01,-1.70,-5.00 0|0:0.000:-0.01,-1.93,-5.00 0|0:0.000:-0.01,-1.89,-5.00 0|0:0.000:-0.10,-0.67,-4.22 0|0:0.000:-0.48,-0.48,-0.48 0|0:0.000:-0.10,-0.67,-4.10 0|0:0.000:-0.03,-1.19,-5.00 0|0:0.000:-0.00,-2.40,-5.00 0|0:0.000:-0.00,-3.40,-5.00 0|0:0.000:-0.01,-1.73,-5.00 0|0:0.000:-0.11,-0.66,-4.40 0|0:0.000:-0.0614303,-0.879755,-5 0|0:0.000:-0.06,-0.89,-5.00 0|0:0.000:-0.19,-0.46,-2.48 0|0:0.000:-0.18,-0.47,-2.49 0|0:0.000:-0.01,-1.67,-5.00 0|0:0.000:-0.03,-1.15,-5.00 0|0:0.000:-0.02,-1.30,-5.00 0|0:0.000:-0.19,-0.46,-2.18 0|0:0.000:-0.48,-0.48,-0.48 0|0:0.000:-0.18,-0.47,-2.26 0|0:0.000:-0.19,-0.46,-2.44 0|0:0.000:-0.03,-1.13,-5.00 0|0:0.000:-0.11,-0.66,-4.22 0|0:0.000:-0.10,-0.68,-3.66 0|0:0.000:-0.03,-1.15,-5.00 0|0:0.000:-0.09,-0.73,-2.76 0|0:0.000:-0.02,-1.37,-5.00 0|0:0.000:-0.03,-1.18,-5.00 0|0:0.000:-0.00,-2.56,-5.00 0|0:0.000:-0.10,-0.68,-4.40 0|0:0.000:-0.19,-0.47,-2.13 0|0:0.000:-0.04,-1.08,-5.00 0|0:0.000:-0.18,-0.48,-1.92 0|0:0.000:-0.05,-0.94,-5.00 0|0:0.000:-0.19,-0.47,-2.20 0|0:0.000:-0.06,-0.92,-5.00 0|0:0.000:-0.10,-0.68,-3.80 0|0:0.000:-0.00,-2.00,-5.00 0|0:0.000:-0.01,-1.79,-5.00 0|0:0.000:-0.06,-0.91,-5.00 
0|0:0.000:-0.01,-1.83,-5.00 0|0:0.000:-0.00,-2.60,-5.00 0|0:0.000:-0.18,-0.47,-2.34 0|0:0.000:-0.03,-1.14,-5.00 0|0:0.000:-0.19,-0.46,-2.80 0|0:0.000:-0.10,-0.69,-4.70 0|0:0.000:-0.48,-0.48,-0.48 0|0:0.000:-0.02,-1.46,-5.00 0|0:0.000:-0.00,-1.99,-5.00 0|0:0.000:-0.000721523,-2.77989,-5 0|0:0.000:-0.0544526,-0.928707,-5 0|0:0.000:-0.03,-1.21,-5.00 0|0:0.000:-0.03,-1.21,-5.00 0|0:0.000:-0.10,-0.67,-4.70 0|0:0.050:-0.03,-1.17,-5.00 0|0:0.000:-0.03,-1.20,-5.00 0|0:0.000:-0.01,-1.83,-5.00 0|0:0.000:-0.02,-1.45,-5.00 0|0:0.000:-0.05,-0.93,-5.00 0|0:0.000:-0.10,-0.68,-4.10 0|0:0.000:-0.03,-1.18,-5.00 0|0:0.000:-0.00,-4.00,-5.00 0|0:0.000:-0.03,-1.21,-5.00 0|0:0.000:-0.18,-0.48,-2.27 0|0:0.000:-0.18,-0.48,-1.93 0|0:0.000:-0.01,-1.76,-5.00 0|0:0.000:-0.00,-2.30,-5.00 0|0:0.000:-0.03,-1.19,-5.00 0|0:0.000:-0.10,-0.69,-3.66 0|0:0.000:-0.11,-0.64,-3.09 0|0:0.000:-0.03,-1.17,-5.00 0|0:0.000:-0.00,-2.56,-5.00 0|0:0.000:-0.02,-1.44,-5.00 0|0:0.000:-0.00,-3.70,-5.00 0|0:0.000:-0.48,-0.48,-0.48 0|0:0.000:-0.03,-1.20,-5.00 0|0:0.000:-0.18,-0.48,-1.90 0|0:0.000:-0.10,-0.67,-4.00 0|0:0.000:-0.10,-0.67,-4.70 0|0:0.000:-0.05,-0.92,-5.00 0|0:0.000:-0.00,-2.28,-5.00 0|0:0.000:-0.11,-0.65,-3.32 0|0:0.000:-0.48,-0.48,-0.48 0|0:0.000:-0.03,-1.20,-5.00 0|0:0.000:-0.01,-1.70,-5.00 0|0:0.000:-0.19,-0.46,-2.12 0|0:0.000:-0.18,-0.47,-2.29 0|0:0.000:-0.18,-0.47,-2.27 0|0:0.000:-0.01,-1.47,-5.00 0|0:0.000:-0.03,-1.20,-5.00 0|0:0.000:-0.03,-1.18,-5.00 0|0:0.000:-0.10,-0.69,-3.70 0|0:0.000:-0.10,-0.68,-4.22 0|0:0.000:-0.05,-0.94,-5.00 0|0:0.000:-0.03,-1.22,-5.00 0|0:0.000:-0.10,-0.68,-4.22 0|0:0.000:-0.19,-0.46,-2.65 0|0:0.000:-0.05,-0.93,-5.00 0|0:0.000:-0.03,-1.22,-5.00 0|0:0.000:-0.00263983,-2.21753,-5 0|0:0.000:-0.01,-1.65,-5.00 0|0:0.000:-0.00,-3.92,-5.00 0|0:0.000:-0.00,-2.44,-5.00 0|0:0.000:-0.02,-1.40,-5.00 0|0:0.000:-0.23,-0.46,-1.24 0|0:0.000:-0.05,-0.93,-5.00 0|0:0.000:-0.18,-0.48,-1.90 0|0:0.000:-0.10,-0.67,-3.70 0|0:0.000:-0.18,-0.47,-2.07 0|0:0.000:-0.02,-1.47,-5.00 
0|0:0.000:-0.10,-0.68,-4.10 0|0:0.000:-0.03,-1.12,-5.00 0|0:0.000:-0.02,-1.41,-5.00 0|0:0.000:-0.10,-0.69,-4.10 0|0:0.000:-0.01,-1.93,-5.00 0|0:0.000:-0.00,-2.28,-5.00 0|0:0.000:-0.48,-0.48,-0.48 0|0:0.000:-0.01,-1.57,-5.00 0|0:0.000:-0.10,-0.68,-4.40 0|0:0.000:-0.18,-0.47,-2.08 0|0:0.000:-0.48,-0.48,-0.48 0|0:0.000:-0.02,-1.41,-5.00 0|0:0.000:-0.02,-1.38,-5.00 0|0:0.000:-0.00,-1.98,-5.00 0|0:0.000:-0.00,-2.10,-5.00 0|0:0.000:-0.02,-1.37,-5.00 0|0:0.000:-0.01,-1.59,-5.00 0|0:0.000:-0.00,-2.13,-5.00 0|0:0.000:-0.00333949,-2.11577,-5 0|0:0.000:-0.00,-2.91,-5.00 0|0:0.000:-0.01,-1.93,-5.00 0|0:0.000:-0.09,-0.74,-4.40 0|0:0.000:-0.00,-3.42,-5.00 0|0:0.000:-0.00,-2.91,-5.00 0|0:0.000:-0.00,-4.70,-5.00 0|0:0.000:-0.10,-0.68,-5.00 0|0:0.000:-0.05,-0.95,-5.00 0|0:0.000:-0.03,-1.17,-5.00 0|0:0.000:-0.06,-0.92,-5.00 0|0:0.000:-0.02,-1.37,-5.00 0|0:0.000:-0.00,-2.89,-5.00 0|0:0.000:-0.03,-1.18,-5.00 0|0:0.000:-0.01,-1.69,-5.00 0|0:0.000:-0.29,-0.39,-1.08 0|0:0.000:-0.20,-0.45,-2.17 0|0:0.000:-0.10,-0.67,-4.22 0|0:0.000:-0.10,-0.67,-4.22 0|0:0.000:-0.03,-1.17,-5.00 0|0:0.000:-0.00,-2.23,-5.00 0|0:0.000:-0.19,-0.49,-1.47 0|0:0.000:-0.10,-0.67,-3.92 0|0:0.000:-0.18,-0.47,-2.84 0|0:0.000:-0.05,-0.93,-5.00 0|0:0.000:-0.00,-3.18,-5.00 0|0:0.000:-0.00,-2.05,-5.00 0|0:0.000:-0.00,-2.36,-5.00 0|0:0.000:-0.01,-1.57,-5.00 0|0:0.000:-0.05,-0.93,-5.00 0|0:0.000:-0.11,-0.66,-3.70 0|0:0.000:-0.48,-0.48,-0.48 0|0:0.000:-0.00,-2.25,-5.00 0|0:0.000:-0.00,-2.22,-5.00 0|0:0.000:-0.10,-0.69,-4.10 0|0:0.000:-0.19,-0.46,-2.09 0|0:0.000:-0.01,-1.83,-5.00 0|0:0.000:-0.00,-2.12,-5.00 0|0:0.000:-0.03,-1.17,-5.00 0|0:0.000:-0.03,-1.13,-5.00 0|0:0.000:-0.06,-0.90,-5.00 0|0:0.000:-0.02,-1.46,-5.00 0|0:0.000:-0.01,-1.80,-5.00 0|0:0.000:-0.01,-1.61,-5.00 0|0:0.000:-0.00,-2.61,-5.00 0|0:0.000:-0.01,-1.58,-5.00 0|0:0.000:-0.03,-1.19,-5.00 0|0:0.000:-0.11,-0.65,-3.38 0|0:0.000:-0.00,-2.04,-5.00 0|0:0.000:-0.00,-2.91,-5.00 0|0:0.000:-0.06,-0.91,-5.00 0|0:0.000:-0.02,-1.43,-5.00 0|0:0.000:-0.07,-0.84,-5.00 
0|0:0.000:-0.00,-2.41,-5.00 0|0:0.000:-0.07,-0.85,-5.00 0|0:0.000:-0.11,-0.64,-3.25 0|0:0.000:-0.03,-1.17,-5.00 0|0:0.000:-0.02,-1.32,-5.00 0|0:0.000:-0.48,-0.48,-0.48 0|0:0.000:-0.20,-0.46,-1.75 0|0:0.000:-0.182448,-0.46824,-2.55284 0|0:0.000:-0.18,-0.47,-2.33 0|0:0.000:-0.03,-1.12,-5.00 0|0:0.000:-0.03,-1.15,-5.00 0|0:0.000:-0.00,-2.47,-5.00 0|0:0.000:-0.00,-3.66,-5.00 0|0:0.000:-0.477139,-0.477113,-0.477113 0|0:0.000:-0.06,-0.92,-5.00 0|0:0.000:-0.00,-2.87,-5.00 0|0:0.000:-0.00,-2.27,-5.00 0|0:0.000:-0.01,-1.66,-5.00 0|0:0.000:-0.19,-0.46,-2.44 0|0:0.000:-0.10,-0.67,-4.22 0|0:0.000:-0.01,-1.83,-5.00 0|0:0.000:-0.00,-3.19,-5.00 0|0:0.000:-0.04,-1.09,-5.00 0|0:0.000:-0.00,-2.64,-5.00 0|0:0.000:-0.10,-0.67,-4.22 0|0:0.000:-0.00,-4.40,-5.00 0|0:0.000:-0.18,-0.47,-2.48 0|0:0.000:-0.10,-0.68,-3.92 0|0:0.000:-0.06,-0.88,-5.00 0|0:0.000:-0.00,-2.08,-5.00 0|0:0.000:-0.00,-1.98,-5.00 0|0:0.000:-0.07,-0.80,-4.70 0|0:0.000:-0.01,-1.68,-5.00 0|0:0.000:-0.00,-2.00,-5.00 0|0:0.000:-0.00,-2.61,-5.00 0|0:0.000:-0.00,-3.17,-5.00 0|0:0.000:-0.03,-1.17,-5.00 0|0:0.000:-0.02,-1.29,-5.00 0|0:0.000:-0.00,-3.66,-5.00 0|0:0.000:-0.01,-1.63,-5.00 0|0:0.000:-0.02,-1.31,-5.00 0|0:0.000:-0.00,-2.62,-5.00 0|0:0.000:-0.00,-3.62,-5.00 0|0:0.050:-0.06,-0.89,-5.00 0|0:0.000:-0.01,-1.93,-5.00 0|0:0.000:-0.02,-1.38,-5.00 0|0:0.000:-0.10,-0.68,-4.22 0|0:0.000:-0.03,-1.17,-5.00 0|0:0.000:-0.48,-0.48,-0.48 0|0:0.000:-0.11,-0.66,-4.22 0|0:0.050:-0.03,-1.15,-5.00 0|0:0.000:-0.06,-0.89,-5.00 0|0:0.000:-0.00,-2.73,-5.00 0|0:0.000:-0.00,-4.70,-5.00 0|0:0.000:-0.00,-2.02,-5.00 0|0:0.000:-0.00,-2.73,-5.00 0|0:0.000:-0.00,-1.98,-5.00 0|0:0.000:-0.01,-1.70,-5.00 0|0:0.000:-0.00,-2.56,-5.00 0|0:0.000:-0.00,-3.17,-5.00 0|0:0.000:-0.10,-0.69,-4.70 0|0:0.000:-0.01,-1.73,-5.00 0|0:0.000:-0.03,-1.19,-5.00 0|0:0.000:-0.03,-1.19,-5.00 0|0:0.000:-0.00,-1.99,-5.00 0|0:0.000:-0.06,-0.92,-5.00 0|0:0.000:-0.10,-0.69,-4.40 0|0:0.000:-0.03,-1.17,-5.00 0|0:0.000:-0.18,-0.47,-2.15 0|0:0.000:-0.00,-1.99,-5.00 
0|0:0.000:-0.06,-0.90,-5.00 0|0:0.000:-0.10,-0.69,-4.40 0|0:0.000:-0.06,-0.89,-5.00 0|0:0.000:-0.00,-2.39,-5.00 0|0:0.000:-0.00,-2.91,-5.00 0|0:0.000:-0.01,-1.93,-5.00 0|0:0.000:-0.48,-0.48,-0.48 0|0:0.000:-0.11,-0.67,-3.70 0|0:0.000:-0.18,-0.47,-2.25 0|0:0.000:-0.00,-2.21,-5.00 0|0:0.000:-0.11,-0.64,-3.92 0|0:0.000:-0.10,-0.68,-4.10 0|0:0.000:-0.10,-0.67,-4.00 0|0:0.000:-0.11,-0.66,-3.66 0|0:0.000:-0.19,-0.47,-2.01 0|0:0.000:-0.07,-0.84,-3.92 0|0:0.000:-0.00,-2.09,-5.00 0|0:0.000:-0.02,-1.40,-5.00 0|0:0.000:-0.03,-1.14,-5.00 0|0:0.000:-0.01,-1.61,-5.00 0|0:0.000:-0.11,-0.66,-4.40 0|0:0.000:-0.11,-0.65,-4.00 0|0:0.000:-0.00,-3.66,-5.00 0|0:0.000:-0.19,-0.46,-2.06 0|0:0.000:-0.00,-3.40,-5.00 0|0:0.000:-0.00,-2.62,-5.00 0|0:0.000:-0.00,-2.15,-5.00 0|0:0.000:-0.00,-2.36,-5.00 0|0:0.000:-0.03,-1.12,-5.00 0|0:0.000:-0.03,-1.19,-5.00 0|0:0.000:-0.00,-2.32,-5.00 0|0:0.000:-0.01,-1.81,-5.00 0|0:0.000:-0.03,-1.17,-5.00 0|0:0.000:-0.02,-1.41,-5.00 0|0:0.000:-0.00,-2.30,-5.00 0|0:0.000:-0.01,-1.49,-5.00 0|0:0.000:-0.12,-0.61,-2.62 0|0:0.000:-0.01,-1.85,-5.00 0|0:0.000:-0.00,-3.52,-5.00 0|0:0.000:-0.07,-0.85,-4.70 0|0:0.000:-0.00,-3.02,-5.00 0|0:0.000:-0.01,-1.91,-5.00 0|0:0.000:-0.01,-1.74,-5.00 0|0:0.000:-0.03,-1.16,-5.00 0|0:0.000:-0.06,-0.92,-5.00 0|0:0.000:-0.01,-1.72,-5.00 0|0:0.000:-0.02,-1.46,-5.00 0|0:0.000:-0.03,-1.15,-5.00 0|0:0.000:-0.48,-0.48,-0.48 0|0:0.000:-0.02,-1.41,-5.00 0|0:0.000:-1.73698e-05,-4.39794,-5 0|0:0.000:-0.000530157,-2.91364,-5 0|0:0.000:-0.00,-2.65,-5.00 0|0:0.000:-0.01,-1.81,-5.00 0|0:0.000:-0.00,-2.03,-5.00 0|0:0.000:-0.48,-0.48,-0.48 0|0:0.000:-0.18,-0.47,-2.29 0|0:0.000:-0.18,-0.47,-2.32 0|0:0.000:-0.01,-1.72,-5.00 0|0:0.000:-0.01,-1.74,-5.00 0|0:0.000:-0.00,-2.09,-5.00 0|0:0.000:-0.00,-2.32,-5.00 0|0:0.000:-0.00,-2.65,-5.00 0|0:0.000:-0.10,-0.69,-4.40 0|0:0.000:-0.03,-1.20,-5.00 0|0:0.000:-0.01,-1.49,-5.00 0|0:0.000:-0.00062583,-2.84164,-5 0|0:0.000:-0.03,-1.16,-5.00 0|0:0.000:-0.00,-2.29,-5.00 0|0:0.000:-0.03,-1.18,-5.00 
0|0:0.000:-0.00,-2.27,-5.00 0|0:0.000:-0.01,-1.70,-5.00 0|0:0.000:-0.03,-1.13,-5.00 0|0:0.000:-0.00,-2.42,-5.00 0|0:0.000:-0.00,-3.13,-5.00 0|0:0.000:-0.00,-2.64,-5.00 0|0:0.000:-0.00,-2.66,-5.00 0|0:0.000:-0.000295433,-3.16749,-5 0|0:0.000:-0.01,-1.78,-5.00 0|0:0.000:-0.00,-2.31,-5.00 0|0:0.000:-0.00,-2.11,-5.00 0|0:0.000:-0.00,-2.88,-5.00 0|0:0.000:-0.05,-0.93,-5.00 0|0:0.000:-0.00,-2.08,-5.00 0|0:0.000:-0.02,-1.29,-5.00 0|0:0.000:-0.01,-1.70,-5.00 0|0:0.000:-0.00,-2.68,-5.00 0|0:0.000:-0.00,-2.20,-5.00 0|0:0.000:-0.00351459,-2.09366,-5 0|0:0.000:-0.01,-1.70,-5.00 0|0:0.000:-0.00,-3.34,-5.00 0|0:0.000:-0.00,-2.59,-5.00 0|0:0.000:-0.00,-2.52,-5.00 0|0:0.000:-0.03,-1.16,-5.00 0|0:0.000:-0.00,-2.39,-5.00 0|0:0.000:-0.00,-2.07,-5.00 0|0:0.000:-0.00,-3.85,-5.00 0|0:0.000:-0.01,-1.80,-5.00 0|0:0.000:-0.00,-2.04,-5.00 0|0:0.000:-0.00,-2.52,-5.00 0|0:0.000:-0.00,-2.34,-5.00 0|0:0.000:-0.06,-0.88,-5.00 0|0:0.000:-0.00,-2.12,-5.00 0|0:0.000:-0.00196746,-2.34486,-5 0|0:0.000:-0.00,-2.88,-5.00 0|0:0.000:-0.10,-0.68,-4.70 0|0:0.000:-0.01,-1.57,-5.00 0|0:0.000:-0.17,-0.49,-2.28 0|0:0.000:-0.03,-1.21,-5.00 0|0:0.000:-0.02,-1.44,-5.00 0|0:0.000:-0.0040403,-2.03339,-5 0|0:0.000:-0.05,-0.99,-5.00 0|0:0.000:-0.11,-0.66,-3.15 0|0:0.000:-0.03,-1.22,-5.00 0|0:0.000:-0.00250877,-2.23958,-5 0|0:0.000:-0.0609303,-0.88306,-5 0|0:0.000:-0.11,-0.66,-3.21 0|0:0.000:-0.00,-2.18,-5.00 0|0:0.000:-0.00,-2.11,-5.00 0|0:0.000:-0.000260646,-3.22185,-5 0|0:0.000:-0.05,-0.94,-5.00 0|0:0.000:-0.18151,-0.475007,-2.17783 0|0:0.000:-0.18,-0.48,-2.26 0|0:0.000:-0.03,-1.17,-5.00 0|0:0.000:-0.48,-0.48,-0.48 0|0:0.000:-0.02,-1.40,-5.00 0|0:0.000:-0.03,-1.18,-5.00 0|0:0.000:-0.01,-1.80,-5.00 0|0:0.000:-0.03,-1.17,-5.00 0|0:0.000:-0.00,-2.65,-5.00 0|0:0.000:-0.02,-1.45,-5.00 0|0:0.000:-0.18,-0.47,-2.47 0|0:0.000:-0.07,-0.84,-4.40 0|0:0.000:-0.00,-3.02,-5.00 0|0:0.000:-0.18,-0.47,-2.30 0|0:0.000:-0.06,-0.89,-5.00 0|0:0.000:-0.00,-1.98,-5.00 0|0:0.000:-0.19,-0.46,-2.04 0|0:0.000:-0.03,-1.17,-5.00 
0|0:0.000:-0.00,-2.01,-5.00 0|0:0.000:-0.10,-0.69,-4.40 0|0:0.000:-0.00,-1.98,-5.00 0|0:0.000:-0.02,-1.47,-5.00 0|0:0.000:-0.00,-2.25,-5.00 0|0:0.000:-0.03,-1.17,-5.00 0|0:0.000:-0.02,-1.41,-5.00 0|0:0.000:-0.00,-2.60,-5.00 0|0:0.000:-0.00,-4.22,-5.00 0|0:0.000:-0.00,-2.01,-5.00 0|0:0.000:-0.02,-1.44,-5.00 0|0:0.000:-0.00,-2.34,-5.00 0|0:0.000:-0.03,-1.18,-5.00 0|0:0.000:-0.00,-2.57,-5.00 0|0:0.000:-0.01,-1.74,-5.00 0|0:0.000:-0.00,-2.81,-5.00 0|0:0.000:-0.00,-2.27,-5.00 0|0:0.000:-0.00,-2.30,-5.00 0|0:0.000:-0.00,-2.27,-5.00 0|0:0.000:-0.00,-3.70,-5.00 0|0:0.000:-0.03,-1.18,-5.00 0|0:0.000:-0.01,-1.71,-5.00 0|0:0.000:-0.02,-1.43,-5.00 0|0:0.000:-0.02,-1.41,-5.00 0|0:0.000:-0.00,-2.57,-5.00 0|0:0.000:-0.01,-1.75,-5.00 0|0:0.000:-0.00,-2.89,-5.00 0|0:0.000:-0.03,-1.13,-5.00 0|0:0.000:-0.01,-1.73,-5.00 0|0:0.000:-0.00,-2.56,-5.00 0|0:0.000:-0.01,-1.69,-5.00 0|0:0.000:-0.01,-1.69,-5.00 0|0:0.000:-0.00,-2.01,-5.00 0|0:0.000:-0.00,-3.15,-5.00 0|0:0.000:-0.00,-2.80,-5.00 0|0:0.000:-0.00,-3.92,-5.00 0|0:0.000:-0.10,-0.70,-4.70 0|0:0.000:-0.03,-1.20,-5.00 0|0:0.000:-0.00,-4.10,-5.00 0|0:0.000:-0.00,-2.08,-5.00 0|0:0.000:-0.00,-2.33,-5.00 0|0:0.000:-0.00,-2.84,-5.00 0|0:0.000:-0.00,-2.08,-5.00 0|0:0.000:-0.02,-1.33,-5.00 0|0:0.000:-0.00,-3.70,-5.00 0|0:0.000:-0.00,-4.22,-5.00 0|0:0.000:-0.00,-2.03,-5.00 0|0:0.000:-0.03,-1.21,-5.00 0|0:0.000:-0.01,-1.49,-5.00 0|0:0.000:-0.10,-0.69,-4.40 0|0:0.000:-0.00,-3.36,-5.00 0|0:0.000:-0.0100699,-1.63979,-5 0|0:0.000:-0.00,-2.05,-5.00 0|0:0.000:-0.03,-1.19,-5.00 0|0:0.000:-0.18,-0.48,-2.46 0|0:0.000:-0.01,-1.48,-5.00 0|0:0.000:-0.00,-2.66,-5.00 0|0:0.000:-0.03,-1.21,-5.00 0|0:0.000:-0.00,-2.59,-5.00 0|0:0.000:-0.00,-2.05,-5.00 0|0:0.000:-0.10,-0.70,-4.40 0|0:0.000:-0.01,-1.76,-5.00 0|0:0.000:-0.10,-0.69,-4.40 0|0:0.000:-0.00,-2.61,-5.00 0|0:0.000:-0.01,-1.65,-5.00 0|0:0.000:-0.00,-3.14,-5.00 0|0:0.000:0.00,-5.00,-5.00 0|0:0.000:-0.00,-2.59,-5.00 0|0:0.000:-0.00,-2.28,-5.00 0|0:0.000:-0.10,-0.69,-4.70 0|0:0.000:-0.10,-0.69,-4.70 
0|0:0.000:-0.00,-3.13,-5.00 0|0:0.000:-0.00,-4.70,-5.00 0|0:0.000:-0.00,-2.07,-5.00 0|0:0.000:-0.01,-1.84,-5.00 0|0:0.000:-0.02,-1.41,-5.00 0|0:0.000:-0.06,-0.89,-5.00 0|0:0.000:-0.02,-1.40,-5.00 0|0:0.000:-0.02,-1.42,-5.00 0|0:0.000:-0.0966495,-0.700014,-5 0|0:0.000:-0.00,-3.85,-5.00 0|0:0.000:-0.00,-2.21,-5.00 0|0:0.000:-0.16,-0.51,-2.82 0|0:0.000:-0.01,-1.53,-5.00 0|0:0.000:-0.03,-1.16,-5.00 0|0:0.000:-0.00,-2.85,-5.00 0|0:0.000:-0.00,-2.81,-5.00 0|0:0.000:-0.00,-3.85,-5.00 0|0:0.000:0.00,-5.00,-5.00 0|0:0.000:-0.000165054,-3.42022,-5 0|0:0.000:-0.01,-1.56,-5.00 0|0:0.000:0.00,-5.00,-5.00 0|0:0.000:-0.01,-1.55,-5.00 0|0:0.000:-0.05,-0.94,-5.00 0|0:0.000:-0.03,-1.21,-5.00 0|0:0.000:-0.48,-0.48,-0.48 0|0:0.000:-0.01,-1.48,-5.00 0|0:0.000:-0.01,-1.77,-5.00 0|0:0.000:-0.05,-0.94,-5.00 0|0:0.000:-0.18,-0.48,-2.33 0|0:0.000:-0.00,-2.33,-5.00 0|0:0.000:-0.01,-1.49,-5.00 0|0:0.000:-0.02,-1.46,-5.00 0|0:0.000:-0.10,-0.69,-4.40 0|0:0.000:-0.03,-1.21,-5.00 0|0:0.000:-0.00,-2.09,-5.00 0|0:0.000:-0.00,-2.94,-5.00 0|0:0.000:-0.00,-2.07,-5.00 0|0:0.000:-0.01,-1.84,-5.00 0|0:0.000:-0.03,-1.18,-5.00 0|0:0.000:-0.10,-0.69,-4.40 0|0:0.000:-0.01,-1.78,-5.00 0|0:0.000:-0.01,-1.78,-5.00 0|0:0.000:-0.00,-3.38,-5.00 0|0:0.000:-0.01,-1.76,-5.00 0|0:0.000:-0.01,-1.48,-5.00 0|0:0.000:-0.00,-2.92,-5.00 0|0:0.000:-0.01,-1.48,-5.00 0|0:0.000:-0.00,-2.05,-5.00 0|0:0.000:-0.01,-1.76,-5.00 0|0:0.000:-0.00,-3.62,-5.00 0|0:0.000:-0.10,-0.69,-4.40 0|0:0.000:-0.00314699,-2.14146,-5 0|0:0.000:-0.04,-1.09,-5.00 0|0:0.000:-0.01,-1.56,-5.00 0|0:0.000:-0.00,-3.32,-5.00 0|0:0.000:-0.00,-2.75,-5.00 0|0:0.000:-0.02,-1.38,-5.00 0|0:0.000:-0.06,-0.91,-5.00 0|0:0.000:-0.10,-0.68,-4.22 0|0:0.000:-0.10,-0.69,-4.22 0|0:0.050:-0.04,-1.10,-5.00 0|0:0.000:-0.03,-1.23,-5.00 0|0:0.000:-0.05,-0.94,-5.00 0|0:0.000:-0.03,-1.17,-5.00 0|0:0.000:-0.09,-0.71,-4.70 0|0:0.000:-0.03,-1.18,-5.00 0|0:0.000:-0.13,-0.59,-2.11 0|0:0.000:-0.10,-0.69,-4.10 0|0:0.000:-0.18,-0.48,-2.32 0|0:0.000:-0.03,-1.21,-5.00 
0|0:0.000:-0.17,-0.49,-2.31 0|0:0.000:-0.18,-0.47,-2.30 0|0:0.000:-0.02,-1.33,-5.00 0|0:0.000:-0.20,-0.44,-2.16 0|0:0.000:-0.18,-0.47,-2.28 0|0:0.000:-0.03,-1.16,-5.00 0|0:0.000:-0.22,-0.46,-1.28 0|0:0.000:-0.00,-3.17,-5.00 0|0:0.000:-0.10,-0.69,-4.22 0|0:0.000:-0.06,-0.91,-5.00 0|0:0.000:-0.02,-1.43,-5.00 0|0:0.000:-0.10,-0.69,-4.22 0|0:0.000:-0.03,-1.21,-5.00 0|0:0.000:-0.18,-0.47,-2.38 0|0:0.000:-0.00,-1.96,-5.00 0|0:0.000:-0.01,-1.47,-5.00 0|0:0.000:-0.01,-1.57,-5.00 0|0:0.000:-0.00,-2.22,-5.00 0|0:0.000:-0.00,-2.00,-5.00 0|0:0.000:-0.18,-0.48,-2.25 0|0:0.000:-0.03,-1.14,-5.00 0|0:0.000:-0.10,-0.69,-4.40 0|0:0.000:-0.10,-0.68,-4.00 0|0:0.000:-0.18,-0.47,-2.14 0|0:0.000:-0.11,-0.66,-4.40 0|0:0.000:-0.19,-0.46,-2.74 0|0:0.000:-0.06,-0.90,-5.00 0|0:0.000:-0.10,-0.67,-4.70 0|0:0.000:-0.02,-1.45,-5.00 0|0:0.000:-0.10,-0.67,-5.00 0|0:0.000:-0.19,-0.46,-2.74 0|0:0.000:-0.05,-0.94,-5.00 0|0:0.000:-0.18,-0.47,-2.22 0|0:0.000:-0.18,-0.48,-2.19 0|0:0.000:-0.06,-0.87,-5.00 0|0:0.000:-0.48,-0.48,-0.48 0|0:0.000:-0.18,-0.47,-2.41 0|0:0.000:-0.18,-0.47,-2.38 0|0:0.000:-0.06,-0.92,-5.00 0|0:0.000:-0.03,-1.17,-5.00 0|0:0.000:-0.06,-0.92,-5.00 0|0:0.000:-0.03,-1.18,-5.00 0|0:0.000:-0.18,-0.47,-2.26 0|0:0.000:-0.18,-0.47,-2.31 0|0:0.000:-0.06,-0.89,-5.00 0|0:0.000:-0.03,-1.16,-5.00 0|0:0.000:-0.03,-1.16,-5.00 0|0:0.000:-0.06,-0.90,-5.00 0|0:0.000:-0.03,-1.15,-5.00 0|0:0.000:-0.10,-0.68,-4.22 0|0:0.000:-0.19,-0.46,-2.42 0|0:0.000:-0.02,-1.44,-5.00 0|0:0.000:-0.12,-0.61,-3.66 0|0:0.000:-0.11,-0.66,-4.70 0|0:0.000:-0.02,-1.41,-5.00 0|0:0.000:-0.06,-0.90,-5.00 0|0:0.000:-0.06,-0.91,-5.00 0|0:0.000:-0.03,-1.17,-5.00 0|0:0.000:-0.11,-0.66,-4.40 0|0:0.000:-0.06,-0.90,-5.00 0|0:0.000:-0.00414553,-2.02228,-5 0|0:0.000:-0.02,-1.44,-5.00 0|0:0.000:-0.06,-0.91,-5.00 0|0:0.000:-0.06,-0.90,-5.00 0|0:0.000:-0.00,-3.27,-5.00 0|0:0.000:-0.03,-1.13,-5.00 0|0:0.000:-0.10,-0.67,-4.70 0|0:0.000:-0.03,-1.15,-5.00 0|0:0.000:-0.06,-0.87,-5.00 0|0:0.000:-0.06,-0.90,-5.00 0|0:0.000:-0.48,-0.48,-0.48 
0|0:0.000:-0.06,-0.90,-5.00 0|0:0.000:-0.11,-0.65,-4.70 0|0:0.000:-0.18,-0.47,-2.20 0|0:0.000:-0.18,-0.47,-2.27 0|0:0.000:-0.11,-0.66,-4.00 0|0:0.000:-0.06,-0.90,-5.00 0|0:0.000:-0.29,-0.40,-1.06 0|0:0.000:-0.00,-2.00,-5.00 0|0:0.000:-0.0337268,-1.12656,-5 0|0:0.000:-0.02,-1.47,-5.00 0|0:0.000:-0.10,-0.67,-3.92 0|0:0.000:-0.18,-0.47,-2.47 0|0:0.000:-0.18,-0.47,-2.86 diff --git a/public/testdata/exampleGRP.grp b/public/testdata/exampleGRP.grp new file mode 100644 index 000000000..67a39dc3a --- /dev/null +++ b/public/testdata/exampleGRP.grp @@ -0,0 +1,1518 @@ +#:GATKReport.v1.0:5 +#:GATKTable:true:1:14::; +#:GATKTable:Arguments:Recalibration argument collection values used in this run +Argument Value +covariate null +default_platform null +deletions_context_size 8 +force_platform null +insertions_context_size 8 +insertions_default_quality 45 +low_quality_tail 2 +mismatches_context_size 2 +mismatches_default_quality -1 +quantizing_levels 16 +run_without_dbsnp false +solid_nocall_strategy THROW_EXCEPTION +solid_recal_mode SET_Q_ZERO +standard_covs true + +#:GATKTable:true:2:94:::; +#:GATKTable:Quantized:Quality quantization map +QualityScore Count QuantizedScore +0 20 3 +1 0 3 +2 6 3 +3 1041 3 +4 8 3 +5 190 3 +6 102 3 +7 28 7 +8 795 8 +9 0 93 +10 0 93 +11 0 93 +12 0 93 +13 0 93 +14 0 93 +15 0 93 +16 0 93 +17 0 93 +18 0 93 +19 0 93 +20 0 93 +21 0 93 +22 0 93 +23 0 93 +24 0 93 +25 0 93 +26 0 93 +27 0 93 +28 0 93 +29 0 93 +30 0 93 +31 0 93 +32 0 93 +33 0 93 +34 0 93 +35 0 93 +36 0 93 +37 0 93 +38 0 93 +39 0 93 +40 0 93 +41 0 93 +42 0 93 +43 0 93 +44 0 93 +45 0 93 +46 0 93 +47 0 93 +48 0 93 +49 0 93 +50 0 93 +51 0 93 +52 0 93 +53 0 93 +54 0 93 +55 0 93 +56 0 93 +57 0 93 +58 0 93 +59 0 93 +60 0 93 +61 0 93 +62 0 93 +63 0 93 +64 0 93 +65 0 93 +66 0 93 +67 0 93 +68 0 93 +69 0 93 +70 0 93 +71 0 93 +72 0 93 +73 0 93 +74 0 93 +75 0 93 +76 0 93 +77 0 93 +78 0 93 +79 0 93 +80 0 93 +81 0 93 +82 0 82 +83 0 83 +84 0 84 +85 0 85 +86 0 86 +87 0 87 +88 0 88 +89 0 89 +90 0 90 +91 0 91 
+92 0 92 +93 0 93 + +#:GATKTable:false:6:3:%s:%s:%.4f:%.4f:%d:%d:; +#:GATKTable:RecalTable0: +ReadGroup EventType EmpiricalQuality EstimatedQReported Observations Errors +exampleBAM.bam.bam D 25.8092 45.0000 380 0 +exampleBAM.bam.bam M 14.0483 15.4820 380 14 +exampleBAM.bam.bam I 25.8092 45.0000 380 0 + +#:GATKTable:false:6:32:%s:%s:%s:%.4f:%d:%d:; +#:GATKTable:RecalTable1: +ReadGroup QualityScore EventType EmpiricalQuality Observations Errors +exampleBAM.bam.bam 32 M 15.1851 32 0 +exampleBAM.bam.bam 19 M 9.0309 15 1 +exampleBAM.bam.bam 33 M 15.5630 35 0 +exampleBAM.bam.bam 18 M 6.0206 7 1 +exampleBAM.bam.bam 34 M 15.6820 36 0 +exampleBAM.bam.bam 17 M 5.4407 6 1 +exampleBAM.bam.bam 16 M 7.4036 10 1 +exampleBAM.bam.bam 23 M 12.0412 15 0 +exampleBAM.bam.bam 6 M 4.7712 11 3 +exampleBAM.bam.bam 45 I 25.8092 380 0 +exampleBAM.bam.bam 22 M 10.0000 9 0 +exampleBAM.bam.bam 4 M 4.7712 5 1 +exampleBAM.bam.bam 21 M 12.5527 17 0 +exampleBAM.bam.bam 5 M 4.2597 7 2 +exampleBAM.bam.bam 20 M 4.7712 5 1 +exampleBAM.bam.bam 27 M 13.6173 22 0 +exampleBAM.bam.bam 10 M 3.0103 1 0 +exampleBAM.bam.bam 26 M 8.4510 6 0 +exampleBAM.bam.bam 11 M 1.7609 2 1 +exampleBAM.bam.bam 8 M 6.0206 7 1 +exampleBAM.bam.bam 25 M 12.0412 15 0 +exampleBAM.bam.bam 9 M 6.9897 4 0 +exampleBAM.bam.bam 24 M 10.2119 20 1 +exampleBAM.bam.bam 31 M 14.1497 25 0 +exampleBAM.bam.bam 14 M 3.0103 1 0 +exampleBAM.bam.bam 30 M 13.2222 20 0 +exampleBAM.bam.bam 15 M 7.7815 5 0 +exampleBAM.bam.bam 12 M 6.9897 4 0 +exampleBAM.bam.bam 29 M 13.2222 20 0 +exampleBAM.bam.bam 45 D 25.8092 380 0 +exampleBAM.bam.bam 13 M 6.0206 3 0 +exampleBAM.bam.bam 28 M 12.0412 15 0 + +#:GATKTable:false:8:1354:%s:%s:%s:%s:%s:%.4f:%d:%d:; +#:GATKTable:RecalTable2: +ReadGroup QualityScore CovariateValue CovariateName EventType EmpiricalQuality Observations Errors +exampleBAM.bam.bam 45 TGAAAGTG Context D 3.0103 1 0 +exampleBAM.bam.bam 45 TGGTATTA Context D 3.0103 1 0 +exampleBAM.bam.bam 45 AGCCTCGT Context D 3.0103 1 0 +exampleBAM.bam.bam 45 
CTGTGTCT Context D 3.0103 1 0 +exampleBAM.bam.bam 45 CTTTGTAT Context I 3.0103 1 0 +exampleBAM.bam.bam 45 CTTAAGTG Context D 3.0103 1 0 +exampleBAM.bam.bam 45 CTTTATTA Context D 3.0103 1 0 +exampleBAM.bam.bam 45 23 Cycle I 7.7815 5 0 +exampleBAM.bam.bam 45 27 Cycle D 7.7815 5 0 +exampleBAM.bam.bam 45 ATTCTATT Context I 3.0103 1 0 +exampleBAM.bam.bam 45 CTAATCTC Context I 3.0103 1 0 +exampleBAM.bam.bam 34 GC Context M 4.7712 2 0 +exampleBAM.bam.bam 8 TG Context M 6.0206 3 0 +exampleBAM.bam.bam 45 TAGAGTTT Context I 3.0103 1 0 +exampleBAM.bam.bam 9 TA Context M 3.0103 1 0 +exampleBAM.bam.bam 45 GGTTCGGG Context I 6.0206 3 0 +exampleBAM.bam.bam 45 AGTTTCAC Context I 3.0103 1 0 +exampleBAM.bam.bam 45 CATTTCAC Context I 3.0103 1 0 +exampleBAM.bam.bam 16 7 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 5 76 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 CATGATAA Context D 3.0103 1 0 +exampleBAM.bam.bam 45 53 Cycle I 7.7815 5 0 +exampleBAM.bam.bam 45 57 Cycle D 7.7815 5 0 +exampleBAM.bam.bam 25 52 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 TGGCAGCC Context D 3.0103 1 0 +exampleBAM.bam.bam 33 CT Context M 8.4510 6 0 +exampleBAM.bam.bam 45 AAGTGACA Context I 3.0103 1 0 +exampleBAM.bam.bam 45 AGTGACAT Context I 3.0103 1 0 +exampleBAM.bam.bam 45 AGAGTTTC Context I 3.0103 1 0 +exampleBAM.bam.bam 45 CTCTTTGT Context D 3.0103 1 0 +exampleBAM.bam.bam 45 GCCTGAAA Context D 3.0103 1 0 +exampleBAM.bam.bam 12 25 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 34 75 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 32 41 Cycle M 4.7712 2 0 +exampleBAM.bam.bam 21 GG Context M 4.7712 2 0 +exampleBAM.bam.bam 26 50 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 ACCTGGAG Context D 3.0103 1 0 +exampleBAM.bam.bam 45 CACAGCAA Context D 3.0103 1 0 +exampleBAM.bam.bam 20 GA Context M 3.0103 1 0 +exampleBAM.bam.bam 45 AGGTGGAG Context D 3.0103 1 0 +exampleBAM.bam.bam 45 GCAAAATC Context I 3.0103 1 0 +exampleBAM.bam.bam 27 TA Context M 6.9897 4 0 +exampleBAM.bam.bam 27 18 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 32 CC Context M 3.0103 1 
0 +exampleBAM.bam.bam 45 AAAATCTA Context I 3.0103 1 0 +exampleBAM.bam.bam 45 22 Cycle I 7.7815 5 0 +exampleBAM.bam.bam 45 26 Cycle D 7.7815 5 0 +exampleBAM.bam.bam 33 76 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 30 24 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 TTCTATTC Context D 3.0103 1 0 +exampleBAM.bam.bam 45 GTCAATGT Context I 3.0103 1 0 +exampleBAM.bam.bam 21 73 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 17 4 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 8 17 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 34 GA Context M 3.0103 1 0 +exampleBAM.bam.bam 45 ATCGTGAG Context I 3.0103 1 0 +exampleBAM.bam.bam 45 CCAGATCC Context I 3.0103 1 0 +exampleBAM.bam.bam 45 GATCGTGA Context D 3.0103 1 0 +exampleBAM.bam.bam 45 52 Cycle I 7.7815 5 0 +exampleBAM.bam.bam 45 56 Cycle D 7.7815 5 0 +exampleBAM.bam.bam 9 TC Context M 3.0103 1 0 +exampleBAM.bam.bam 23 CT Context M 4.7712 2 0 +exampleBAM.bam.bam 31 26 Cycle M 4.7712 2 0 +exampleBAM.bam.bam 45 ATGTGAAC Context D 3.0103 1 0 +exampleBAM.bam.bam 45 ATTACTCT Context I 3.0103 1 0 +exampleBAM.bam.bam 45 ACACAGCA Context D 3.0103 1 0 +exampleBAM.bam.bam 26 TT Context M 3.0103 1 0 +exampleBAM.bam.bam 45 GGGTTTGG Context D 4.7712 2 0 +exampleBAM.bam.bam 33 8 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 21 GT Context M 4.7712 2 0 +exampleBAM.bam.bam 34 74 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 ATTCTTAA Context I 3.0103 1 0 +exampleBAM.bam.bam 45 GAGCCTTT Context D 3.0103 1 0 +exampleBAM.bam.bam 20 GC Context M 3.0103 1 0 +exampleBAM.bam.bam 45 GGTTAGGG Context D 4.7712 2 0 +exampleBAM.bam.bam 33 42 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 GTGCAAAG Context I 3.0103 1 0 +exampleBAM.bam.bam 6 75 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 27 TC Context M 3.0103 1 0 +exampleBAM.bam.bam 32 CA Context M 4.7712 2 0 +exampleBAM.bam.bam 29 60 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 34 13 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 34 GT Context M 4.7712 2 0 +exampleBAM.bam.bam 21 74 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 GTTAATGA Context I 3.0103 1 0 +exampleBAM.bam.bam 
45 TATTATTG Context D 3.0103 1 0 +exampleBAM.bam.bam 24 52 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 CTTTCAGG Context I 3.0103 1 0 +exampleBAM.bam.bam 45 GACATGGT Context D 3.0103 1 0 +exampleBAM.bam.bam 45 ATCATGGT Context D 3.0103 1 0 +exampleBAM.bam.bam 45 21 Cycle I 7.7815 5 0 +exampleBAM.bam.bam 45 25 Cycle D 7.7815 5 0 +exampleBAM.bam.bam 34 47 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 31 25 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 19 71 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 6 GG Context M 3.9794 4 1 +exampleBAM.bam.bam 9 16 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 TCCAGTTC Context I 3.0103 1 0 +exampleBAM.bam.bam 45 TTCACATG Context D 3.0103 1 0 +exampleBAM.bam.bam 45 TAAGTGAC Context I 3.0103 1 0 +exampleBAM.bam.bam 45 GTGACATG Context D 3.0103 1 0 +exampleBAM.bam.bam 45 55 Cycle I 7.7815 5 0 +exampleBAM.bam.bam 45 59 Cycle D 7.7815 5 0 +exampleBAM.bam.bam 45 CATGATCG Context I 3.0103 1 0 +exampleBAM.bam.bam 16 AT Context M 3.0103 1 0 +exampleBAM.bam.bam 32 43 Cycle M 6.0206 3 0 +exampleBAM.bam.bam 19 33 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 21 GA Context M 4.7712 2 0 +exampleBAM.bam.bam 45 GTATTTGC Context D 3.0103 1 0 +exampleBAM.bam.bam 26 TA Context M 3.0103 1 0 +exampleBAM.bam.bam 45 TCTTAAGT Context D 3.0103 1 0 +exampleBAM.bam.bam 33 CC Context M 3.0103 1 0 +exampleBAM.bam.bam 11 20 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 28 61 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 18 1 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 ACCCAGAT Context I 3.0103 1 0 +exampleBAM.bam.bam 45 AAAGACAC Context I 3.0103 1 0 +exampleBAM.bam.bam 45 GCCTTTGC Context D 3.0103 1 0 +exampleBAM.bam.bam 27 16 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 27 TG Context M 4.7712 2 0 +exampleBAM.bam.bam 32 CT Context M 3.0103 1 0 +exampleBAM.bam.bam 21 44 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 TATTACTC Context I 3.0103 1 0 +exampleBAM.bam.bam 45 TGGGCTGG Context I 3.0103 1 0 +exampleBAM.bam.bam 16 65 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 34 GG Context M 4.7712 2 0 +exampleBAM.bam.bam 25 21 Cycle M 
3.0103 1 0 +exampleBAM.bam.bam 22 9 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 CAGGCCAC Context D 3.0103 1 0 +exampleBAM.bam.bam 45 20 Cycle I 7.7815 5 0 +exampleBAM.bam.bam 45 24 Cycle D 7.7815 5 0 +exampleBAM.bam.bam 30 26 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 TTGTATTT Context D 3.0103 1 0 +exampleBAM.bam.bam 24 53 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 23 CC Context M 3.0103 1 0 +exampleBAM.bam.bam 19 70 Cycle M -0.0000 1 1 +exampleBAM.bam.bam 25 55 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 AGGCCACC Context I 3.0103 1 0 +exampleBAM.bam.bam 45 54 Cycle I 7.7815 5 0 +exampleBAM.bam.bam 45 58 Cycle D 7.7815 5 0 +exampleBAM.bam.bam 45 ACTTTCAG Context I 3.0103 1 0 +exampleBAM.bam.bam 45 AAAGTGCA Context D 3.0103 1 0 +exampleBAM.bam.bam 45 ATTGATAT Context D 3.0103 1 0 +exampleBAM.bam.bam 45 AATGTGAA Context I 3.0103 1 0 +exampleBAM.bam.bam 9 TT Context M 3.0103 1 0 +exampleBAM.bam.bam 19 32 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 29 28 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 CGGGTTTG Context I 4.7712 2 0 +exampleBAM.bam.bam 45 TCTTTGTA Context I 3.0103 1 0 +exampleBAM.bam.bam 33 10 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 33 CA Context M 4.7712 2 0 +exampleBAM.bam.bam 45 GTTCGGGT Context I 6.0206 3 0 +exampleBAM.bam.bam 27 TT Context M 4.7712 2 0 +exampleBAM.bam.bam 27 17 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 CAGCAAAA Context I 3.0103 1 0 +exampleBAM.bam.bam 45 GGCAGCCT Context I 3.0103 1 0 +exampleBAM.bam.bam 20 GT Context M -0.0000 1 1 +exampleBAM.bam.bam 45 TGGAGCCT Context I 3.0103 1 0 +exampleBAM.bam.bam 45 TGGTGGCC Context I 3.0103 1 0 +exampleBAM.bam.bam 28 30 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 33 40 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 24 TG Context M 4.7712 2 0 +exampleBAM.bam.bam 45 TGTGTCTT Context I 3.0103 1 0 +exampleBAM.bam.bam 45 TCAATAAT Context I 3.0103 1 0 +exampleBAM.bam.bam 45 TCTCCAGG Context I 3.0103 1 0 +exampleBAM.bam.bam 45 49 Cycle I 7.7815 5 0 +exampleBAM.bam.bam 45 61 Cycle D 7.7815 5 0 +exampleBAM.bam.bam 45 CCTCGTCC Context 
D 3.0103 1 0 +exampleBAM.bam.bam 45 GGCACCCA Context I 3.0103 1 0 +exampleBAM.bam.bam 22 44 Cycle M 4.7712 2 0 +exampleBAM.bam.bam 45 AGGTTATC Context I 3.0103 1 0 +exampleBAM.bam.bam 34 41 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 19 65 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 23 12 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 23 GG Context M 3.0103 1 0 +exampleBAM.bam.bam 45 TTGGGTTC Context I 3.0103 1 0 +exampleBAM.bam.bam 45 TTCTGTGT Context D 3.0103 1 0 +exampleBAM.bam.bam 45 TGTTGGTT Context I 3.0103 1 0 +exampleBAM.bam.bam 24 50 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 GTTTCACA Context I 3.0103 1 0 +exampleBAM.bam.bam 45 TCGGGTTC Context I 3.0103 1 0 +exampleBAM.bam.bam 45 TAGGGTTC Context I 3.0103 1 0 +exampleBAM.bam.bam 33 73 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 9 52 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 19 Cycle I 7.7815 5 0 +exampleBAM.bam.bam 45 31 Cycle D 7.7815 5 0 +exampleBAM.bam.bam 25 TA Context M 6.0206 3 0 +exampleBAM.bam.bam 34 11 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 34 CC Context M 3.0103 1 0 +exampleBAM.bam.bam 28 25 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 TAGATTTT Context I 3.0103 1 0 +exampleBAM.bam.bam 45 GGTTGGGG Context I 4.7712 2 0 +exampleBAM.bam.bam 45 GGCTGGGG Context I 3.0103 1 0 +exampleBAM.bam.bam 45 GATTAGAT Context I 3.0103 1 0 +exampleBAM.bam.bam 5 GG Context M 3.0103 3 1 +exampleBAM.bam.bam 32 15 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 27 22 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 21 42 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 19 5 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 19 AT Context M 3.0103 1 0 +exampleBAM.bam.bam 45 TTTCAGGC Context D 3.0103 1 0 +exampleBAM.bam.bam 45 TGCCAGGC Context D 3.0103 1 0 +exampleBAM.bam.bam 45 GTCTTTAT Context I 3.0103 1 0 +exampleBAM.bam.bam 45 TGAACTGG Context I 3.0103 1 0 +exampleBAM.bam.bam 26 20 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 TATTCTTA Context D 3.0103 1 0 +exampleBAM.bam.bam 45 TGATAACC Context D 3.0103 1 0 +exampleBAM.bam.bam 45 ATTTTTCT Context D 3.0103 1 0 +exampleBAM.bam.bam 45 
GGCTTTAT Context I 3.0103 1 0 +exampleBAM.bam.bam 5 46 Cycle M -0.0000 1 1 +exampleBAM.bam.bam 29 27 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 ATCCATTT Context D 3.0103 1 0 +exampleBAM.bam.bam 45 48 Cycle I 7.7815 5 0 +exampleBAM.bam.bam 45 60 Cycle D 7.7815 5 0 +exampleBAM.bam.bam 45 GATCCAGT Context I 3.0103 1 0 +exampleBAM.bam.bam 45 AATGAGTC Context D 3.0103 1 0 +exampleBAM.bam.bam 24 TT Context M 3.0103 3 1 +exampleBAM.bam.bam 45 TCTTTATA Context I 3.0103 1 0 +exampleBAM.bam.bam 6 CC Context M 3.0103 1 0 +exampleBAM.bam.bam 23 GT Context M 4.7712 2 0 +exampleBAM.bam.bam 34 40 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 18 Cycle I 7.7815 5 0 +exampleBAM.bam.bam 45 30 Cycle D 7.7815 5 0 +exampleBAM.bam.bam 45 CAAAATCT Context I 3.0103 1 0 +exampleBAM.bam.bam 22 15 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 CCAGGTTA Context I 3.0103 1 0 +exampleBAM.bam.bam 45 TCATGGTG Context I 3.0103 1 0 +exampleBAM.bam.bam 45 TCTAATCT Context I 3.0103 1 0 +exampleBAM.bam.bam 45 TTGGGTTA Context I 3.0103 1 0 +exampleBAM.bam.bam 45 TAGGGTTA Context I 3.0103 1 0 +exampleBAM.bam.bam 45 GTTGGTTA Context I 3.0103 1 0 +exampleBAM.bam.bam 33 72 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 31 60 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 34 CA Context M 6.9897 4 0 +exampleBAM.bam.bam 45 CCCAGATC Context D 3.0103 1 0 +exampleBAM.bam.bam 18 36 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 16 70 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 TGTATTTG Context I 3.0103 1 0 +exampleBAM.bam.bam 33 46 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 GGTTGGGT Context I 3.0103 1 0 +exampleBAM.bam.bam 45 GTTTGGGT Context I 3.0103 1 0 +exampleBAM.bam.bam 45 TTCTAGAG Context I 3.0103 1 0 +exampleBAM.bam.bam 19 AG Context M 3.0103 1 0 +exampleBAM.bam.bam 32 GA Context M 4.7712 2 0 +exampleBAM.bam.bam 32 14 Cycle M 4.7712 2 0 +exampleBAM.bam.bam 12 62 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 33 12 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 GGTGGCCT Context I 3.0103 1 0 +exampleBAM.bam.bam 4 GC Context M 3.0103 1 0 +exampleBAM.bam.bam 
27 53 Cycle M 4.7712 2 0 +exampleBAM.bam.bam 23 GA Context M 3.0103 1 0 +exampleBAM.bam.bam 45 TTATTATT Context I 3.0103 1 0 +exampleBAM.bam.bam 5 74 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 ATGATAAC Context I 3.0103 1 0 +exampleBAM.bam.bam 45 51 Cycle I 7.7815 5 0 +exampleBAM.bam.bam 45 63 Cycle D 7.7815 5 0 +exampleBAM.bam.bam 45 CACCCAGA Context I 3.0103 1 0 +exampleBAM.bam.bam 45 CGTGAGTG Context D 3.0103 1 0 +exampleBAM.bam.bam 45 GCTTTATT Context I 3.0103 1 0 +exampleBAM.bam.bam 45 ATGGTGGC Context D 3.0103 1 0 +exampleBAM.bam.bam 34 CT Context M 4.7712 2 0 +exampleBAM.bam.bam 4 72 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 TCGGGTTT Context I 4.7712 2 0 +exampleBAM.bam.bam 24 48 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 TCCATGAT Context I 3.0103 1 0 +exampleBAM.bam.bam 45 CACATGAT Context I 3.0103 1 0 +exampleBAM.bam.bam 45 17 Cycle I 7.7815 5 0 +exampleBAM.bam.bam 45 29 Cycle D 7.7815 5 0 +exampleBAM.bam.bam 45 ATCAATAA Context D 3.0103 1 0 +exampleBAM.bam.bam 45 ACCATGAT Context I 3.0103 1 0 +exampleBAM.bam.bam 32 GT Context M 8.4510 6 0 +exampleBAM.bam.bam 19 7 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 33 45 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 28 27 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 TCCATTTC Context I 3.0103 1 0 +exampleBAM.bam.bam 45 GATAACCT Context D 3.0103 1 0 +exampleBAM.bam.bam 45 AACTGGGA Context I 3.0103 1 0 +exampleBAM.bam.bam 4 GG Context M 3.0103 1 0 +exampleBAM.bam.bam 33 GC Context M 3.0103 1 0 +exampleBAM.bam.bam 45 TCAGGCCA Context I 3.0103 1 0 +exampleBAM.bam.bam 45 TTGCACTT Context I 3.0103 1 0 +exampleBAM.bam.bam 45 TTCACTGA Context I 3.0103 1 0 +exampleBAM.bam.bam 45 CTCCAGGT Context D 3.0103 1 0 +exampleBAM.bam.bam 6 CT Context M 3.0103 1 0 +exampleBAM.bam.bam 23 15 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 25 51 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 32 72 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 34 42 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 GATATAAA Context I 3.0103 1 0 +exampleBAM.bam.bam 45 CTAGAGTT Context D 3.0103 1 0 
+exampleBAM.bam.bam 45 50 Cycle I 7.7815 5 0 +exampleBAM.bam.bam 45 62 Cycle D 7.7815 5 0 +exampleBAM.bam.bam 45 GCCACCAT Context D 3.0103 1 0 +exampleBAM.bam.bam 45 GGGTTCGG Context D 6.0206 3 0 +exampleBAM.bam.bam 24 TC Context M 6.0206 3 0 +exampleBAM.bam.bam 25 TT Context M 4.7712 2 0 +exampleBAM.bam.bam 45 16 Cycle I 7.7815 5 0 +exampleBAM.bam.bam 45 28 Cycle D 7.7815 5 0 +exampleBAM.bam.bam 45 ACATGGTA Context I 3.0103 1 0 +exampleBAM.bam.bam 16 34 Cycle M -0.0000 1 1 +exampleBAM.bam.bam 45 AATCTCCA Context D 3.0103 1 0 +exampleBAM.bam.bam 45 ATTTCACT Context I 3.0103 1 0 +exampleBAM.bam.bam 22 GT Context M 4.7712 2 0 +exampleBAM.bam.bam 45 ATATCAAT Context D 3.0103 1 0 +exampleBAM.bam.bam 45 CAATGTGA Context D 3.0103 1 0 +exampleBAM.bam.bam 45 GAGTCAAT Context D 3.0103 1 0 +exampleBAM.bam.bam 24 49 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 GGGGGTTG Context I 3.0103 1 0 +exampleBAM.bam.bam 45 TAGGGTTG Context I 3.0103 1 0 +exampleBAM.bam.bam 45 TGCAATCC Context I 3.0103 1 0 +exampleBAM.bam.bam 45 TGGGGTTG Context I 3.0103 1 0 +exampleBAM.bam.bam 45 TTAATGAG Context I 3.0103 1 0 +exampleBAM.bam.bam 30 30 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 23 75 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 32 GG Context M 7.7815 5 0 +exampleBAM.bam.bam 20 9 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 20 CT Context M 3.0103 1 0 +exampleBAM.bam.bam 45 ATTAGATT Context D 3.0103 1 0 +exampleBAM.bam.bam 33 44 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 TTTCTGTG Context I 3.0103 1 0 +exampleBAM.bam.bam 45 TGGAGATT Context D 3.0103 1 0 +exampleBAM.bam.bam 45 GTTTGGGC Context I 3.0103 1 0 +exampleBAM.bam.bam 21 11 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 29 24 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 32 46 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 27 55 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 ATATAAAG Context I 3.0103 1 0 +exampleBAM.bam.bam 45 GAGTTTCA Context D 3.0103 1 0 +exampleBAM.bam.bam 45 CACTTTCA Context D 3.0103 1 0 +exampleBAM.bam.bam 45 CCATTTCA Context D 3.0103 1 0 +exampleBAM.bam.bam 
45 CCAGGCAC Context D 3.0103 1 0 +exampleBAM.bam.bam 11 TT Context M -0.0000 1 1 +exampleBAM.bam.bam 45 TTTCACTG Context I 3.0103 1 0 +exampleBAM.bam.bam 33 GA Context M 3.0103 1 0 +exampleBAM.bam.bam 45 TCGTGAGT Context I 3.0103 1 0 +exampleBAM.bam.bam 45 TACTCTTT Context D 3.0103 1 0 +exampleBAM.bam.bam 45 TAATGAGT Context I 3.0103 1 0 +exampleBAM.bam.bam 45 GTGTCTTT Context D 3.0103 1 0 +exampleBAM.bam.bam 45 GGCTTTAT Context D 3.0103 1 0 +exampleBAM.bam.bam 22 70 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 ATTTTTCT Context I 3.0103 1 0 +exampleBAM.bam.bam 45 TGCCAGGC Context I 3.0103 1 0 +exampleBAM.bam.bam 33 1 Cycle M 4.7712 2 0 +exampleBAM.bam.bam 45 TTTCAGGC Context I 3.0103 1 0 +exampleBAM.bam.bam 45 TATTCTTA Context I 3.0103 1 0 +exampleBAM.bam.bam 45 TGATAACC Context I 3.0103 1 0 +exampleBAM.bam.bam 45 GTCTTTAT Context D 3.0103 1 0 +exampleBAM.bam.bam 45 TGAACTGG Context D 3.0103 1 0 +exampleBAM.bam.bam 21 AG Context M 4.7712 2 0 +exampleBAM.bam.bam 32 33 Cycle M 4.7712 2 0 +exampleBAM.bam.bam 27 56 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 GGCTGGGG Context D 3.0103 1 0 +exampleBAM.bam.bam 45 GATTAGAT Context D 3.0103 1 0 +exampleBAM.bam.bam 33 35 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 TAGATTTT Context D 3.0103 1 0 +exampleBAM.bam.bam 45 GGTTGGGG Context D 4.7712 2 0 +exampleBAM.bam.bam 19 CT Context M 1.7609 2 1 +exampleBAM.bam.bam 45 19 Cycle D 7.7815 5 0 +exampleBAM.bam.bam 45 31 Cycle I 7.7815 5 0 +exampleBAM.bam.bam 45 TGTTGGTT Context D 3.0103 1 0 +exampleBAM.bam.bam 45 TTCTGTGT Context I 3.0103 1 0 +exampleBAM.bam.bam 24 62 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 TCGGGTTC Context D 3.0103 1 0 +exampleBAM.bam.bam 45 GTTTCACA Context D 3.0103 1 0 +exampleBAM.bam.bam 45 TAGGGTTC Context D 3.0103 1 0 +exampleBAM.bam.bam 45 TTGGGTTC Context D 3.0103 1 0 +exampleBAM.bam.bam 30 TT Context M 4.7712 2 0 +exampleBAM.bam.bam 30 17 Cycle M 4.7712 2 0 +exampleBAM.bam.bam 33 69 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 6 36 Cycle M 3.0103 1 0 
+exampleBAM.bam.bam 17 GT Context M 3.0103 1 0 +exampleBAM.bam.bam 21 64 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 34 AC Context M 3.0103 1 0 +exampleBAM.bam.bam 16 GC Context M 3.0103 1 0 +exampleBAM.bam.bam 45 CCTCGTCC Context I 3.0103 1 0 +exampleBAM.bam.bam 45 49 Cycle D 7.7815 5 0 +exampleBAM.bam.bam 45 61 Cycle I 7.7815 5 0 +exampleBAM.bam.bam 45 AGGTTATC Context D 3.0103 1 0 +exampleBAM.bam.bam 45 GGCACCCA Context D 3.0103 1 0 +exampleBAM.bam.bam 45 TGTGTCTT Context D 3.0103 1 0 +exampleBAM.bam.bam 45 TCAATAAT Context D 3.0103 1 0 +exampleBAM.bam.bam 45 TCTCCAGG Context D 3.0103 1 0 +exampleBAM.bam.bam 6 AA Context M 4.7712 2 0 +exampleBAM.bam.bam 31 TC Context M 3.0103 1 0 +exampleBAM.bam.bam 31 19 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 8 58 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 28 54 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 GGTGGCCT Context D 3.0103 1 0 +exampleBAM.bam.bam 18 10 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 18 CA Context M 4.7712 2 0 +exampleBAM.bam.bam 27 57 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 21 AT Context M 3.0103 1 0 +exampleBAM.bam.bam 45 TGTATTTG Context D 3.0103 1 0 +exampleBAM.bam.bam 45 TTCTAGAG Context D 3.0103 1 0 +exampleBAM.bam.bam 45 GGTTGGGT Context D 3.0103 1 0 +exampleBAM.bam.bam 45 GTTTGGGT Context D 3.0103 1 0 +exampleBAM.bam.bam 13 TA Context M 3.0103 1 0 +exampleBAM.bam.bam 20 AC Context M 3.0103 1 0 +exampleBAM.bam.bam 45 CCCAGATC Context I 3.0103 1 0 +exampleBAM.bam.bam 32 2 Cycle M 4.7712 2 0 +exampleBAM.bam.bam 27 27 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 6 67 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 TAGGGTTA Context D 3.0103 1 0 +exampleBAM.bam.bam 45 GTTGGTTA Context D 3.0103 1 0 +exampleBAM.bam.bam 45 TCATGGTG Context D 3.0103 1 0 +exampleBAM.bam.bam 45 TCTAATCT Context D 3.0103 1 0 +exampleBAM.bam.bam 45 TTGGGTTA Context D 3.0103 1 0 +exampleBAM.bam.bam 30 TG Context M 3.0103 1 0 +exampleBAM.bam.bam 45 18 Cycle D 7.7815 5 0 +exampleBAM.bam.bam 45 30 Cycle I 7.7815 5 0 +exampleBAM.bam.bam 45 CCAGGTTA Context D 
3.0103 1 0 +exampleBAM.bam.bam 45 CAAAATCT Context D 3.0103 1 0 +exampleBAM.bam.bam 25 31 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 34 6 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 34 AA Context M 3.0103 1 0 +exampleBAM.bam.bam 17 GG Context M 3.0103 1 0 +exampleBAM.bam.bam 23 35 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 TCTTTATA Context D 3.0103 1 0 +exampleBAM.bam.bam 45 GATCCAGT Context D 3.0103 1 0 +exampleBAM.bam.bam 45 48 Cycle D 7.7815 5 0 +exampleBAM.bam.bam 45 60 Cycle I 7.7815 5 0 +exampleBAM.bam.bam 45 ATCCATTT Context I 3.0103 1 0 +exampleBAM.bam.bam 45 AATGAGTC Context I 3.0103 1 0 +exampleBAM.bam.bam 31 TA Context M 4.7712 2 0 +exampleBAM.bam.bam 21 AA Context M 3.0103 1 0 +exampleBAM.bam.bam 34 65 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 CTCCAGGT Context I 3.0103 1 0 +exampleBAM.bam.bam 18 CT Context M 3.0103 1 0 +exampleBAM.bam.bam 33 3 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 TCAGGCCA Context D 3.0103 1 0 +exampleBAM.bam.bam 45 TTGCACTT Context D 3.0103 1 0 +exampleBAM.bam.bam 28 53 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 TTCACTGA Context D 3.0103 1 0 +exampleBAM.bam.bam 19 CC Context M 3.0103 1 0 +exampleBAM.bam.bam 32 1 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 GATAACCT Context I 3.0103 1 0 +exampleBAM.bam.bam 45 AACTGGGA Context D 3.0103 1 0 +exampleBAM.bam.bam 16 73 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 TCCATTTC Context D 3.0103 1 0 +exampleBAM.bam.bam 21 66 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 34 5 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 34 AT Context M 8.4510 6 0 +exampleBAM.bam.bam 16 47 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 CACATGAT Context D 3.0103 1 0 +exampleBAM.bam.bam 45 17 Cycle D 7.7815 5 0 +exampleBAM.bam.bam 45 29 Cycle I 7.7815 5 0 +exampleBAM.bam.bam 45 ATCAATAA Context I 3.0103 1 0 +exampleBAM.bam.bam 45 ACCATGAT Context D 3.0103 1 0 +exampleBAM.bam.bam 45 TCGGGTTT Context D 4.7712 2 0 +exampleBAM.bam.bam 45 TCCATGAT Context D 3.0103 1 0 +exampleBAM.bam.bam 6 AG Context M -0.0000 1 1 +exampleBAM.bam.bam 6 4 Cycle M 3.0103 1 
0 +exampleBAM.bam.bam 31 TT Context M 3.0103 1 0 +exampleBAM.bam.bam 45 ATGATAAC Context D 3.0103 1 0 +exampleBAM.bam.bam 45 51 Cycle D 7.7815 5 0 +exampleBAM.bam.bam 45 63 Cycle I 7.7815 5 0 +exampleBAM.bam.bam 45 CGTGAGTG Context I 3.0103 1 0 +exampleBAM.bam.bam 45 CACCCAGA Context D 3.0103 1 0 +exampleBAM.bam.bam 16 GT Context M 3.0103 1 0 +exampleBAM.bam.bam 5 70 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 GCTTTATT Context D 3.0103 1 0 +exampleBAM.bam.bam 45 ATGGTGGC Context I 3.0103 1 0 +exampleBAM.bam.bam 45 TTATTATT Context D 3.0103 1 0 +exampleBAM.bam.bam 34 64 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 21 AC Context M 6.0206 3 0 +exampleBAM.bam.bam 33 2 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 TTTCACTG Context D 3.0103 1 0 +exampleBAM.bam.bam 45 TCGTGAGT Context D 3.0103 1 0 +exampleBAM.bam.bam 45 GTGTCTTT Context I 3.0103 1 0 +exampleBAM.bam.bam 45 TAATGAGT Context D 3.0103 1 0 +exampleBAM.bam.bam 45 TACTCTTT Context I 3.0103 1 0 +exampleBAM.bam.bam 45 CACTTTCA Context I 3.0103 1 0 +exampleBAM.bam.bam 45 CCATTTCA Context I 3.0103 1 0 +exampleBAM.bam.bam 45 ATATAAAG Context D 3.0103 1 0 +exampleBAM.bam.bam 45 GAGTTTCA Context I 3.0103 1 0 +exampleBAM.bam.bam 45 CCAGGCAC Context I 3.0103 1 0 +exampleBAM.bam.bam 29 54 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 6 65 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 19 10 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 19 CA Context M 4.7712 2 0 +exampleBAM.bam.bam 45 TTTCTGTG Context D 3.0103 1 0 +exampleBAM.bam.bam 33 32 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 GTTTGGGC Context D 3.0103 1 0 +exampleBAM.bam.bam 45 TGGAGATT Context I 3.0103 1 0 +exampleBAM.bam.bam 45 ATTAGATT Context I 3.0103 1 0 +exampleBAM.bam.bam 34 4 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 21 67 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 TGGGGTTG Context D 3.0103 1 0 +exampleBAM.bam.bam 45 TGCAATCC Context D 3.0103 1 0 +exampleBAM.bam.bam 45 GGGGGTTG Context D 3.0103 1 0 +exampleBAM.bam.bam 45 TAGGGTTG Context D 3.0103 1 0 +exampleBAM.bam.bam 45 TTAATGAG Context D 
3.0103 1 0 +exampleBAM.bam.bam 30 18 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 30 TA Context M 6.9897 4 0 +exampleBAM.bam.bam 45 16 Cycle D 7.7815 5 0 +exampleBAM.bam.bam 45 28 Cycle I 7.7815 5 0 +exampleBAM.bam.bam 45 ACATGGTA Context D 3.0103 1 0 +exampleBAM.bam.bam 45 GAGTCAAT Context I 3.0103 1 0 +exampleBAM.bam.bam 45 CAATGTGA Context I 3.0103 1 0 +exampleBAM.bam.bam 45 AATCTCCA Context I 3.0103 1 0 +exampleBAM.bam.bam 45 ATTTCACT Context D 3.0103 1 0 +exampleBAM.bam.bam 45 ATATCAAT Context I 3.0103 1 0 +exampleBAM.bam.bam 8 57 Cycle M -0.0000 1 1 +exampleBAM.bam.bam 34 38 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 31 16 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 31 TG Context M 6.0206 3 0 +exampleBAM.bam.bam 45 GGGTTCGG Context I 6.0206 3 0 +exampleBAM.bam.bam 45 CTAGAGTT Context I 3.0103 1 0 +exampleBAM.bam.bam 45 50 Cycle D 7.7815 5 0 +exampleBAM.bam.bam 45 62 Cycle I 7.7815 5 0 +exampleBAM.bam.bam 45 GATATAAA Context D 3.0103 1 0 +exampleBAM.bam.bam 45 GCCACCAT Context I 3.0103 1 0 +exampleBAM.bam.bam 45 ACCTGGAG Context I 3.0103 1 0 +exampleBAM.bam.bam 5 AG Context M 3.0103 1 0 +exampleBAM.bam.bam 45 AGGTGGAG Context I 3.0103 1 0 +exampleBAM.bam.bam 45 GCAAAATC Context D 3.0103 1 0 +exampleBAM.bam.bam 45 CACAGCAA Context I 3.0103 1 0 +exampleBAM.bam.bam 28 TT Context M 3.0103 1 0 +exampleBAM.bam.bam 33 39 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 19 GT Context M 3.0103 1 0 +exampleBAM.bam.bam 23 64 Cycle M 4.7712 2 0 +exampleBAM.bam.bam 27 30 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 32 AC Context M 3.0103 1 0 +exampleBAM.bam.bam 45 AAGTGACA Context D 3.0103 1 0 +exampleBAM.bam.bam 5 38 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 AGAGTTTC Context D 3.0103 1 0 +exampleBAM.bam.bam 45 AGTGACAT Context D 3.0103 1 0 +exampleBAM.bam.bam 45 GCCTGAAA Context I 3.0103 1 0 +exampleBAM.bam.bam 45 CTCTTTGT Context I 3.0103 1 0 +exampleBAM.bam.bam 33 AT Context M 4.7712 2 0 +exampleBAM.bam.bam 45 TGGCAGCC Context I 3.0103 1 0 +exampleBAM.bam.bam 4 AA Context M 3.0103 1 0 
+exampleBAM.bam.bam 29 TC Context M 3.0103 1 0 +exampleBAM.bam.bam 34 71 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 AGTTTCAC Context D 3.0103 1 0 +exampleBAM.bam.bam 45 CATTTCAC Context D 3.0103 1 0 +exampleBAM.bam.bam 45 53 Cycle D 7.7815 5 0 +exampleBAM.bam.bam 45 57 Cycle I 7.7815 5 0 +exampleBAM.bam.bam 45 CATGATAA Context I 3.0103 1 0 +exampleBAM.bam.bam 45 TAGAGTTT Context D 3.0103 1 0 +exampleBAM.bam.bam 45 GGTTCGGG Context D 6.0206 3 0 +exampleBAM.bam.bam 45 CTTTATTA Context I 3.0103 1 0 +exampleBAM.bam.bam 45 CTTTGTAT Context D 3.0103 1 0 +exampleBAM.bam.bam 45 AGCCTCGT Context I 3.0103 1 0 +exampleBAM.bam.bam 45 CTGTGTCT Context I 3.0103 1 0 +exampleBAM.bam.bam 45 CTTAAGTG Context I 3.0103 1 0 +exampleBAM.bam.bam 45 ATTCTATT Context D 3.0103 1 0 +exampleBAM.bam.bam 45 CTAATCTC Context D 3.0103 1 0 +exampleBAM.bam.bam 45 23 Cycle D 7.7815 5 0 +exampleBAM.bam.bam 45 27 Cycle I 7.7815 5 0 +exampleBAM.bam.bam 30 21 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 TGAAAGTG Context I 3.0103 1 0 +exampleBAM.bam.bam 45 TGGTATTA Context I 3.0103 1 0 +exampleBAM.bam.bam 23 38 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 34 3 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 GGTTAGGG Context I 4.7712 2 0 +exampleBAM.bam.bam 45 GTGCAAAG Context D 3.0103 1 0 +exampleBAM.bam.bam 28 TG Context M 6.0206 3 0 +exampleBAM.bam.bam 45 ATTCTTAA Context D 3.0103 1 0 +exampleBAM.bam.bam 45 GAGCCTTT Context I 3.0103 1 0 +exampleBAM.bam.bam 27 31 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 29 48 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 32 AA Context M 3.0103 1 0 +exampleBAM.bam.bam 19 GG Context M 4.7712 2 0 +exampleBAM.bam.bam 4 37 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 GGGTTTGG Context I 4.7712 2 0 +exampleBAM.bam.bam 33 AG Context M 6.0206 3 0 +exampleBAM.bam.bam 28 50 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 ATTACTCT Context D 3.0103 1 0 +exampleBAM.bam.bam 45 ACACAGCA Context I 3.0103 1 0 +exampleBAM.bam.bam 45 ATGTGAAC Context I 3.0103 1 0 +exampleBAM.bam.bam 32 36 Cycle M 4.7712 2 0 
+exampleBAM.bam.bam 29 TA Context M 4.7712 2 0 +exampleBAM.bam.bam 34 70 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 17 76 Cycle M -0.0000 1 1 +exampleBAM.bam.bam 30 54 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 24 25 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 ATCGTGAG Context D 3.0103 1 0 +exampleBAM.bam.bam 45 GATCGTGA Context I 3.0103 1 0 +exampleBAM.bam.bam 45 52 Cycle D 7.7815 5 0 +exampleBAM.bam.bam 45 56 Cycle I 7.7815 5 0 +exampleBAM.bam.bam 45 CCAGATCC Context D 3.0103 1 0 +exampleBAM.bam.bam 16 CA Context M 3.0103 1 0 +exampleBAM.bam.bam 8 63 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 14 TG Context M 3.0103 1 0 +exampleBAM.bam.bam 23 AT Context M 6.0206 3 0 +exampleBAM.bam.bam 19 72 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 30 20 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 TTCTATTC Context I 3.0103 1 0 +exampleBAM.bam.bam 45 GTCAATGT Context D 3.0103 1 0 +exampleBAM.bam.bam 45 AAAATCTA Context D 3.0103 1 0 +exampleBAM.bam.bam 45 22 Cycle D 7.7815 5 0 +exampleBAM.bam.bam 45 26 Cycle I 7.7815 5 0 +exampleBAM.bam.bam 34 2 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 19 GC Context M 3.0103 1 0 +exampleBAM.bam.bam 6 68 Cycle M -0.0000 1 1 +exampleBAM.bam.bam 23 66 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 27 28 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 32 AT Context M 4.7712 2 0 +exampleBAM.bam.bam 5 AA Context M 3.0103 1 0 +exampleBAM.bam.bam 45 TATTACTC Context D 3.0103 1 0 +exampleBAM.bam.bam 33 37 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 TGGGCTGG Context D 3.0103 1 0 +exampleBAM.bam.bam 28 TC Context M 3.0103 1 0 +exampleBAM.bam.bam 4 AG Context M 3.0103 1 0 +exampleBAM.bam.bam 29 TT Context M 4.7712 2 0 +exampleBAM.bam.bam 18 GT Context M 3.0103 1 0 +exampleBAM.bam.bam 45 AAAGACAC Context D 3.0103 1 0 +exampleBAM.bam.bam 45 GCCTTTGC Context I 3.0103 1 0 +exampleBAM.bam.bam 45 ACCCAGAT Context D 3.0103 1 0 +exampleBAM.bam.bam 45 TCTTAAGT Context I 3.0103 1 0 +exampleBAM.bam.bam 13 55 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 GTATTTGC Context I 3.0103 1 0 +exampleBAM.bam.bam 33 7 Cycle 
M 3.0103 1 0 +exampleBAM.bam.bam 33 AC Context M 3.0103 1 0 +exampleBAM.bam.bam 23 AA Context M 3.0103 1 0 +exampleBAM.bam.bam 8 60 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 22 38 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 CATGATCG Context D 3.0103 1 0 +exampleBAM.bam.bam 45 55 Cycle D 7.7815 5 0 +exampleBAM.bam.bam 45 59 Cycle I 7.7815 5 0 +exampleBAM.bam.bam 45 TCCAGTTC Context D 3.0103 1 0 +exampleBAM.bam.bam 45 GTGACATG Context I 3.0103 1 0 +exampleBAM.bam.bam 45 TTCACATG Context I 3.0103 1 0 +exampleBAM.bam.bam 45 TAAGTGAC Context D 3.0103 1 0 +exampleBAM.bam.bam 4 64 Cycle M -0.0000 1 1 +exampleBAM.bam.bam 25 24 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 22 AG Context M 4.7712 2 0 +exampleBAM.bam.bam 45 CTTTCAGG Context D 3.0103 1 0 +exampleBAM.bam.bam 45 ATCATGGT Context I 3.0103 1 0 +exampleBAM.bam.bam 45 21 Cycle D 7.7815 5 0 +exampleBAM.bam.bam 45 25 Cycle I 7.7815 5 0 +exampleBAM.bam.bam 45 GACATGGT Context I 3.0103 1 0 +exampleBAM.bam.bam 30 23 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 33 67 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 24 56 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 TATTATTG Context I 3.0103 1 0 +exampleBAM.bam.bam 45 GTTAATGA Context D 3.0103 1 0 +exampleBAM.bam.bam 32 AG Context M 3.0103 1 0 +exampleBAM.bam.bam 23 67 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 TGGAGCCT Context D 3.0103 1 0 +exampleBAM.bam.bam 45 TGGTGGCC Context D 3.0103 1 0 +exampleBAM.bam.bam 28 TA Context M 3.0103 1 0 +exampleBAM.bam.bam 45 CAGCAAAA Context D 3.0103 1 0 +exampleBAM.bam.bam 45 GGCAGCCT Context D 3.0103 1 0 +exampleBAM.bam.bam 34 68 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 21 3 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 TCTTTGTA Context D 3.0103 1 0 +exampleBAM.bam.bam 45 GTTCGGGT Context D 6.0206 3 0 +exampleBAM.bam.bam 28 48 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 33 AA Context M 3.0103 1 0 +exampleBAM.bam.bam 18 GG Context M 3.0103 1 0 +exampleBAM.bam.bam 45 CGGGTTTG Context D 4.7712 2 0 +exampleBAM.bam.bam 34 34 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 23 AC Context M 
3.0103 1 0 +exampleBAM.bam.bam 30 52 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 24 27 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 AGGCCACC Context D 3.0103 1 0 +exampleBAM.bam.bam 20 69 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 AAAGTGCA Context I 3.0103 1 0 +exampleBAM.bam.bam 45 ATTGATAT Context I 3.0103 1 0 +exampleBAM.bam.bam 45 AATGTGAA Context D 3.0103 1 0 +exampleBAM.bam.bam 45 54 Cycle D 7.7815 5 0 +exampleBAM.bam.bam 45 58 Cycle I 7.7815 5 0 +exampleBAM.bam.bam 45 ACTTTCAG Context D 3.0103 1 0 +exampleBAM.bam.bam 23 37 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 21 71 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 33 66 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 15 TG Context M 3.0103 1 0 +exampleBAM.bam.bam 45 TTGTATTT Context I 3.0103 1 0 +exampleBAM.bam.bam 45 20 Cycle D 7.7815 5 0 +exampleBAM.bam.bam 45 24 Cycle I 7.7815 5 0 +exampleBAM.bam.bam 45 CAGGCCAC Context I 3.0103 1 0 +exampleBAM.bam.bam 23 59 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 17 20 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 30 CG Context M 3.0103 1 0 +exampleBAM.bam.bam 45 TTGATATA Context I 3.0103 1 0 +exampleBAM.bam.bam 45 TTCTTAAG Context I 3.0103 1 0 +exampleBAM.bam.bam 15 14 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 GAACTGGG Context D 3.0103 1 0 +exampleBAM.bam.bam 45 6 Cycle I 7.7815 5 0 +exampleBAM.bam.bam 45 10 Cycle D 7.7815 5 0 +exampleBAM.bam.bam 45 GGGCTGGG Context D 3.0103 1 0 +exampleBAM.bam.bam 31 10 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 34 60 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 25 37 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 6 31 Cycle M -0.0000 1 1 +exampleBAM.bam.bam 30 42 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 GTTCTAGA Context D 3.0103 1 0 +exampleBAM.bam.bam 45 TATTTGCA Context D 3.0103 1 0 +exampleBAM.bam.bam 24 5 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 CCTTTGCA Context D 3.0103 1 0 +exampleBAM.bam.bam 45 CAGGCACC Context I 3.0103 1 0 +exampleBAM.bam.bam 45 36 Cycle I 7.7815 5 0 +exampleBAM.bam.bam 45 40 Cycle D 7.7815 5 0 +exampleBAM.bam.bam 29 GA Context M 4.7712 2 0 +exampleBAM.bam.bam 
21 29 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 TAATCTCC Context I 3.0103 1 0 +exampleBAM.bam.bam 15 74 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 TTGGGGGT Context I 3.0103 1 0 +exampleBAM.bam.bam 33 24 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 GTTGGGGT Context I 3.0103 1 0 +exampleBAM.bam.bam 45 GCTGGGGT Context I 3.0103 1 0 +exampleBAM.bam.bam 45 66 Cycle I 7.7815 5 0 +exampleBAM.bam.bam 45 CTTGGCTT Context D 3.0103 1 0 +exampleBAM.bam.bam 45 GGCCACCA Context D 3.0103 1 0 +exampleBAM.bam.bam 19 TG Context M 4.7712 2 0 +exampleBAM.bam.bam 45 TTCAGGCC Context I 3.0103 1 0 +exampleBAM.bam.bam 45 GGTTAATG Context I 3.0103 1 0 +exampleBAM.bam.bam 45 GGTGGAGC Context I 3.0103 1 0 +exampleBAM.bam.bam 28 GG Context M 6.0206 3 0 +exampleBAM.bam.bam 45 GAGATTAG Context I 3.0103 1 0 +exampleBAM.bam.bam 45 7 Cycle I 7.7815 5 0 +exampleBAM.bam.bam 45 11 Cycle D 7.7815 5 0 +exampleBAM.bam.bam 45 TTACTCTT Context I 3.0103 1 0 +exampleBAM.bam.bam 30 9 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 TTTATATC Context I 3.0103 1 0 +exampleBAM.bam.bam 45 TGGTTAAT Context I 3.0103 1 0 +exampleBAM.bam.bam 45 GTATTACT Context D 3.0103 1 0 +exampleBAM.bam.bam 31 11 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 31 CC Context M 3.0103 1 0 +exampleBAM.bam.bam 34 61 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 25 36 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 ACAGCAAA Context D 3.0103 1 0 +exampleBAM.bam.bam 45 AGTGCAAA Context D 3.0103 1 0 +exampleBAM.bam.bam 45 37 Cycle I 7.7815 5 0 +exampleBAM.bam.bam 45 41 Cycle D 7.7815 5 0 +exampleBAM.bam.bam 45 TCCAGGTT Context I 3.0103 1 0 +exampleBAM.bam.bam 45 GTGAGTGT Context D 3.0103 1 0 +exampleBAM.bam.bam 45 TTATCATG Context D 3.0103 1 0 +exampleBAM.bam.bam 24 AG Context M 4.7712 2 0 +exampleBAM.bam.bam 29 GC Context M 3.0103 1 0 +exampleBAM.bam.bam 32 57 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 67 Cycle I 7.7815 5 0 +exampleBAM.bam.bam 18 19 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 CTGGAGAT Context I 3.0103 1 0 +exampleBAM.bam.bam 45 AGATTTTT Context I 3.0103 1 
0 +exampleBAM.bam.bam 45 AAATCTAA Context D 3.0103 1 0 +exampleBAM.bam.bam 45 CTGAAAGT Context D 3.0103 1 0 +exampleBAM.bam.bam 45 AGGCACCC Context D 3.0103 1 0 +exampleBAM.bam.bam 45 TCTGTGTC Context I 3.0103 1 0 +exampleBAM.bam.bam 45 TTGGGCTG Context D 3.0103 1 0 +exampleBAM.bam.bam 28 47 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 GTTGGGGG Context I 3.0103 1 0 +exampleBAM.bam.bam 19 TT Context M 4.7712 2 0 +exampleBAM.bam.bam 29 45 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 CCTGGAGA Context I 3.0103 1 0 +exampleBAM.bam.bam 45 ATGATTCT Context D 3.0103 1 0 +exampleBAM.bam.bam 45 GCCAGGCA Context I 3.0103 1 0 +exampleBAM.bam.bam 45 TTTATTAT Context I 3.0103 1 0 +exampleBAM.bam.bam 33 59 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 TCTATTCT Context D 3.0103 1 0 +exampleBAM.bam.bam 45 TAACCTGG Context I 3.0103 1 0 +exampleBAM.bam.bam 30 CA Context M 6.0206 3 0 +exampleBAM.bam.bam 15 GG Context M 4.7712 2 0 +exampleBAM.bam.bam 45 GACACAGC Context I 3.0103 1 0 +exampleBAM.bam.bam 45 AACCTGGA Context D 3.0103 1 0 +exampleBAM.bam.bam 45 4 Cycle I 7.7815 5 0 +exampleBAM.bam.bam 45 8 Cycle D 7.7815 5 0 +exampleBAM.bam.bam 25 AT Context M 4.7712 2 0 +exampleBAM.bam.bam 6 63 Cycle M 4.7712 2 0 +exampleBAM.bam.bam 45 TTTGCAAT Context D 3.0103 1 0 +exampleBAM.bam.bam 45 TTTGCACT Context I 3.0103 1 0 +exampleBAM.bam.bam 45 TTAAGTGA Context D 3.0103 1 0 +exampleBAM.bam.bam 45 TGAGTCAA Context I 3.0103 1 0 +exampleBAM.bam.bam 22 59 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 CTCGTCCA Context D 3.0103 1 0 +exampleBAM.bam.bam 45 38 Cycle I 7.7815 5 0 +exampleBAM.bam.bam 45 42 Cycle D 7.7815 5 0 +exampleBAM.bam.bam 34 62 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 31 CG Context M 3.0103 1 0 +exampleBAM.bam.bam 31 8 Cycle M 4.7712 2 0 +exampleBAM.bam.bam 27 69 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 26 3 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 TATAAAGA Context D 3.0103 1 0 +exampleBAM.bam.bam 45 GGGGTTGG Context D 4.7712 2 0 +exampleBAM.bam.bam 45 64 Cycle I 7.7815 5 0 +exampleBAM.bam.bam 45 
76 Cycle D 7.7815 5 0 +exampleBAM.bam.bam 45 GATTCTAT Context D 3.0103 1 0 +exampleBAM.bam.bam 45 AGACACAG Context I 3.0103 1 0 +exampleBAM.bam.bam 45 AGGGTTGG Context D 3.0103 1 0 +exampleBAM.bam.bam 45 AGTGTTGG Context D 3.0103 1 0 +exampleBAM.bam.bam 29 12 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 29 GG Context M 6.9897 4 0 +exampleBAM.bam.bam 8 71 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 GTGAACTG Context I 3.0103 1 0 +exampleBAM.bam.bam 45 TTGGCTTT Context D 3.0103 1 0 +exampleBAM.bam.bam 9 69 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 CCTGAAAG Context I 3.0103 1 0 +exampleBAM.bam.bam 45 CTTTGCAC Context D 3.0103 1 0 +exampleBAM.bam.bam 20 29 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 12 40 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 32 24 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 21 61 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 CATGGTAT Context I 3.0103 1 0 +exampleBAM.bam.bam 45 GCACCCAG Context D 3.0103 1 0 +exampleBAM.bam.bam 16 55 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 ATGATCGT Context D 3.0103 1 0 +exampleBAM.bam.bam 45 5 Cycle I 7.7815 5 0 +exampleBAM.bam.bam 45 9 Cycle D 7.7815 5 0 +exampleBAM.bam.bam 30 CC Context M 4.7712 2 0 +exampleBAM.bam.bam 23 56 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 6 62 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 31 43 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 25 AG Context M 3.0103 1 0 +exampleBAM.bam.bam 45 ATAACCTG Context D 3.0103 1 0 +exampleBAM.bam.bam 45 39 Cycle I 7.7815 5 0 +exampleBAM.bam.bam 45 43 Cycle D 7.7815 5 0 +exampleBAM.bam.bam 45 GAAAGTGC Context D 3.0103 1 0 +exampleBAM.bam.bam 24 AA Context M 3.0103 1 0 +exampleBAM.bam.bam 24 6 Cycle M 4.7712 2 0 +exampleBAM.bam.bam 45 TTATTGAT Context I 3.0103 1 0 +exampleBAM.bam.bam 34 63 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 31 CT Context M 3.0103 1 0 +exampleBAM.bam.bam 45 65 Cycle I 7.7815 5 0 +exampleBAM.bam.bam 18 TT Context M -0.0000 1 1 +exampleBAM.bam.bam 45 GATTTTTC Context I 3.0103 1 0 +exampleBAM.bam.bam 45 AGTTCTAG Context D 3.0103 1 0 +exampleBAM.bam.bam 45 TAAAGACA Context I 
3.0103 1 0 +exampleBAM.bam.bam 45 TGAGTGTT Context I 3.0103 1 0 +exampleBAM.bam.bam 45 TTTCACAT Context I 3.0103 1 0 +exampleBAM.bam.bam 45 GTGGAGCC Context D 3.0103 1 0 +exampleBAM.bam.bam 19 49 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 29 GT Context M 4.7712 2 0 +exampleBAM.bam.bam 5 26 Cycle M -0.0000 1 1 +exampleBAM.bam.bam 45 AAGTGCAA Context D 3.0103 1 0 +exampleBAM.bam.bam 45 ATTTGCAA Context D 3.0103 1 0 +exampleBAM.bam.bam 45 ATCTAATC Context I 3.0103 1 0 +exampleBAM.bam.bam 20 28 Cycle M -0.0000 1 1 +exampleBAM.bam.bam 45 GGTATTAC Context I 3.0103 1 0 +exampleBAM.bam.bam 45 TGTGAACT Context D 3.0103 1 0 +exampleBAM.bam.bam 45 TGGCCTGA Context I 3.0103 1 0 +exampleBAM.bam.bam 33 57 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 21 60 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 29 47 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 34 56 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 31 GA Context M 4.7712 2 0 +exampleBAM.bam.bam 45 TCGTCCAT Context D 3.0103 1 0 +exampleBAM.bam.bam 45 TGATTCTA Context I 3.0103 1 0 +exampleBAM.bam.bam 45 ATCCAGTT Context D 3.0103 1 0 +exampleBAM.bam.bam 45 32 Cycle I 7.7815 5 0 +exampleBAM.bam.bam 45 44 Cycle D 7.7815 5 0 +exampleBAM.bam.bam 45 CATGATTC Context D 3.0103 1 0 +exampleBAM.bam.bam 45 CAATCCAT Context D 3.0103 1 0 +exampleBAM.bam.bam 45 CAGTTCTA Context I 3.0103 1 0 +exampleBAM.bam.bam 34 26 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 8 AT Context M -0.0000 1 1 +exampleBAM.bam.bam 45 GGGTTAGG Context D 4.7712 2 0 +exampleBAM.bam.bam 30 12 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 TATATCAA Context I 3.0103 1 0 +exampleBAM.bam.bam 45 GCAATCCA Context D 3.0103 1 0 +exampleBAM.bam.bam 45 GGAGCCTT Context D 3.0103 1 0 +exampleBAM.bam.bam 45 CAGATCCA Context D 3.0103 1 0 +exampleBAM.bam.bam 45 2 Cycle I 7.7815 5 0 +exampleBAM.bam.bam 45 14 Cycle D 7.7815 5 0 +exampleBAM.bam.bam 45 GAGTGTTG Context I 3.0103 1 0 +exampleBAM.bam.bam 32 30 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 27 AC Context M 3.0103 1 0 +exampleBAM.bam.bam 21 59 Cycle M 3.0103 1 0 
+exampleBAM.bam.bam 45 TGTCTTTA Context I 3.0103 1 0 +exampleBAM.bam.bam 45 TCAATGTG Context I 3.0103 1 0 +exampleBAM.bam.bam 45 TGGCTTTA Context I 3.0103 1 0 +exampleBAM.bam.bam 13 GA Context M 3.0103 1 0 +exampleBAM.bam.bam 45 CCATGATT Context D 3.0103 1 0 +exampleBAM.bam.bam 29 CA Context M 3.0103 1 0 +exampleBAM.bam.bam 19 54 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 TATCAATA Context I 3.0103 1 0 +exampleBAM.bam.bam 45 TTTGGGCT Context I 3.0103 1 0 +exampleBAM.bam.bam 45 TTGGTTAA Context I 3.0103 1 0 +exampleBAM.bam.bam 45 TGCACTTT Context D 3.0103 1 0 +exampleBAM.bam.bam 45 TCTAGAGT Context I 3.0103 1 0 +exampleBAM.bam.bam 26 AT Context M 3.0103 1 0 +exampleBAM.bam.bam 20 57 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 GCCTCGTC Context D 3.0103 1 0 +exampleBAM.bam.bam 45 70 Cycle I 7.7815 5 0 +exampleBAM.bam.bam 45 74 Cycle D 7.7815 5 0 +exampleBAM.bam.bam 18 22 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 25 32 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 27 66 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 31 15 Cycle M 4.7712 2 0 +exampleBAM.bam.bam 31 GC Context M 6.0206 3 0 +exampleBAM.bam.bam 45 33 Cycle I 7.7815 5 0 +exampleBAM.bam.bam 45 45 Cycle D 7.7815 5 0 +exampleBAM.bam.bam 45 GGAGATTA Context D 3.0103 1 0 +exampleBAM.bam.bam 45 AGATCCAG Context D 3.0103 1 0 +exampleBAM.bam.bam 16 19 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 ATGGTATT Context I 3.0103 1 0 +exampleBAM.bam.bam 45 ATCTCCAG Context D 3.0103 1 0 +exampleBAM.bam.bam 13 75 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 TTTGTATT Context I 3.0103 1 0 +exampleBAM.bam.bam 45 TATCATGG Context I 3.0103 1 0 +exampleBAM.bam.bam 45 TGACATGG Context I 3.0103 1 0 +exampleBAM.bam.bam 17 TT Context M 3.0103 3 1 +exampleBAM.bam.bam 31 45 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 8 AG Context M 4.7712 2 0 +exampleBAM.bam.bam 34 27 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 3 Cycle I 7.7815 5 0 +exampleBAM.bam.bam 45 15 Cycle D 7.7815 5 0 +exampleBAM.bam.bam 45 TTATATCA Context I 3.0103 1 0 +exampleBAM.bam.bam 45 TGATATAA Context D 
3.0103 1 0 +exampleBAM.bam.bam 45 GGTTATCA Context I 3.0103 1 0 +exampleBAM.bam.bam 45 TCACTGAT Context I 3.0103 1 0 +exampleBAM.bam.bam 45 GTGGCCTG Context D 3.0103 1 0 +exampleBAM.bam.bam 19 21 Cycle M 4.7712 2 0 +exampleBAM.bam.bam 32 31 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 27 AA Context M 3.0103 1 0 +exampleBAM.bam.bam 45 CACTGATG Context D 3.0103 1 0 +exampleBAM.bam.bam 45 ATAAAGAC Context I 3.0103 1 0 +exampleBAM.bam.bam 45 GCACTTTC Context I 3.0103 1 0 +exampleBAM.bam.bam 45 CAGCCTCG Context I 3.0103 1 0 +exampleBAM.bam.bam 28 CT Context M 4.7712 2 0 +exampleBAM.bam.bam 45 71 Cycle I 7.7815 5 0 +exampleBAM.bam.bam 45 75 Cycle D 7.7815 5 0 +exampleBAM.bam.bam 45 AGCAAAAT Context I 3.0103 1 0 +exampleBAM.bam.bam 45 TTGCAATC Context I 3.0103 1 0 +exampleBAM.bam.bam 33 29 Cycle M 4.7712 2 0 +exampleBAM.bam.bam 26 AG Context M 3.0103 1 0 +exampleBAM.bam.bam 45 GGTTTGGG Context D 4.7712 2 0 +exampleBAM.bam.bam 45 GGGTTGGG Context D 6.0206 3 0 +exampleBAM.bam.bam 24 3 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 TTTTTCTG Context I 3.0103 1 0 +exampleBAM.bam.bam 45 TTAGATTT Context D 3.0103 1 0 +exampleBAM.bam.bam 16 TG Context M 4.7712 2 0 +exampleBAM.bam.bam 45 34 Cycle I 7.7815 5 0 +exampleBAM.bam.bam 45 46 Cycle D 7.7815 5 0 +exampleBAM.bam.bam 45 ATGAGTCA Context D 3.0103 1 0 +exampleBAM.bam.bam 27 65 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 31 12 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 31 GG Context M 6.9897 4 0 +exampleBAM.bam.bam 34 58 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 24 33 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 15 8 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 26 67 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 30 GA Context M 4.7712 2 0 +exampleBAM.bam.bam 45 12 Cycle D 7.7815 5 0 +exampleBAM.bam.bam 45 GGCCTGAA Context I 3.0103 1 0 +exampleBAM.bam.bam 45 AGATTAGA Context D 3.0103 1 0 +exampleBAM.bam.bam 45 GCAGCCTC Context D 3.0103 1 0 +exampleBAM.bam.bam 45 CATGGTGG Context D 3.0103 1 0 +exampleBAM.bam.bam 45 AATCCATT Context D 3.0103 1 0 +exampleBAM.bam.bam 45 
CTTTATAT Context D 3.0103 1 0 +exampleBAM.bam.bam 29 76 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 23 61 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 28 CA Context M 4.7712 2 0 +exampleBAM.bam.bam 45 GTTAGGGT Context I 6.0206 3 0 +exampleBAM.bam.bam 45 ACTCTTTG Context I 3.0103 1 0 +exampleBAM.bam.bam 45 AGCCTTTG Context I 3.0103 1 0 +exampleBAM.bam.bam 45 ACATGATC Context D 3.0103 1 0 +exampleBAM.bam.bam 45 ATTATTGA Context D 3.0103 1 0 +exampleBAM.bam.bam 32 28 Cycle M 4.7712 2 0 +exampleBAM.bam.bam 29 42 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 27 AT Context M 6.9897 4 0 +exampleBAM.bam.bam 45 TGGGTTAG Context I 3.0103 1 0 +exampleBAM.bam.bam 45 TGGGTTCG Context D 3.0103 1 0 +exampleBAM.bam.bam 26 7 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 TTTTCTGT Context I 3.0103 1 0 +exampleBAM.bam.bam 45 AGGGTTAG Context I 3.0103 1 0 +exampleBAM.bam.bam 45 AGGGTTCG Context D 3.0103 1 0 +exampleBAM.bam.bam 45 CGGGTTCG Context D 3.0103 1 0 +exampleBAM.bam.bam 45 68 Cycle I 7.7815 5 0 +exampleBAM.bam.bam 45 72 Cycle D 7.7815 5 0 +exampleBAM.bam.bam 45 AGTCAATG Context I 3.0103 1 0 +exampleBAM.bam.bam 29 8 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 29 CG Context M 4.7712 2 0 +exampleBAM.bam.bam 4 29 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 16 TT Context M 3.9794 4 1 +exampleBAM.bam.bam 45 CACCATGA Context I 3.0103 1 0 +exampleBAM.bam.bam 45 35 Cycle I 7.7815 5 0 +exampleBAM.bam.bam 45 47 Cycle D 7.7815 5 0 +exampleBAM.bam.bam 45 CTATTCTT Context I 3.0103 1 0 +exampleBAM.bam.bam 45 AATCTAAT Context I 3.0103 1 0 +exampleBAM.bam.bam 45 GTGTTGGT Context D 3.0103 1 0 +exampleBAM.bam.bam 30 45 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 TCACATGA Context I 3.0103 1 0 +exampleBAM.bam.bam 9 AG Context M 3.0103 1 0 +exampleBAM.bam.bam 45 GTCCATGA Context I 3.0103 1 0 +exampleBAM.bam.bam 31 13 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 31 GT Context M 3.0103 1 0 +exampleBAM.bam.bam 34 59 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 AAGACACA Context I 3.0103 1 0 +exampleBAM.bam.bam 45 CCACCATG Context D 3.0103 1 0 
+exampleBAM.bam.bam 45 1 Cycle I 7.7815 5 0 +exampleBAM.bam.bam 45 13 Cycle D 7.7815 5 0 +exampleBAM.bam.bam 16 51 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 CGTCCATG Context D 3.0103 1 0 +exampleBAM.bam.bam 45 CTGGGGTT Context I 3.0103 1 0 +exampleBAM.bam.bam 45 GTTGGGTT Context I 3.0103 1 0 +exampleBAM.bam.bam 45 TTCGGGTT Context I 6.0206 3 0 +exampleBAM.bam.bam 45 TTAGGGTT Context I 6.0206 3 0 +exampleBAM.bam.bam 45 TGGGGGTT Context I 3.0103 1 0 +exampleBAM.bam.bam 45 TTTGGGTT Context I 3.0103 1 0 +exampleBAM.bam.bam 45 TTGGGGTT Context I 3.0103 1 0 +exampleBAM.bam.bam 9 38 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 GTTATCAT Context I 3.0103 1 0 +exampleBAM.bam.bam 30 GC Context M 3.0103 1 0 +exampleBAM.bam.bam 17 TC Context M 3.0103 1 0 +exampleBAM.bam.bam 34 25 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 CCATGATA Context D 3.0103 1 0 +exampleBAM.bam.bam 28 11 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 TATTGATA Context D 3.0103 1 0 +exampleBAM.bam.bam 29 43 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 CCAGTTCT Context D 3.0103 1 0 +exampleBAM.bam.bam 45 CAGGTTAT Context I 3.0103 1 0 +exampleBAM.bam.bam 45 69 Cycle I 7.7815 5 0 +exampleBAM.bam.bam 45 73 Cycle D 7.7815 5 0 +exampleBAM.bam.bam 28 41 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 33 31 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 TGATCGTG Context D 3.0103 1 0 +exampleBAM.bam.bam 29 9 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 12 GC Context M 3.0103 1 0 +exampleBAM.bam.bam 29 6 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 GCCTCGTC Context I 3.0103 1 0 +exampleBAM.bam.bam 45 70 Cycle D 7.7815 5 0 +exampleBAM.bam.bam 45 74 Cycle I 7.7815 5 0 +exampleBAM.bam.bam 45 TTTGGGCT Context D 3.0103 1 0 +exampleBAM.bam.bam 45 TATCAATA Context D 3.0103 1 0 +exampleBAM.bam.bam 33 TG Context M 6.0206 3 0 +exampleBAM.bam.bam 45 TTGGTTAA Context D 3.0103 1 0 +exampleBAM.bam.bam 45 TCTAGAGT Context D 3.0103 1 0 +exampleBAM.bam.bam 45 TGCACTTT Context I 3.0103 1 0 +exampleBAM.bam.bam 4 49 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 32 18 Cycle M 
3.0103 1 0 +exampleBAM.bam.bam 10 GT Context M 3.0103 1 0 +exampleBAM.bam.bam 27 11 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 27 CC Context M 3.0103 1 0 +exampleBAM.bam.bam 45 CCATGATT Context I 3.0103 1 0 +exampleBAM.bam.bam 5 TT Context M 1.7609 2 1 +exampleBAM.bam.bam 18 56 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 TGGCTTTA Context D 3.0103 1 0 +exampleBAM.bam.bam 45 TGTCTTTA Context D 3.0103 1 0 +exampleBAM.bam.bam 45 TCAATGTG Context D 3.0103 1 0 +exampleBAM.bam.bam 12 68 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 31 32 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 GGAGCCTT Context I 3.0103 1 0 +exampleBAM.bam.bam 45 CAGATCCA Context I 3.0103 1 0 +exampleBAM.bam.bam 45 2 Cycle D 7.7815 5 0 +exampleBAM.bam.bam 45 14 Cycle I 7.7815 5 0 +exampleBAM.bam.bam 45 GCAATCCA Context I 3.0103 1 0 +exampleBAM.bam.bam 22 TC Context M 3.0103 1 0 +exampleBAM.bam.bam 45 GAGTGTTG Context D 3.0103 1 0 +exampleBAM.bam.bam 15 AA Context M 4.7712 2 0 +exampleBAM.bam.bam 45 GGGTTAGG Context I 4.7712 2 0 +exampleBAM.bam.bam 45 TATATCAA Context D 3.0103 1 0 +exampleBAM.bam.bam 17 62 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 23 TT Context M 3.0103 1 0 +exampleBAM.bam.bam 45 CATGATTC Context I 3.0103 1 0 +exampleBAM.bam.bam 45 32 Cycle D 7.7815 5 0 +exampleBAM.bam.bam 45 44 Cycle I 7.7815 5 0 +exampleBAM.bam.bam 45 ATCCAGTT Context I 3.0103 1 0 +exampleBAM.bam.bam 45 CAGTTCTA Context D 3.0103 1 0 +exampleBAM.bam.bam 45 CAATCCAT Context I 3.0103 1 0 +exampleBAM.bam.bam 45 TGATTCTA Context D 3.0103 1 0 +exampleBAM.bam.bam 45 TCGTCCAT Context I 3.0103 1 0 +exampleBAM.bam.bam 24 GT Context M 4.7712 2 0 +exampleBAM.bam.bam 24 13 Cycle M 6.0206 3 0 +exampleBAM.bam.bam 30 34 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 29 AC Context M 3.0103 1 0 +exampleBAM.bam.bam 29 7 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 32 49 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 25 74 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 27 40 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 28 39 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 TTGCAATC Context D 
3.0103 1 0 +exampleBAM.bam.bam 33 TT Context M 6.9897 4 0 +exampleBAM.bam.bam 30 69 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 71 Cycle D 7.7815 5 0 +exampleBAM.bam.bam 45 75 Cycle I 7.7815 5 0 +exampleBAM.bam.bam 45 AGCAAAAT Context D 3.0103 1 0 +exampleBAM.bam.bam 32 19 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 32 TC Context M 6.0206 3 0 +exampleBAM.bam.bam 29 37 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 27 CA Context M 4.7712 2 0 +exampleBAM.bam.bam 45 ATAAAGAC Context D 3.0103 1 0 +exampleBAM.bam.bam 45 CACTGATG Context I 3.0103 1 0 +exampleBAM.bam.bam 45 CAGCCTCG Context D 3.0103 1 0 +exampleBAM.bam.bam 45 GCACTTTC Context D 3.0103 1 0 +exampleBAM.bam.bam 25 14 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 34 23 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 6 52 Cycle M -0.0000 1 1 +exampleBAM.bam.bam 45 TGATATAA Context I 3.0103 1 0 +exampleBAM.bam.bam 45 GGTTATCA Context D 3.0103 1 0 +exampleBAM.bam.bam 45 TTATATCA Context D 3.0103 1 0 +exampleBAM.bam.bam 45 TCACTGAT Context D 3.0103 1 0 +exampleBAM.bam.bam 45 GTGGCCTG Context I 3.0103 1 0 +exampleBAM.bam.bam 45 3 Cycle D 7.7815 5 0 +exampleBAM.bam.bam 45 15 Cycle I 7.7815 5 0 +exampleBAM.bam.bam 17 63 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 23 TG Context M 3.0103 1 0 +exampleBAM.bam.bam 45 TTTGTATT Context D 3.0103 1 0 +exampleBAM.bam.bam 24 GG Context M 4.7712 2 0 +exampleBAM.bam.bam 30 35 Cycle M 4.7712 2 0 +exampleBAM.bam.bam 45 TATCATGG Context D 3.0103 1 0 +exampleBAM.bam.bam 45 TGACATGG Context D 3.0103 1 0 +exampleBAM.bam.bam 45 AGATCCAG Context I 3.0103 1 0 +exampleBAM.bam.bam 45 33 Cycle D 7.7815 5 0 +exampleBAM.bam.bam 45 45 Cycle I 7.7815 5 0 +exampleBAM.bam.bam 45 GGAGATTA Context I 3.0103 1 0 +exampleBAM.bam.bam 45 ATGGTATT Context D 3.0103 1 0 +exampleBAM.bam.bam 45 ATCTCCAG Context I 3.0103 1 0 +exampleBAM.bam.bam 45 CGGGTTCG Context I 3.0103 1 0 +exampleBAM.bam.bam 45 AGGGTTAG Context D 3.0103 1 0 +exampleBAM.bam.bam 45 AGGGTTCG Context I 3.0103 1 0 +exampleBAM.bam.bam 45 68 Cycle D 7.7815 5 0 +exampleBAM.bam.bam 
45 72 Cycle I 7.7815 5 0 +exampleBAM.bam.bam 45 AGTCAATG Context D 3.0103 1 0 +exampleBAM.bam.bam 33 18 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 33 TA Context M 3.0103 1 0 +exampleBAM.bam.bam 45 TGGGTTAG Context D 3.0103 1 0 +exampleBAM.bam.bam 45 TGGGTTCG Context I 3.0103 1 0 +exampleBAM.bam.bam 45 TTTTCTGT Context D 3.0103 1 0 +exampleBAM.bam.bam 4 TT Context M -0.0000 1 1 +exampleBAM.bam.bam 29 4 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 25 73 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 AGCCTTTG Context D 3.0103 1 0 +exampleBAM.bam.bam 45 ACTCTTTG Context D 3.0103 1 0 +exampleBAM.bam.bam 18 58 Cycle M -0.0000 1 1 +exampleBAM.bam.bam 45 ATTATTGA Context I 3.0103 1 0 +exampleBAM.bam.bam 45 ACATGATC Context I 3.0103 1 0 +exampleBAM.bam.bam 28 AA Context M 3.0103 1 0 +exampleBAM.bam.bam 33 48 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 GTTAGGGT Context D 6.0206 3 0 +exampleBAM.bam.bam 32 16 Cycle M 4.7712 2 0 +exampleBAM.bam.bam 32 TG Context M 4.7712 2 0 +exampleBAM.bam.bam 45 GGCCTGAA Context D 3.0103 1 0 +exampleBAM.bam.bam 45 12 Cycle I 7.7815 5 0 +exampleBAM.bam.bam 45 AGATTAGA Context I 3.0103 1 0 +exampleBAM.bam.bam 45 GCAGCCTC Context I 3.0103 1 0 +exampleBAM.bam.bam 45 AATCCATT Context I 3.0103 1 0 +exampleBAM.bam.bam 45 CTTTATAT Context I 3.0103 1 0 +exampleBAM.bam.bam 45 CATGGTGG Context I 3.0103 1 0 +exampleBAM.bam.bam 22 TT Context M 3.0103 1 0 +exampleBAM.bam.bam 24 45 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 25 GT Context M 6.0206 3 0 +exampleBAM.bam.bam 31 34 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 34 20 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 34 Cycle D 7.7815 5 0 +exampleBAM.bam.bam 45 46 Cycle I 7.7815 5 0 +exampleBAM.bam.bam 45 ATGAGTCA Context I 3.0103 1 0 +exampleBAM.bam.bam 22 51 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 TTTTTCTG Context D 3.0103 1 0 +exampleBAM.bam.bam 45 GGGTTGGG Context I 6.0206 3 0 +exampleBAM.bam.bam 45 GGTTTGGG Context I 4.7712 2 0 +exampleBAM.bam.bam 45 TTAGATTT Context I 3.0103 1 0 +exampleBAM.bam.bam 30 32 Cycle M 3.0103 1 0 
+exampleBAM.bam.bam 23 19 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 23 TC Context M 3.0103 1 0 +exampleBAM.bam.bam 25 47 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 10 75 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 11 GG Context M 3.0103 1 0 +exampleBAM.bam.bam 33 TC Context M 8.4510 6 0 +exampleBAM.bam.bam 45 TGATCGTG Context I 3.0103 1 0 +exampleBAM.bam.bam 45 CAGGTTAT Context D 3.0103 1 0 +exampleBAM.bam.bam 45 CCAGTTCT Context I 3.0103 1 0 +exampleBAM.bam.bam 45 69 Cycle D 7.7815 5 0 +exampleBAM.bam.bam 45 73 Cycle I 7.7815 5 0 +exampleBAM.bam.bam 32 51 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 29 AT Context M 4.7712 2 0 +exampleBAM.bam.bam 29 5 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 33 49 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 TATTGATA Context I 3.0103 1 0 +exampleBAM.bam.bam 45 CCATGATA Context I 3.0103 1 0 +exampleBAM.bam.bam 32 TT Context M 4.7712 2 0 +exampleBAM.bam.bam 45 TGGGGGTT Context D 3.0103 1 0 +exampleBAM.bam.bam 45 TTAGGGTT Context D 6.0206 3 0 +exampleBAM.bam.bam 45 TTCGGGTT Context D 6.0206 3 0 +exampleBAM.bam.bam 45 TTGGGGTT Context D 3.0103 1 0 +exampleBAM.bam.bam 45 TTTGGGTT Context D 3.0103 1 0 +exampleBAM.bam.bam 45 GTTGGGTT Context D 3.0103 1 0 +exampleBAM.bam.bam 45 GTTATCAT Context D 3.0103 1 0 +exampleBAM.bam.bam 45 CGTCCATG Context I 3.0103 1 0 +exampleBAM.bam.bam 45 CCACCATG Context I 3.0103 1 0 +exampleBAM.bam.bam 45 AAGACACA Context D 3.0103 1 0 +exampleBAM.bam.bam 45 1 Cycle D 7.7815 5 0 +exampleBAM.bam.bam 45 13 Cycle I 7.7815 5 0 +exampleBAM.bam.bam 45 CTGGGGTT Context D 3.0103 1 0 +exampleBAM.bam.bam 22 TG Context M 6.0206 3 0 +exampleBAM.bam.bam 25 GG Context M 4.7712 2 0 +exampleBAM.bam.bam 8 CA Context M 3.0103 1 0 +exampleBAM.bam.bam 34 21 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 24 GA Context M 3.0103 1 0 +exampleBAM.bam.bam 45 GTGTTGGT Context I 3.0103 1 0 +exampleBAM.bam.bam 45 TCACATGA Context D 3.0103 1 0 +exampleBAM.bam.bam 45 GTCCATGA Context D 3.0103 1 0 +exampleBAM.bam.bam 45 CACCATGA Context D 3.0103 1 0 +exampleBAM.bam.bam 45 
35 Cycle D 7.7815 5 0 +exampleBAM.bam.bam 45 47 Cycle I 7.7815 5 0 +exampleBAM.bam.bam 45 CTATTCTT Context D 3.0103 1 0 +exampleBAM.bam.bam 45 AATCTAAT Context D 3.0103 1 0 +exampleBAM.bam.bam 25 46 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 27 76 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 34 55 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 31 1 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 23 18 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 31 66 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 GAGATTAG Context D 3.0103 1 0 +exampleBAM.bam.bam 45 TTCAGGCC Context D 3.0103 1 0 +exampleBAM.bam.bam 13 AA Context M 3.0103 1 0 +exampleBAM.bam.bam 45 GGTTAATG Context D 3.0103 1 0 +exampleBAM.bam.bam 45 GGTGGAGC Context D 3.0103 1 0 +exampleBAM.bam.bam 21 TT Context M 3.0103 1 0 +exampleBAM.bam.bam 21 17 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 12 AG Context M 3.0103 1 0 +exampleBAM.bam.bam 45 GGCCACCA Context I 3.0103 1 0 +exampleBAM.bam.bam 45 GCTGGGGT Context D 3.0103 1 0 +exampleBAM.bam.bam 45 CTTGGCTT Context I 3.0103 1 0 +exampleBAM.bam.bam 45 66 Cycle D 7.7815 5 0 +exampleBAM.bam.bam 26 GT Context M 3.0103 1 0 +exampleBAM.bam.bam 45 TAATCTCC Context D 3.0103 1 0 +exampleBAM.bam.bam 45 GTTGGGGT Context D 3.0103 1 0 +exampleBAM.bam.bam 28 34 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 TTGGGGGT Context D 3.0103 1 0 +exampleBAM.bam.bam 17 58 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 31 6 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 CCTTTGCA Context I 3.0103 1 0 +exampleBAM.bam.bam 45 36 Cycle D 7.7815 5 0 +exampleBAM.bam.bam 45 40 Cycle I 7.7815 5 0 +exampleBAM.bam.bam 45 CAGGCACC Context D 3.0103 1 0 +exampleBAM.bam.bam 45 GTTCTAGA Context I 3.0103 1 0 +exampleBAM.bam.bam 45 TATTTGCA Context I 3.0103 1 0 +exampleBAM.bam.bam 34 TA Context M 3.0103 1 0 +exampleBAM.bam.bam 25 CC Context M 3.0103 1 0 +exampleBAM.bam.bam 22 23 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 GAACTGGG Context I 3.0103 1 0 +exampleBAM.bam.bam 45 6 Cycle D 7.7815 5 0 +exampleBAM.bam.bam 45 10 Cycle I 7.7815 5 0 +exampleBAM.bam.bam 45 GGGCTGGG 
Context I 3.0103 1 0 +exampleBAM.bam.bam 45 TTGATATA Context D 3.0103 1 0 +exampleBAM.bam.bam 45 TTCTTAAG Context D 3.0103 1 0 +exampleBAM.bam.bam 27 GA Context M 4.7712 2 0 +exampleBAM.bam.bam 27 14 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 32 23 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 21 50 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 TAACCTGG Context D 3.0103 1 0 +exampleBAM.bam.bam 45 TCTATTCT Context I 3.0103 1 0 +exampleBAM.bam.bam 11 40 Cycle M -0.0000 1 1 +exampleBAM.bam.bam 45 TTTATTAT Context D 3.0103 1 0 +exampleBAM.bam.bam 45 ATGATTCT Context I 3.0103 1 0 +exampleBAM.bam.bam 45 CCTGGAGA Context D 3.0103 1 0 +exampleBAM.bam.bam 45 GCCAGGCA Context D 3.0103 1 0 +exampleBAM.bam.bam 12 AT Context M 3.0103 1 0 +exampleBAM.bam.bam 32 53 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 21 TG Context M 6.0206 3 0 +exampleBAM.bam.bam 26 GG Context M 3.0103 1 0 +exampleBAM.bam.bam 45 TCTGTGTC Context D 3.0103 1 0 +exampleBAM.bam.bam 45 GTTGGGGG Context D 3.0103 1 0 +exampleBAM.bam.bam 45 TTGGGCTG Context I 3.0103 1 0 +exampleBAM.bam.bam 45 AAATCTAA Context I 3.0103 1 0 +exampleBAM.bam.bam 45 67 Cycle D 7.7815 5 0 +exampleBAM.bam.bam 45 CTGGAGAT Context D 3.0103 1 0 +exampleBAM.bam.bam 45 AGATTTTT Context D 3.0103 1 0 +exampleBAM.bam.bam 45 AGGCACCC Context I 3.0103 1 0 +exampleBAM.bam.bam 45 CTGAAAGT Context I 3.0103 1 0 +exampleBAM.bam.bam 8 46 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 TCCAGGTT Context D 3.0103 1 0 +exampleBAM.bam.bam 45 GTGAGTGT Context I 3.0103 1 0 +exampleBAM.bam.bam 24 CG Context M 3.0103 1 0 +exampleBAM.bam.bam 45 TTATCATG Context I 3.0103 1 0 +exampleBAM.bam.bam 45 ACAGCAAA Context I 3.0103 1 0 +exampleBAM.bam.bam 45 37 Cycle D 7.7815 5 0 +exampleBAM.bam.bam 45 41 Cycle I 7.7815 5 0 +exampleBAM.bam.bam 45 AGTGCAAA Context I 3.0103 1 0 +exampleBAM.bam.bam 34 TC Context M 6.0206 3 0 +exampleBAM.bam.bam 25 CA Context M 3.0103 1 0 +exampleBAM.bam.bam 30 AT Context M 3.0103 1 0 +exampleBAM.bam.bam 45 TTTATATC Context D 3.0103 1 0 +exampleBAM.bam.bam 45 TTACTCTT 
Context D 3.0103 1 0 +exampleBAM.bam.bam 45 GTATTACT Context I 3.0103 1 0 +exampleBAM.bam.bam 45 TGGTTAAT Context D 3.0103 1 0 +exampleBAM.bam.bam 45 7 Cycle D 7.7815 5 0 +exampleBAM.bam.bam 45 11 Cycle I 7.7815 5 0 +exampleBAM.bam.bam 45 CCTGAAAG Context D 3.0103 1 0 +exampleBAM.bam.bam 45 CTTTGCAC Context I 3.0103 1 0 +exampleBAM.bam.bam 45 GTGAACTG Context D 3.0103 1 0 +exampleBAM.bam.bam 45 TTGGCTTT Context I 3.0103 1 0 +exampleBAM.bam.bam 28 2 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 19 30 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 27 GT Context M 3.0103 1 0 +exampleBAM.bam.bam 45 64 Cycle D 7.7815 5 0 +exampleBAM.bam.bam 45 76 Cycle I 7.7815 5 0 +exampleBAM.bam.bam 45 AGTGTTGG Context I 3.0103 1 0 +exampleBAM.bam.bam 45 AGGGTTGG Context I 3.0103 1 0 +exampleBAM.bam.bam 45 GATTCTAT Context I 3.0103 1 0 +exampleBAM.bam.bam 45 AGACACAG Context D 3.0103 1 0 +exampleBAM.bam.bam 45 GGGGTTGG Context I 4.7712 2 0 +exampleBAM.bam.bam 15 68 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 TATAAAGA Context I 3.0103 1 0 +exampleBAM.bam.bam 33 22 Cycle M 4.7712 2 0 +exampleBAM.bam.bam 12 AA Context M 3.0103 1 0 +exampleBAM.bam.bam 32 54 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 CTCGTCCA Context I 3.0103 1 0 +exampleBAM.bam.bam 45 38 Cycle D 7.7815 5 0 +exampleBAM.bam.bam 45 42 Cycle I 7.7815 5 0 +exampleBAM.bam.bam 45 TTAAGTGA Context I 3.0103 1 0 +exampleBAM.bam.bam 45 TTTGCAAT Context I 3.0103 1 0 +exampleBAM.bam.bam 45 TTTGCACT Context D 3.0103 1 0 +exampleBAM.bam.bam 24 CC Context M 4.7712 2 0 +exampleBAM.bam.bam 45 TGAGTCAA Context D 3.0103 1 0 +exampleBAM.bam.bam 6 TT Context M 1.7609 2 1 +exampleBAM.bam.bam 31 4 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 31 AG Context M 4.7712 2 0 +exampleBAM.bam.bam 34 50 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 27 73 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 GACACAGC Context D 3.0103 1 0 +exampleBAM.bam.bam 45 AACCTGGA Context I 3.0103 1 0 +exampleBAM.bam.bam 45 4 Cycle D 7.7815 5 0 +exampleBAM.bam.bam 45 8 Cycle I 7.7815 5 0 +exampleBAM.bam.bam 16 
58 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 30 AA Context M 4.7712 2 0 +exampleBAM.bam.bam 24 41 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 34 TG Context M 6.0206 3 0 +exampleBAM.bam.bam 29 68 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 25 9 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 26 44 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 GGTATTAC Context D 3.0103 1 0 +exampleBAM.bam.bam 45 TGTGAACT Context I 3.0103 1 0 +exampleBAM.bam.bam 45 TGGCCTGA Context D 3.0103 1 0 +exampleBAM.bam.bam 5 22 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 AAGTGCAA Context I 3.0103 1 0 +exampleBAM.bam.bam 45 ATTTGCAA Context I 3.0103 1 0 +exampleBAM.bam.bam 45 ATCTAATC Context D 3.0103 1 0 +exampleBAM.bam.bam 27 GG Context M 3.0103 1 0 +exampleBAM.bam.bam 21 48 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 TGAGTGTT Context D 3.0103 1 0 +exampleBAM.bam.bam 13 39 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 TAAAGACA Context D 3.0103 1 0 +exampleBAM.bam.bam 33 23 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 45 GTGGAGCC Context I 3.0103 1 0 +exampleBAM.bam.bam 45 TTTCACAT Context D 3.0103 1 0 +exampleBAM.bam.bam 45 65 Cycle D 7.7815 5 0 +exampleBAM.bam.bam 45 GATTTTTC Context D 3.0103 1 0 +exampleBAM.bam.bam 45 AGTTCTAG Context I 3.0103 1 0 +exampleBAM.bam.bam 19 61 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 28 71 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 15 35 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 24 CA Context M 3.0103 1 0 +exampleBAM.bam.bam 24 10 Cycle M -0.0000 1 1 +exampleBAM.bam.bam 45 TTATTGAT Context D 3.0103 1 0 +exampleBAM.bam.bam 45 ATAACCTG Context I 3.0103 1 0 +exampleBAM.bam.bam 45 GAAAGTGC Context I 3.0103 1 0 +exampleBAM.bam.bam 45 39 Cycle D 7.7815 5 0 +exampleBAM.bam.bam 45 43 Cycle I 7.7815 5 0 +exampleBAM.bam.bam 31 AT Context M 4.7712 2 0 +exampleBAM.bam.bam 31 5 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 34 51 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 27 72 Cycle M 3.0103 1 0 +exampleBAM.bam.bam 30 AC Context M 3.0103 1 0 +exampleBAM.bam.bam 45 CATGGTAT Context D 3.0103 1 0 +exampleBAM.bam.bam 45 ATGATCGT Context 
I 3.0103 1 0 +exampleBAM.bam.bam 45 5 Cycle D 7.7815 5 0 +exampleBAM.bam.bam 45 9 Cycle I 7.7815 5 0 +exampleBAM.bam.bam 45 GCACCCAG Context I 3.0103 1 0 +exampleBAM.bam.bam 34 TT Context M 8.4510 6 0 +exampleBAM.bam.bam 31 39 Cycle M 4.7712 2 0 +exampleBAM.bam.bam 14 33 Cycle M 3.0103 1 0 + diff --git a/public/testdata/gatkrunreport.fail.xml b/public/testdata/gatkrunreport.fail.xml new file mode 100644 index 000000000..ba8228c3d --- /dev/null +++ b/public/testdata/gatkrunreport.fail.xml @@ -0,0 +1,50 @@ + + yX3AnltsqIlXH9kAQqTWHQUd8CQ5bikz + + Failed to parse Genome Location string: + 20:10,000,000-10,000,001x + + org.broadinstitute.sting.utils.GenomeLocParser.parseGenomeLoc(GenomeLocParser.java:377) + org.broadinstitute.sting.utils.interval.IntervalUtils.parseIntervalArguments(IntervalUtils.java:82) + org.broadinstitute.sting.commandline.IntervalBinding.getIntervals(IntervalBinding.java:106) + org.broadinstitute.sting.gatk.GenomeAnalysisEngine.loadIntervals(GenomeAnalysisEngine.java:618) + org.broadinstitute.sting.gatk.GenomeAnalysisEngine.initializeIntervals(GenomeAnalysisEngine.java:585) + org.broadinstitute.sting.gatk.GenomeAnalysisEngine.execute(GenomeAnalysisEngine.java:231) + org.broadinstitute.sting.gatk.CommandLineExecutable.execute(CommandLineExecutable.java:128) + org.broadinstitute.sting.commandline.CommandLineProgram.start(CommandLineProgram.java:236) + org.broadinstitute.sting.commandline.CommandLineProgram.start(CommandLineProgram.java:146) + org.broadinstitute.sting.gatk.CommandLineGATK.main(CommandLineGATK.java:92) + + + Position: '10,000,001x' contains invalid + chars. 
+ + org.broadinstitute.sting.utils.GenomeLocParser.parsePosition(GenomeLocParser.java:411) + org.broadinstitute.sting.utils.GenomeLocParser.parseGenomeLoc(GenomeLocParser.java:374) + org.broadinstitute.sting.utils.interval.IntervalUtils.parseIntervalArguments(IntervalUtils.java:82) + org.broadinstitute.sting.commandline.IntervalBinding.getIntervals(IntervalBinding.java:106) + org.broadinstitute.sting.gatk.GenomeAnalysisEngine.loadIntervals(GenomeAnalysisEngine.java:618) + org.broadinstitute.sting.gatk.GenomeAnalysisEngine.initializeIntervals(GenomeAnalysisEngine.java:585) + org.broadinstitute.sting.gatk.GenomeAnalysisEngine.execute(GenomeAnalysisEngine.java:231) + org.broadinstitute.sting.gatk.CommandLineExecutable.execute(CommandLineExecutable.java:128) + org.broadinstitute.sting.commandline.CommandLineProgram.start(CommandLineProgram.java:236) + org.broadinstitute.sting.commandline.CommandLineProgram.start(CommandLineProgram.java:146) + org.broadinstitute.sting.gatk.CommandLineGATK.main(CommandLineGATK.java:92) + + false + + true + + 2012/03/10 20.19.52 + 2012/03/10 20.19.52 + 0 + CountReads + 1.4-483-g63ecdb2 + 85000192 + 129957888 + depristo + 10.0.1.10 + Apple Inc.-1.6.0_26 + Mac OS X-x86_64 + 0 + diff --git a/public/testdata/gatkrunreport.success.xml b/public/testdata/gatkrunreport.success.xml new file mode 100644 index 000000000..8f89eaf46 --- /dev/null +++ b/public/testdata/gatkrunreport.success.xml @@ -0,0 +1,15 @@ + + D7D31ULwTSxlAwnEOSmW6Z4PawXwMxEz + 2012/03/10 20.21.19 + 2012/03/10 20.21.19 + 0 + CountReads + 1.4-483-g63ecdb2 + 85000192 + 129957888 + depristo + 10.0.1.10 + Apple Inc.-1.6.0_26 + Mac OS X-x86_64 + 105 +