Merge remote-tracking branch 'unstable/master'

This commit is contained in:
Eric Banks 2012-05-01 08:47:08 -04:00
commit ef082356e9
288 changed files with 19201 additions and 8010 deletions

View File

@ -955,8 +955,8 @@
<jvmarg value="-Dpipeline.run=${pipeline.run}" />
<jvmarg value="-Djava.io.tmpdir=${java.io.tmpdir}" />
<jvmarg line="${cofoja.jvm.args}"/>
<!-- <jvmarg value="-Xdebug"/> -->
<!-- <jvmarg value="-Xrunjdwp:transport=dt_socket,server=y,suspend=y,address=5005"/> -->
<!-- <jvmarg value="-Xdebug"/> -->
<!-- <jvmarg value="-Xrunjdwp:transport=dt_socket,server=y,suspend=y,address=5005"/> -->
<classfileset dir="${java.public.test.classes}" includes="**/@{testtype}.class"/>
<classfileset dir="${java.private.test.classes}" erroronmissingdir="false">

View File

@ -1,4 +1,4 @@
Copyright (c) 2011 The Broad Institute
Copyright (c) 2012 The Broad Institute
Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation

Binary file not shown.

View File

@ -2,19 +2,19 @@
.gsa.assignGATKTableToEnvironment <- function(tableName, tableHeader, tableRows, tableEnv) {
  # Convert the raw rows of one GATKReport table into a data.frame and store it in tableEnv.
  #
  # Args:
  #   tableName:   name under which the table is stored
  #   tableHeader: column names, one per column of tableRows
  #   tableRows:   raw cells of the table (anything data.frame() accepts), one row per record
  #   tableEnv:    environment that receives the resulting data.frame
  #
  # If tableName, or a numbered copy "<tableName>.<n>", is already stored in tableEnv the new
  # table is stored as "<tableName>.<k>", where k is the number of copies already present, so
  # an earlier table with the same name is not overwritten.
  d <- data.frame(tableRows, row.names = NULL, stringsAsFactors = FALSE)
  colnames(d) <- tableHeader

  # seq_len() instead of 1:ncol(d): a table without columns must not touch d[, 1] and d[, 0]
  for (i in seq_len(ncol(d))) {
    # use the general type.convert infrastructure of read.table to convert column data to R types
    d[, i] <- type.convert(d[, i])
  }

  # Count exact matches and numbered copies only. ls(pattern = tableName) would treat the
  # name as a regular expression, so "Foo" would also count "FooBar" (and any regex
  # metacharacters in the name would be interpreted).
  existing <- ls(envir = tableEnv, all.names = TRUE)
  prefix <- paste(tableName, ".", sep = "")
  isNumberedCopy <- substr(existing, 1, nchar(prefix)) == prefix &
    grepl("^[0-9]+$", substring(existing, nchar(prefix) + 1))
  nUsed <- sum(existing == tableName | isNumberedCopy)
  if (nUsed > 0) {
    tableName <- paste(tableName, ".", nUsed, sep = "")
  }

  assign(tableName, d, envir = tableEnv)
}
@ -28,74 +28,163 @@
starts = c(1, columnStarts);
stops = c(columnStarts - 1, nchar(line));
sapply(line, splitStartStop)[,1];
}
# Old implementation for v0.*
gsa.read.gatkreportv0 <- function(lines) {
  # Parse the lines of a v0.1 / v0.2 GATKReport into a list of data.frames.
  #
  # Args:
  #   lines: character vector holding the report, one element per line
  #
  # Returns (invisibly) a list with one data.frame per table found in the report.
  # Stops with an explicit error when table data is reached but no v0.1 / v0.2
  # header has been seen (previously this failed with the opaque
  # "missing value where TRUE/FALSE needed").
  tableEnv <- new.env()
  tableName <- NA
  tableHeader <- c()
  tableRows <- c()
  version <- NA

  for (line in lines) {
    if (grepl("^##:GATKReport.v", line, ignore.case = TRUE)) {
      # A table header line: flush the table collected so far, then start a new one
      headerFields <- unlist(strsplit(line, "[[:space:]]+"))

      if (!is.na(tableName)) {
        .gsa.assignGATKTableToEnvironment(tableName, tableHeader, tableRows, tableEnv)
      }

      tableName <- headerFields[2]
      tableHeader <- c()
      tableRows <- c()

      # For differences in versions see
      # $STING_HOME/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportVersion.java
      if (grepl("^##:GATKReport.v0.1[[:space:]]+", line, ignore.case = TRUE)) {
        version <- "v0.1"
      } else if (grepl("^##:GATKReport.v0.2[[:space:]]+", line, ignore.case = TRUE)) {
        version <- "v0.2"
        columnStarts <- c()
      }
    } else if (grepl("^[[:space:]]*$", line) || grepl("^[[:space:]]*#", line)) {
      # blank lines and comment lines carry no table data: do nothing
    } else if (!is.na(tableName)) {
      if (is.na(version)) {
        stop("Unsupported GATKReport version for table '", tableName,
             "': this reader only handles v0.1 and v0.2", call. = FALSE)
      }

      if (version == "v0.1") {
        # v0.1 tables are whitespace delimited
        row <- unlist(strsplit(line, "[[:space:]]+"))
      } else if (version == "v0.2") {
        # v0.2 tables are fixed width; the column layout is derived from the header line
        if (length(tableHeader) == 0) {
          headerChars <- unlist(strsplit(line, ""))
          # Find the first position of non space characters, excluding the first character
          columnStarts <- intersect(grep("[[:space:]]", headerChars, invert = TRUE),
                                    grep("[[:space:]]", headerChars) + 1)
        }
        row <- .gsa.splitFixedWidth(line, columnStarts)
      }

      # The first data line of a table is its header, every later one is a record
      if (length(tableHeader) == 0) {
        tableHeader <- row
      } else {
        tableRows <- rbind(tableRows, row)
      }
    }
  }

  # flush the last table of the report
  if (!is.na(tableName)) {
    .gsa.assignGATKTableToEnvironment(tableName, tableHeader, tableRows, tableEnv)
  }

  invisible(as.list(tableEnv, all.names = TRUE))
}
# Load all GATKReport v1 tables from file
gsa.read.gatkreportv1 <- function(lines) {
  # Parse the lines of a v1 GATKReport into a list of data.frames.
  #
  # Args:
  #   lines: character vector holding the report, one element per line
  #
  # Returns (invisibly) a list with one data.frame per table found in the report.
  #
  # Rows are written into a matrix preallocated with one row per input line, and
  # only the rows actually filled are handed on when a table is finished.
  nLines <- length(lines)
  tableEnv <- new.env()
  tableName <- NA
  tableHeader <- c()
  tableRows <- NULL
  version <- ""
  rowCount <- 0
  headerRowCount <- -1

  finishTable <- function() {
    # seq_len() selects no rows for an empty table (1:0 would select row 1), and
    # drop = FALSE keeps a single-row table a 1 x ncol matrix instead of collapsing
    # it to a vector, which data.frame() would turn into one column.
    filledRows <- tableRows[seq_len(rowCount), , drop = FALSE]
    .gsa.assignGATKTableToEnvironment(tableName, tableHeader, filledRows, tableEnv)
  }

  for (line in lines) {
    if (grepl("^#:GATKReport.v1", line, ignore.case = TRUE)) {
      version <- "v1.0"
      headerRowCount <- 0
    }

    # An odd count means exactly one "#:GATKTable" line of the current pair has been
    # seen, so this line is the second one and carries the table name.
    if ((headerRowCount %% 2 == 1) && (version == "v1.0")) {
      headerFields <- unlist(strsplit(line, ":"))

      if (!is.na(tableName)) {
        finishTable()
      }

      tableName <- headerFields[3]
      tableHeader <- c()
      tableRows <- NULL
      rowCount <- 0
      columnStarts <- c()
    }

    if (grepl("^#:GATKTable", line, ignore.case = TRUE)) {
      headerRowCount <- headerRowCount + 1
    } else if (!is.na(tableName)) {
      if (version == "v1.0") {
        if (length(tableHeader) == 0) {
          headerChars <- unlist(strsplit(line, ""))
          # Find the first position of non space characters, excluding the first character
          columnStarts <- intersect(grep("[[:space:]]", headerChars, invert = TRUE),
                                    grep("[[:space:]]", headerChars) + 1)
          tableRows <- matrix(nrow = nLines, ncol = length(columnStarts) + 1)
        }
        row <- .gsa.splitFixedWidth(line, columnStarts)
      }

      # The first line after the table headers is the column header, later
      # non-empty lines are records
      if (length(tableHeader) == 0) {
        tableHeader <- row
      } else if (nchar(line) > 0) {
        rowCount <- rowCount + 1
        tableRows[rowCount, ] <- row
      }
    }
  }

  # flush the last table of the report
  if (!is.na(tableName)) {
    finishTable()
  }

  invisible(as.list(tableEnv, all.names = TRUE))
}
# Load all GATKReport tables from a file
gsa.read.gatkreport <- function(filename) {
  # Read a GATKReport file and parse it with the reader matching its version.
  #
  # Args:
  #   filename: path of the report file
  #
  # Returns the list of data.frames produced by gsa.read.gatkreportv1() or
  # gsa.read.gatkreportv0(), chosen from the first line of the file. When the first
  # line is neither a v1 nor a v0 report header, a warning is raised and NULL is
  # returned invisibly.
  con <- file(filename, "r", blocking = TRUE)
  # close the connection even when readLines() fails
  on.exit(close(con), add = TRUE)
  lines <- readLines(con)

  # get first line; an empty file has none and matches neither format
  firstLine <- if (length(lines) > 0) lines[1] else ""

  if (grepl("^#:GATKReport.v1", firstLine, ignore.case = TRUE)) {
    gsa.read.gatkreportv1(lines)
  } else if (grepl("^##:GATKReport.v0", firstLine, ignore.case = TRUE)) {
    gsa.read.gatkreportv0(lines)
  } else {
    warning("Unrecognized GATKReport header in ", filename, call. = FALSE)
    invisible(NULL)
  }
}

View File

@ -0,0 +1,244 @@
library(gplots)
library(ggplot2)
# -------------------------------------------------------
# Utilities for displaying multiple plots per page
# -------------------------------------------------------
distributeGraphRows <- function(graphs, heights = c()) {
  # Print the given graphs on a new grid page, one per row, with the given relative heights
  #
  # graphs:  list of printable plot objects; entries that are NA are dropped
  # heights: relative row heights, one per entry of graphs (equal heights when empty)
  #
  # Returns NULL invisibly. Nothing is drawn when no graph is left after dropping NAs.
  if (length(heights) == 0) {
    heights <- rep.int(1, length(graphs))
  }
  keep <- !is.na(graphs)
  heights <- heights[keep]
  graphs <- graphs[keep]
  numGraphs <- length(graphs)

  # With no graphs, 1:numGraphs would iterate over c(1, 0) and index graphs[[1]]
  if (numGraphs == 0) {
    return(invisible(NULL))
  }

  Layout <- grid.layout(nrow = numGraphs, ncol = 1, heights = heights)
  grid.newpage()
  pushViewport(viewport(layout = Layout))
  subplot <- function(x) viewport(layout.pos.row = x, layout.pos.col = 1)
  for (i in seq_len(numGraphs)) {
    print(graphs[[i]], vp = subplot(i))
  }
}
distributeLogGraph <- function(graph, xName) {
  # Show graph in two rows: first with a continuous x axis named xName, then
  # with a log10 x axis of the same name and an empty title
  logPanel <- graph + scale_x_log10(xName) + opts(title="")
  linearPanel <- graph + scale_x_continuous(xName)
  distributeGraphRows(list(linearPanel, logPanel))
}
distributePerSampleGraph <- function(perSampleGraph, distGraph, ratio=c(2,1)) {
  # Stack the per-sample graph over the distribution graph, using ratio as the
  # relative row heights
  stackedGraphs <- list(perSampleGraph, distGraph)
  distributeGraphRows(stackedGraphs, heights = ratio)
}
removeExtraStrats <- function(variantEvalDataFrame, moreToRemove=c()) {
  # Collapse the extra stratification columns of a VariantEval table
  #
  # For FunctionalClass, Novelty and every column named in moreToRemove that is
  # present in variantEvalDataFrame, only the rows whose value in that column is
  # "all" are kept. Columns that are not present are ignored.
  stratColumns <- c("FunctionalClass", "Novelty", moreToRemove)
  presentColumns <- stratColumns[stratColumns %in% colnames(variantEvalDataFrame)]
  for ( stratColumn in presentColumns ) {
    keepRows <- variantEvalDataFrame[[stratColumn]] == "all"
    variantEvalDataFrame <- variantEvalDataFrame[keepRows,]
  }
  variantEvalDataFrame
}
openPDF <- function(outputPDF) {
  # Start an 11 x 8.5 PDF device writing to outputPDF; does nothing when outputPDF is NA
  if ( is.na(outputPDF) ) {
    return(invisible(NULL))
  }
  pdf(outputPDF, height=8.5, width=11)
}
closePDF <- function(outputPDF) {
  # Shut the current device and, when a compactPDF function is visible, run it on
  # outputPDF; does nothing when outputPDF is NA
  if ( is.na(outputPDF) ) {
    return(invisible(NULL))
  }
  dev.off()
  if (exists("compactPDF")) {
    compactPDF(outputPDF)
  }
}
makeRatioDataFrame <- function(ACs, num, denom, widths = NULL) {
  # Build a data.frame with one row per allele count
  #
  # ACs:    allele count of each row
  # num:    numerator of the ratio for each AC
  # denom:  denominator of the ratio for each AC
  # widths: width of each AC bin; defaults to 1 for every AC
  #
  # Returns a data.frame with columns AC, width, num, denom and ratio (= num / denom)
  if ( is.null(widths) ) widths <- rep(1, length(ACs))
  data.frame(AC = ACs, width = widths, num = num, denom = denom, ratio = num / denom)
}
.reduceACs <- function(binWidthForAC, ACs) {
  # computes data structures necessary to reduce the full range of ACs
  #
  # binWidthForAC returns the number of upcoming bins that should be merged into
  # that AC bin. ACs is a vector of all AC values from 0 to 2N that should be
  # merged together
  #
  # Starting at AC 0, bins are laid out back to back until max(ACs) is reached.
  #
  # Returns a list containing the reduced ACs starts (ACs), their corresponding
  # widths (widths), and newACMap, which holds for every original AC covered, in
  # order starting at AC 0, the start of the bin it was merged into
  maxAC <- max(ACs)
  newACs <- c()
  widths <- c()
  newACMap <- c()
  ac <- 0
  while ( ac < maxAC ) {
    # Every bin must cover at least one AC: a width of 0 would leave ac unchanged
    # and this loop would never terminate
    width <- max(binWidthForAC(ac), 1)
    newACs <- c(newACs, ac)
    widths <- c(widths, width)
    newACMap <- c(newACMap, rep(ac, width))
    ac <- ac + width
  }
  list(ACs = newACs, widths=widths, newACMap = newACMap)
}
# geometricACs <- function(k, ACs) {
# nBins <- round(k * log10(max(ACs)))
#
# binWidthForAC <- function(ac) {
# max(ceiling(ac / nBins), 1)
# }
#
# return(reduceACs(binWidthForAC, ACs))
# }
reduce.AC.on.LogLinear.intervals <- function(scaleFactor, ACs) {
  # Bin the AC values on a log-linear allele-frequency scale
  #
  # Allele frequencies are ACs / max(ACs). The bin boundaries are
  # 10^(-4), 10^(-4 + scaleFactor), ... up to 10^(-1); each boundary opens a bin
  # that runs up to the next boundary (the last one runs up to frequency 1). The
  # width requested for an AC is the number of ACs whose frequency falls into the
  # bin containing it, or 1 when its frequency lies below the first boundary.
  #
  # Returns the list produced by .reduceACs for these widths
  maxAC <- max(ACs)
  afs <- ACs / maxAC
  breaks <- 10^(seq(-4, -1, scaleFactor))

  # upper (exclusive) edge of the bin opened by each break
  upperEdges <- c(breaks[-1], 1)
  widths <- vapply(seq_along(breaks),
                   function(i) sum(afs < upperEdges[i] & afs >= breaks[i]),
                   integer(1))

  binWidthForAC <- function(ac) {
    # index of the highest break that does not exceed this AC's frequency, 0 if none
    bin <- findInterval(ac / maxAC, breaks)
    if ( bin > 0 ) widths[bin] else 1
  }

  .reduceACs(binWidthForAC, ACs)
}
.remapACs <- function(remapper, k, df) {
# Re-bin the num / denom counts of df according to the AC reduction computed by remapper
#
# remapper: function called as remapper(k, df$AC); its result is used as a list with
#           elements ACs, widths and newACMap
# k:        passed through unchanged as the first argument of remapper
# df:       data.frame with columns AC, num and denom
#
# Returns the data.frame built by makeRatioDataFrame from the reduced ACs, the summed
# num and denom of each reduced AC, and the reduced widths
newACs <- remapper(k, df$AC)
n = length(newACs$ACs)
# one accumulator slot per reduced AC
num = rep(0, n)
denom = rep(0, n)
# NOTE(review): i runs over 1..nrow(df) but is compared against AC *values*, so rows with
# AC == 0 or AC > nrow(df) are never accumulated -- confirm this is intended
for ( i in 1:dim(df)[1] ) {
# select the rows whose AC equals i
rowI = df$AC == i
row = df[rowI,]
# NOTE(review): newACMap is indexed directly with the AC value here; verify this matches
# how the remapper lays out newACMap (possible off-by-one if the map starts at AC 0)
newAC = newACs$newACMap[row$AC]
# slot of the reduced AC these rows are merged into
newRowI = newACs$ACs == newAC
num[newRowI] = num[newRowI] + df$num[rowI]
denom[newRowI] = denom[newRowI] + df$denom[rowI]
}
newdf <- makeRatioDataFrame(newACs$ACs, num, denom, newACs$widths )
newdf
}
compute.ratio.on.LogLinear.AC.intervals <- function(ACs, num, denom, scaleFactor = 0.1) {
  # Build the per-AC ratio table (all widths 1) and re-bin it with
  # reduce.AC.on.LogLinear.intervals, using scaleFactor as the bin growth step
  perACRatios <- makeRatioDataFrame(ACs, num, denom, 1)
  .remapACs(reduce.AC.on.LogLinear.intervals, scaleFactor, perACRatios)
}
plotVariantQC <- function(metrics, measures, requestedStrat = "Sample",
                          fixHistogramX=FALSE, anotherStrat = NULL, nObsField = "n_indels",
                          onSamePage=FALSE, facetVariableOnXPerSample = FALSE, facetVariableOnXForDist = TRUE,
                          moreTitle="", note = NULL) {
  # Plot the requested measures of a metrics table, per stratum and as distributions
  #
  # metrics:        data.frame holding the columns named by requestedStrat, nObsField,
  #                 measures and (if given) anotherStrat
  # measures:       names of the columns to plot
  # requestedStrat: column used as the x axis of the per-stratum plot; "Sample" plots the
  #                 stratum labels as text, anything else plots points on a log10 x axis
  # fixHistogramX:  use fixed instead of free facet scales in the distribution plot
  # anotherStrat:   optional second stratification column, used as the other facet
  # nObsField:      column with the number of observations (point size / histogram weight)
  # onSamePage:     draw both plots on one page instead of printing them separately
  # facetVariableOnXPerSample, facetVariableOnXForDist: put the measure facet on the
  #                 columns (TRUE) or the rows (FALSE) of the respective plot
  # moreTitle:      text appended to the title
  # note:           optional second title line
  #
  # The distribution plot is only built when the measures take more than two distinct values.
  metrics$strat = metrics[[requestedStrat]]
  otherFacet = "."
  id.vars = c("strat", "nobs")
  metrics$nobs <- metrics[[nObsField]]

  # keep track of the other strat and its implied facet value
  if (! is.null(anotherStrat)) {
    id.vars = c(id.vars, anotherStrat)
    otherFacet = anotherStrat
  }

  molten <- melt(metrics, id.vars=id.vars, measure.vars=c(measures))
  perSampleGraph <- ggplot(data=molten, aes(x=strat, y=value, group=variable, color=variable, fill=variable))

  # create the title
  titleText=paste(paste(paste(measures, collapse=", "), "by", requestedStrat), moreTitle)
  if ( !is.null(note) ) {
    titleText=paste(titleText, note, sep="\n")
  }
  title <- opts(title=titleText)

  determineFacet <- function(onX) {
    if ( onX ) {
      paste(otherFacet, "~ variable")
    } else {
      paste("variable ~", otherFacet)
    }
  }

  sampleFacet = determineFacet(facetVariableOnXPerSample)
  distFacet = determineFacet(facetVariableOnXForDist)

  if ( requestedStrat == "Sample" ) {
    perSampleGraph <- perSampleGraph + geom_text(aes(label=strat), size=1.5) + geom_blank() # don't display a scale
    perSampleGraph <- perSampleGraph + scale_x_discrete("Sample (ordered by nSNPs)", formatter=function(x) "")
  } else { # by AlleleCount
    perSampleGraph <- perSampleGraph + geom_point(aes(size=log10(nobs))) #+ geom_smooth(aes(weight=log10(nobs)))
    perSampleGraph <- perSampleGraph + scale_x_log10("AlleleCount")
  }
  perSampleGraph <- perSampleGraph + ylab("Variable value") + title
  perSampleGraph <- perSampleGraph + facet_grid(sampleFacet, scales="free")

  nValues = length(unique(molten$value))
  hasDistGraph = nValues > 2
  if (hasDistGraph) {
    if ( requestedStrat == "Sample" ) {
      distGraph <- ggplot(data=molten, aes(x=value, group=variable, fill=variable))
    } else {
      distGraph <- ggplot(data=molten, aes(x=value, group=variable, fill=variable, weight=nobs))
    }
    distGraph <- distGraph + geom_histogram(aes(y=..ndensity..))
    distGraph <- distGraph + geom_density(alpha=0.5, aes(y=..scaled..))
    # NOTE(review): position is given inside aes() here -- confirm that is intended
    distGraph <- distGraph + geom_rug(aes(y=NULL, color=variable, position="jitter"))
    scale = "free"
    if ( fixHistogramX ) scale = "fixed"
    distGraph <- distGraph + facet_grid(distFacet, scales=scale)
    distGraph <- distGraph + ylab("Relative frequency")
    distGraph <- distGraph + xlab("Variable value (see facet for variable by color)")
    distGraph <- distGraph + opts(axis.text.x=theme_text(angle=-45)) # , legend.position="none")
  } else {
    distGraph <- NA
  }

  if ( onSamePage ) {
    # distributeGraphRows drops the NA placeholder when there is no distribution plot
    suppressMessages(distributePerSampleGraph(perSampleGraph, distGraph))
  } else {
    suppressMessages(print(perSampleGraph))
    # distGraph is NA when there are too few distinct values; NA + title is an error
    if ( hasDistGraph ) {
      suppressMessages(print(distGraph + title))
    }
  }
}

View File

@ -1,138 +0,0 @@
titvFPEst <- function(titvExpected, titvObserved) {
  # Estimate computed as 1 - (observed - 0.5) / (expected - 0.5), limited to the
  # range [0.001, 1]
  rawEstimate <- 1 - (titvObserved - 0.5) / (titvExpected - 0.5)
  cappedEstimate <- min(rawEstimate, 1)
  max(cappedEstimate, 0.001)
}
titvFPEstV <- function(titvExpected, titvs) {
  # Apply titvFPEst to every observed Ti/Tv value in titvs against the same expected value
  #
  # Returns a numeric vector with one estimate per element of titvs. vapply() keeps
  # the result numeric even for empty input, where sapply() would return list()
  vapply(titvs, function(x) titvFPEst(titvExpected, x), numeric(1))
}
calcHet <- function(nknown, knownTiTv, nnovel, novelTiTv, callable) {
  # Known calls count fully, novel calls are weighted by 1 - titvFPEst(knownTiTv, novelTiTv);
  # the result is 2/3 of that total divided by callable
  novelWeight <- 1 - titvFPEst(knownTiTv, novelTiTv)
  TP <- nknown + novelWeight * nnovel
  2 * TP / 3 / callable
}
marginalTiTv <- function( nx, titvx, ny, titvy ) {
  # Ti/Tv of the calls in set x that are not in set y, given the size and Ti/Tv of both sets
  #
  # For a set of n calls with ratio titv, n / (titv + 1) are transversions and the
  # rest are transitions
  transversions <- function(n, titv) n / (titv + 1)
  tvDiff <- transversions(nx, titvx) - transversions(ny, titvy)
  tiDiff <- (nx - transversions(nx, titvx)) - (ny - transversions(ny, titvy))
  tiDiff / tvDiff
}
marginaldbSNPRate <- function( nx, dbx, ny, dby ) {
  # Percentage of known calls among the calls in set x that are not in set y,
  # given the size and known percentage (0-100) of both sets
  knownCount <- function(n, pct) n * pct / 100
  knownDiff <- knownCount(nx, dbx) - knownCount(ny, dby)
  novelDiff <- (nx - knownCount(nx, dbx)) - (ny - knownCount(ny, dby))
  knownDiff / ( knownDiff + novelDiff ) * 100
}
numExpectedCalls <- function(L, theta, calledFractionOfRegion, nIndividuals, dbSNPRate) {
  # Expected call counts: L * theta * calledFractionOfRegion times the sum of 1/i
  # for i from 1 to 2 * nIndividuals, split into known and novel by dbSNPRate (a fraction)
  harmonicSum <- sum(1 / seq(1, 2 * nIndividuals))
  nCalls <- L * theta * calledFractionOfRegion * harmonicSum
  list(nCalls = nCalls,
       nKnown = dbSNPRate * nCalls,
       nNovel = (1-dbSNPRate) * nCalls)
}
normalize <- function(x) {
  # Scale x so that its elements sum to 1
  total <- sum(x)
  x / total
}
normcumsum <- function(x) {
  # Cumulative sum of x after normalizing it with normalize()
  fractions <- normalize(x)
  cumsum(fractions)
}
cumhist <- function(d, ...) {
  # Plot the values of d in increasing order as orange points joined by lines;
  # extra arguments are passed on to plot()
  ascending <- d[order(d)]
  plot(ascending, type="b", col="orange", lwd=2, ...)
}
revcumsum <- function(x) {
  # Cumulative sum taken from the end of x: element i is the sum of x[i] and all later elements
  fromEnd <- cumsum(rev(x))
  rev(fromEnd)
}
phred <- function(x) {
  # -10 * log10 of the largest value among x and 10^(-9.9), so the result never exceeds 99
  smallest <- 10^(-9.9)
  -10 * log10(max(x, smallest))
}
pOfB <- function(b, B, Q) {
  # Probability of observing base b when the true base is B and the quality is Q:
  # 1 - 10^(-Q/10) when they agree, 10^(-Q/10)'s complement of that otherwise
  #print(paste(b, B, Q))
  pCorrect <- 1 - 10^(-Q/10)
  if ( b != B ) {
    return(1 - pCorrect)
  }
  pCorrect
}
pOfG <- function(bs, qs, G) {
  # log10 probability of the observed bases under a diploid genotype
  #
  # bs: observed bases
  # qs: quality of each observed base, parallel to bs
  # G:  the two alleles of the genotype
  #
  # Each base contributes the average of its pOfB probabilities against the two
  # alleles. Returns 0 (probability 1) when there are no bases; seq_along() is used
  # because 1:length(bs) would visit bs[1] and bs[0] for empty input.
  a1 <- G[1]
  a2 <- G[2]

  log10p <- 0
  for ( i in seq_along(bs) ) {
    b <- bs[i]
    q <- qs[i]
    p1 <- pOfB(b, a1, q) / 2 + pOfB(b, a2, q) / 2
    log10p <- log10p + log10(p1)
  }

  log10p
}
pOfGs <- function(nAs, nBs, Q) {
  # log10 probabilities of nAs "a" bases and nBs "t" bases, all of quality Q, under
  # the genotypes a/a (p1), a/t (p2) and t/t (p3), plus Qsample: the phred of
  # 1 - p2's share of the three probabilities
  observedBases <- c(rep("a", nAs), rep("t", nBs))
  baseQuals <- rep(Q, nAs + nBs)

  genotypes <- list(c("a", "a"), c("a", "t"), c("t", "t"))
  log10ps <- vapply(genotypes, function(G) pOfG(observedBases, baseQuals, G), numeric(1))

  hetShare <- 10^log10ps[2] / sum(10^(log10ps))
  list(p1=log10ps[1], p2=log10ps[2], p3=log10ps[3], Qsample=phred(1 - hetShare))
}
QsampleExpected <- function(depth, Q) {
  # Average of pOfGs(...)$Qsample over read depths d = 1 .. 3 * depth (weighted by
  # dpois(d, depth)) and over the number of "t" bases nBs = 0 .. d (weighted by
  # dbinom(nBs, d, 0.5)). Prints one row with the running state for every (d, nBs) pair.
  runningAvg <- 0
  deepest <- depth * 3

  for ( d in 1:deepest ) {
    depthProb <- dpois(d, depth)
    for ( nBs in 0:d ) {
      splitProb <- dbinom(nBs, d, 0.5)
      q <- pOfGs(d - nBs, nBs, Q)$Qsample
      #Qsample = 1
      runningAvg <- runningAvg + q * depthProb * splitProb
      print(as.data.frame(list(d=d, nBs = nBs, pOfD=depthProb, pOfnB = splitProb, Qsample=q, weightedAvg = runningAvg)))
    }
  }

  runningAvg
}
plotQsamples <- function(depths, Qs, Qmax) {
  # Plot QsampleExpected against depths, one colored curve per quality in Qs, on a
  # y axis running from 0 to Qmax
  curveColors <- rainbow(length(Qs))

  # empty frame with the final axes; the curves are added below
  plot(depths, rep(Qmax, length(depths)), type="n", ylim=c(0,Qmax), xlab="Average sequencing coverage", ylab="Qsample", main = "Expected Qsample values, including depth and allele sampling")

  for ( i in 1:length(Qs) ) {
    expected <- as.numeric(lapply(depths, function(x) QsampleExpected(x, Qs[i])))
    points(depths, expected, col=curveColors[i], type="b")
  }

  legend("topleft", paste("Q", Qs), fill=curveColors)
}
pCallHetGivenDepth <- function(depth, nallelesToCall) {
  # For every read depth d in 0 .. 2 * depth: dpois(d, depth) times the probability
  # that a Binomial(d, 0.5) draw exceeds nallelesToCall
  candidateDepths <- 0:(2*depth)
  pTooFew <- vapply(candidateDepths,
                    function(d) sum(dbinom(0:nallelesToCall, d, 0.5)),
                    numeric(1))
  dpois(candidateDepths, depth) * (1 - pTooFew)
}
pCallHets <- function(depth, nallelesToCall) {
  # Sum of pCallHetGivenDepth over all the depths it considers
  perDepth <- pCallHetGivenDepth(depth, nallelesToCall)
  sum(perDepth)
}
pCallHetMultiSample <- function(depth, nallelesToCall, nsamples) {
  # One minus the probability that none of nsamples independent samples yields a call
  pMissOne <- 1 - pCallHets(depth, nallelesToCall)
  1 - pMissOne^nsamples
}

View File

@ -83,12 +83,12 @@ public final class IntervalBinding<T extends Feature> {
// TODO -- after ROD system cleanup, go through the ROD system so that we can handle things like gzipped files
FeatureCodec codec = new FeatureManager().getByName(featureIntervals.getTribbleType()).getCodec();
final FeatureCodec codec = new FeatureManager().getByName(featureIntervals.getTribbleType()).getCodec();
if ( codec instanceof ReferenceDependentFeatureCodec )
((ReferenceDependentFeatureCodec)codec).setGenomeLocParser(toolkit.getGenomeLocParser());
try {
FileInputStream fis = new FileInputStream(new File(featureIntervals.getSource()));
AsciiLineReader lineReader = new AsciiLineReader(fis);
final FileInputStream fis = new FileInputStream(new File(featureIntervals.getSource()));
final AsciiLineReader lineReader = new AsciiLineReader(fis);
codec.readHeader(lineReader);
String line = lineReader.readLine();
while ( line != null ) {

View File

@ -103,21 +103,6 @@ public abstract class CommandLineExecutable extends CommandLineProgram {
argumentSources.add(walker);
Collection<RMDTriplet> rodBindings = ListFileUtils.unpackRODBindings(parser.getRodBindings(), parser);
// todo: remove me when the old style system is removed
if ( getArgumentCollection().RODBindings.size() > 0 ) {
logger.warn("################################################################################");
logger.warn("################################################################################");
logger.warn("Deprecated -B rod binding syntax detected. This syntax has been eliminated in GATK 1.2.");
logger.warn("Please use arguments defined by each specific walker instead.");
for ( String oldStyleRodBinding : getArgumentCollection().RODBindings ) {
logger.warn(" -B rod binding with value " + oldStyleRodBinding + " tags: " + parser.getTags(oldStyleRodBinding).getPositionalTags());
}
logger.warn("################################################################################");
logger.warn("################################################################################");
System.exit(1);
}
engine.setReferenceMetaDataFiles(rodBindings);
for (ReadFilter filter: filters) {

View File

@ -100,10 +100,11 @@ public class CommandLineGATK extends CommandLineExecutable {
} catch(PicardException e) {
// TODO: Should Picard exceptions be, in general, UserExceptions or ReviewedStingExceptions?
exitSystemWithError(e);
}
catch (SAMException e) {
} catch (SAMException e) {
checkForTooManyOpenFilesProblem(e.getMessage());
exitSystemWithSamError(e);
} catch (OutOfMemoryError e) {
exitSystemWithUserError(new UserException.NotEnoughMemory());
} catch (Throwable t) {
checkForTooManyOpenFilesProblem(t.getMessage());
exitSystemWithError(t);

View File

@ -26,7 +26,9 @@ package org.broadinstitute.sting.gatk;
import net.sf.picard.reference.IndexedFastaSequenceFile;
import net.sf.picard.reference.ReferenceSequenceFile;
import net.sf.samtools.*;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMSequenceDictionary;
import org.apache.log4j.Logger;
import org.broad.tribble.Feature;
import org.broadinstitute.sting.commandline.*;
@ -35,8 +37,6 @@ import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
import org.broadinstitute.sting.gatk.datasources.reads.*;
import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource;
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
import org.broadinstitute.sting.gatk.samples.SampleDB;
import org.broadinstitute.sting.gatk.executive.MicroScheduler;
import org.broadinstitute.sting.gatk.filters.FilterManager;
import org.broadinstitute.sting.gatk.filters.ReadFilter;
@ -45,6 +45,8 @@ import org.broadinstitute.sting.gatk.io.OutputTracker;
import org.broadinstitute.sting.gatk.io.stubs.Stub;
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackBuilder;
import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet;
import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
import org.broadinstitute.sting.gatk.samples.SampleDB;
import org.broadinstitute.sting.gatk.samples.SampleDBBuilder;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.*;
@ -190,7 +192,7 @@ public class GenomeAnalysisEngine {
private BaseRecalibration baseRecalibration = null;
public BaseRecalibration getBaseRecalibration() { return baseRecalibration; }
public boolean hasBaseRecalibration() { return baseRecalibration != null; }
public void setBaseRecalibration(File recalFile) { baseRecalibration = new BaseRecalibration(recalFile); }
public void setBaseRecalibration(File recalFile, int quantizationLevels) { baseRecalibration = new BaseRecalibration(recalFile, quantizationLevels); }
/**
* Actually run the GATK with the specified walker.
@ -216,7 +218,7 @@ public class GenomeAnalysisEngine {
// if the user specified an input BQSR recalibration table then enable on the fly recalibration
if (this.getArguments().BQSR_RECAL_FILE != null)
setBaseRecalibration(this.getArguments().BQSR_RECAL_FILE);
setBaseRecalibration(this.getArguments().BQSR_RECAL_FILE, this.getArguments().quantizationLevels);
// Determine how the threads should be divided between CPU vs. IO.
determineThreadAllocation();
@ -356,10 +358,6 @@ public class GenomeAnalysisEngine {
public BAQ.QualityMode getWalkerBAQQualityMode() { return WalkerManager.getBAQQualityMode(walker); }
public BAQ.ApplicationTime getWalkerBAQApplicationTime() { return WalkerManager.getBAQApplicationTime(walker); }
protected boolean generateExtendedEvents() {
return walker.generateExtendedEvents();
}
protected boolean includeReadsWithDeletionAtLoci() {
return walker.includeReadsWithDeletionAtLoci();
}
@ -613,7 +611,7 @@ public class GenomeAnalysisEngine {
*/
protected GenomeLocSortedSet loadIntervals( List<IntervalBinding<Feature>> argList, IntervalSetRule rule ) {
List<GenomeLoc> allIntervals = new ArrayList<GenomeLoc>(0);
List<GenomeLoc> allIntervals = new ArrayList<GenomeLoc>();
for ( IntervalBinding intervalBinding : argList ) {
List<GenomeLoc> intervals = intervalBinding.getIntervals(this);
@ -766,7 +764,6 @@ public class GenomeAnalysisEngine {
new ValidationExclusion(Arrays.asList(argCollection.unsafe)),
filters,
includeReadsWithDeletionAtLoci(),
generateExtendedEvents(),
getWalkerBAQApplicationTime() == BAQ.ApplicationTime.ON_INPUT ? argCollection.BAQMode : BAQ.CalculationMode.OFF,
getWalkerBAQQualityMode(),
refReader,

View File

@ -36,7 +36,6 @@ public class ReadProperties {
private final Collection<ReadFilter> supplementalFilters;
private final boolean includeReadsWithDeletionAtLoci;
private final boolean useOriginalBaseQualities;
private final boolean generateExtendedEvents;
private final BAQ.CalculationMode cmode;
private final BAQ.QualityMode qmode;
private final IndexedFastaSequenceFile refReader; // read for BAQ, if desired
@ -52,16 +51,9 @@ public class ReadProperties {
return includeReadsWithDeletionAtLoci;
}
/**
* Return true if the walker wants to see additional piles of "extended" events (indels). An indel is associated,
* by convention, with the reference base immediately preceding the insertion/deletion, and if this flag is set
* to 'true', any locus with an indel associated with it will cause exactly two subsequent calls to walker's map(): first call
* will be made with a "conventional" base pileup, the next call will be made with a pileup of extended (indel/noevent)
* events.
* @return
*/
@Deprecated
public boolean generateExtendedEvents() {
return generateExtendedEvents;
return false;
}
/**
@ -144,9 +136,6 @@ public class ReadProperties {
* @param downsamplingMethod Method for downsampling reads at a given locus.
* @param exclusionList what safety checks we're willing to let slide
* @param supplementalFilters additional filters to dynamically apply.
* @param generateExtendedEvents if true, the engine will issue an extra call to walker's map() with
* a pile of indel/noevent extended events at every locus with at least one indel associated with it
* (in addition to a "regular" call to map() at this locus performed with base pileup)
* @param includeReadsWithDeletionAtLoci if 'true', the base pileups sent to the walker's map() method
* will explicitly list reads with deletion over the current reference base; otherwise, only observed
* bases will be seen in the pileups, and the deletions will be skipped silently.
@ -163,7 +152,6 @@ public class ReadProperties {
ValidationExclusion exclusionList,
Collection<ReadFilter> supplementalFilters,
boolean includeReadsWithDeletionAtLoci,
boolean generateExtendedEvents,
BAQ.CalculationMode cmode,
BAQ.QualityMode qmode,
IndexedFastaSequenceFile refReader,
@ -176,7 +164,6 @@ public class ReadProperties {
this.exclusionList = exclusionList == null ? new ValidationExclusion() : exclusionList;
this.supplementalFilters = supplementalFilters;
this.includeReadsWithDeletionAtLoci = includeReadsWithDeletionAtLoci;
this.generateExtendedEvents = generateExtendedEvents;
this.useOriginalBaseQualities = useOriginalBaseQualities;
this.cmode = cmode;
this.qmode = qmode;

View File

@ -107,11 +107,6 @@ public class GATKArgumentCollection {
@Input(fullName = "reference_sequence", shortName = "R", doc = "Reference sequence file", required = false)
public File referenceFile = null;
@Deprecated
@Hidden
@Input(fullName = "rodBind", shortName = "B", doc = "Bindings for reference-ordered data, in the form :<name>,<type> <file>", required = false)
public ArrayList<String> RODBindings = new ArrayList<String>();
@Argument(fullName = "nonDeterministicRandomSeed", shortName = "ndrs", doc = "Makes the GATK behave non deterministically, that is, the random numbers generated will be different in every run", required = false)
public boolean nonDeterministicRandomSeed = false;
@ -198,6 +193,16 @@ public class GATKArgumentCollection {
@Input(fullName="BQSR", shortName="BQSR", required=false, doc="Filename for the input covariates table recalibration .csv file which enables on the fly base quality score recalibration")
public File BQSR_RECAL_FILE = null; // BUGBUG: need a better argument name once we decide how BQSRs v1 and v2 will live in the code base simultaneously
/**
* Turns on the base quantization module. It requires a recalibration report (-BQSR).
*
* A value of 0 here means "do not quantize".
* Any value greater than zero will be used to recalculate the quantization using this many levels.
* Negative values do nothing (i.e. quantize using the recalibration report's quantization level -- same as not providing this parameter at all)
*/
@Argument(fullName="quantize_quals", shortName = "qq", doc = "Quantize quality scores to a given number of levels.", required=false)
public int quantizationLevels = -1;
@Argument(fullName="defaultBaseQualities", shortName = "DBQ", doc = "If reads are missing some or all base quality scores, this value will be used for all base quality scores", required=false)
public byte defaultBaseQualities = -1;

View File

@ -98,6 +98,7 @@ public class AlignmentContext implements HasGenomeLocation {
* only base pileup.
* @return
*/
@Deprecated
public ReadBackedExtendedEventPileup getExtendedEventPileup() {
if(!hasExtendedEventPileup())
throw new ReviewedStingException("No extended event pileup is present.");
@ -115,6 +116,7 @@ public class AlignmentContext implements HasGenomeLocation {
*
* @return
*/
@Deprecated
public boolean hasExtendedEventPileup() { return basePileup instanceof ReadBackedExtendedEventPileup; }
/**

View File

@ -191,6 +191,16 @@ public class ReferenceContext {
return basesCache;
}
/**
 * All the bases in the window from the current base forward to the end of the window.
 *
 * @return a newly allocated array holding the bases returned by getBases(), starting at
 *         offset locus.getStart() - window.getStart() and running to the end
 */
public byte[] getForwardBases() {
    final byte[] bases = getBases();
    final int mid = locus.getStart() - window.getStart();
    // Copy the tail of the array directly: going through a String decodes and re-encodes
    // the bytes with the platform default charset and copies the data twice.
    return java.util.Arrays.copyOfRange(bases, mid, bases.length);
}
@Deprecated
public char getBaseAsChar() {
return (char)getBase();

View File

@ -28,6 +28,7 @@ import net.sf.samtools.SAMRecord;
import net.sf.samtools.util.CloseableIterator;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
import java.util.List;
import java.util.NoSuchElementException;
@ -154,8 +155,8 @@ class IntervalOverlapFilteringIterator implements CloseableIterator<SAMRecord> {
}
}
else {
// Found an unmapped read. We're done.
if(candidateRead.getReadUnmappedFlag()) {
// Found a -L UNMAPPED read. NOTE: this is different than just being flagged as unmapped! We're done.
if(AlignmentUtils.isReadGenomeLocUnmapped(candidateRead)) {
nextRead = candidateRead;
break;
}

View File

@ -167,7 +167,6 @@ public class SAMDataSource {
null,
new ValidationExclusion(),
new ArrayList<ReadFilter>(),
false,
false);
}
@ -185,8 +184,7 @@ public class SAMDataSource {
DownsamplingMethod downsamplingMethod,
ValidationExclusion exclusionList,
Collection<ReadFilter> supplementalFilters,
boolean includeReadsWithDeletionAtLoci,
boolean generateExtendedEvents) {
boolean includeReadsWithDeletionAtLoci) {
this( samFiles,
threadAllocation,
numFileHandles,
@ -198,7 +196,6 @@ public class SAMDataSource {
exclusionList,
supplementalFilters,
includeReadsWithDeletionAtLoci,
generateExtendedEvents,
BAQ.CalculationMode.OFF,
BAQ.QualityMode.DONT_MODIFY,
null, // no BAQ
@ -215,9 +212,6 @@ public class SAMDataSource {
* @param downsamplingMethod Method for downsampling reads at a given locus.
* @param exclusionList what safety checks we're willing to let slide
* @param supplementalFilters additional filters to dynamically apply.
* @param generateExtendedEvents if true, the engine will issue an extra call to walker's map() with
* a pile of indel/noevent extended events at every locus with at least one indel associated with it
* (in addition to a "regular" call to map() at this locus performed with base pileup)
* @param includeReadsWithDeletionAtLoci if 'true', the base pileups sent to the walker's map() method
* will explicitly list reads with deletion over the current reference base; otherwise, only observed
* bases will be seen in the pileups, and the deletions will be skipped silently.
@ -235,7 +229,6 @@ public class SAMDataSource {
ValidationExclusion exclusionList,
Collection<ReadFilter> supplementalFilters,
boolean includeReadsWithDeletionAtLoci,
boolean generateExtendedEvents,
BAQ.CalculationMode cmode,
BAQ.QualityMode qmode,
IndexedFastaSequenceFile refReader,
@ -308,7 +301,6 @@ public class SAMDataSource {
exclusionList,
supplementalFilters,
includeReadsWithDeletionAtLoci,
generateExtendedEvents,
cmode,
qmode,
refReader,

View File

@ -40,17 +40,26 @@ public class BadCigarFilter extends ReadFilter {
public boolean filterOut(final SAMRecord rec) {
Cigar c = rec.getCigar();
boolean lastElementWasIndel = false;
for ( CigarElement ce : c.getCigarElements() ) {
if ( ce.getOperator() == CigarOperator.D || ce.getOperator() == CigarOperator.I ) {
if ( lastElementWasIndel )
return true;
lastElementWasIndel = true;
} else {
lastElementWasIndel = false;
boolean previousElementWasIndel = false;
CigarOperator lastOp = c.getCigarElement(0).getOperator();
if (lastOp == CigarOperator.D) // filter out reads starting with deletion
return true;
for (CigarElement ce : c.getCigarElements()) {
CigarOperator op = ce.getOperator();
if (op == CigarOperator.D || op == CigarOperator.I) {
if (previousElementWasIndel)
return true; // filter out reads with adjacent I/D
previousElementWasIndel = true;
}
else // this is a regular base (match/mismatch/hard or soft clip)
previousElementWasIndel = false; // reset the previous element
lastOp = op;
}
return false;
return lastOp == CigarOperator.D;
}
}

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2009 The Broad Institute
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
@ -12,7 +12,6 @@
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
@ -99,8 +98,13 @@ public class VCFWriterStub implements Stub<VCFWriter>, VCFWriter {
/**
* Create a new stub given the requested file.
*
* @param engine engine.
* @param genotypeFile file to (ultimately) create.
* @param isCompressed should we compress the output stream?
* @param argumentSources sources.
* @param skipWritingHeader skip writing header.
* @param doNotWriteGenotypes do not write genotypes.
*/
public VCFWriterStub(GenomeAnalysisEngine engine, File genotypeFile, boolean isCompressed, Collection<Object> argumentSources, boolean skipWritingHeader, boolean doNotWriteGenotypes) {
this.engine = engine;
@ -114,8 +118,13 @@ public class VCFWriterStub implements Stub<VCFWriter>, VCFWriter {
/**
* Create a new stub given the requested file.
*
* @param engine engine.
* @param genotypeStream stream to (ultimately) write.
* @param isCompressed should we compress the output stream?
* @param argumentSources sources.
* @param skipWritingHeader skip writing header.
* @param doNotWriteGenotypes do not write genotypes.
*/
public VCFWriterStub(GenomeAnalysisEngine engine, OutputStream genotypeStream, boolean isCompressed, Collection<Object> argumentSources, boolean skipWritingHeader, boolean doNotWriteGenotypes) {
this.engine = engine;
@ -154,7 +163,7 @@ public class VCFWriterStub implements Stub<VCFWriter>, VCFWriter {
/**
* Gets the master sequence dictionary from the engine associated with this stub
* @link GenomeAnalysisEngine.getMasterSequenceDictionary
* @return
* @return the master sequence dictionary from the engine associated with this stub
*/
public SAMSequenceDictionary getMasterSequenceDictionary() {
return engine.getMasterSequenceDictionary();
@ -188,22 +197,25 @@ public class VCFWriterStub implements Stub<VCFWriter>, VCFWriter {
vcfHeader = header;
// Check for the command-line argument header line. If not present, add it in.
if ( !skipWritingHeader ) {
VCFHeaderLine commandLineArgHeaderLine = getCommandLineArgumentHeaderLine();
boolean foundCommandLineHeaderLine = false;
for (VCFHeaderLine line: vcfHeader.getMetaData()) {
if ( line.getKey().equals(commandLineArgHeaderLine.getKey()) )
foundCommandLineHeaderLine = true;
if (!skipWritingHeader && header.isWriteEngineHeaders()) {
if (header.isWriteCommandLine()) {
VCFHeaderLine commandLineArgHeaderLine = getCommandLineArgumentHeaderLine();
boolean foundCommandLineHeaderLine = false;
for (VCFHeaderLine line: vcfHeader.getMetaData()) {
if ( line.getKey().equals(commandLineArgHeaderLine.getKey()) )
foundCommandLineHeaderLine = true;
}
if ( !foundCommandLineHeaderLine )
vcfHeader.addMetaDataLine(commandLineArgHeaderLine);
}
if ( !foundCommandLineHeaderLine )
vcfHeader.addMetaDataLine(commandLineArgHeaderLine);
// also put in the reference contig header lines
String assembly = getReferenceAssembly(engine.getArguments().referenceFile.getName());
for ( SAMSequenceRecord contig : engine.getReferenceDataSource().getReference().getSequenceDictionary().getSequences() )
vcfHeader.addMetaDataLine(getContigHeaderLine(contig, assembly));
vcfHeader.addMetaDataLine(new VCFHeaderLine("reference", "file://" + engine.getArguments().referenceFile.getAbsolutePath()));
vcfHeader.addMetaDataLine(new VCFHeaderLine(VCFHeader.REFERENCE_KEY, "file://" + engine.getArguments().referenceFile.getAbsolutePath()));
}
outputTracker.getStorage(this).writeHeader(vcfHeader);
@ -225,7 +237,7 @@ public class VCFWriterStub implements Stub<VCFWriter>, VCFWriter {
/**
* Gets a string representation of this object.
* @return
* @return a string representation of this object.
*/
@Override
public String toString() {
@ -247,20 +259,20 @@ public class VCFWriterStub implements Stub<VCFWriter>, VCFWriter {
val = String.format("<ID=%s,length=%d,assembly=%s>", contig.getSequenceName(), contig.getSequenceLength(), assembly);
else
val = String.format("<ID=%s,length=%d>", contig.getSequenceName(), contig.getSequenceLength());
return new VCFHeaderLine("contig", val);
return new VCFHeaderLine(VCFHeader.CONTIG_KEY, val);
}
private String getReferenceAssembly(String refPath) {
// This doesn't need to be perfect as it's not a required VCF header line, but we might as well give it a shot
String assembly = null;
if ( refPath.indexOf("b37") != -1 || refPath.indexOf("v37") != -1 )
if (refPath.contains("b37") || refPath.contains("v37"))
assembly = "b37";
else if ( refPath.indexOf("b36") != -1 )
else if (refPath.contains("b36"))
assembly = "b36";
else if ( refPath.indexOf("hg18") != -1 )
else if (refPath.contains("hg18"))
assembly = "hg18";
else if ( refPath.indexOf("hg19") != -1 )
else if (refPath.contains("hg19"))
assembly = "hg19";
return assembly;
}
}
}

View File

@ -179,6 +179,11 @@ public class LocusIteratorByState extends LocusIterator {
return ( cigarElementCounter + 1 > curElement.getLength() && cigarOffset + 1 < nCigarElements ? cigar.getCigarElement(cigarOffset + 1) : curElement );
}
public CigarElement peekBackwardOnGenome() {
    // Counterpart of peekForwardOnGenome(): when the counter says we are on the
    // first position of the current element, hand back the element before it;
    // in every other case the current element is returned.
    final boolean onFirstPositionOfElement = (cigarElementCounter - 1 == 0);
    // NOTE(review): this bound is strict, so the element at index 0 is never
    // returned (cigarOffset == 1 falls through to curElement). Confirm whether
    // ">= 0" was intended before relying on this for reads whose first element
    // is an insertion or a clip.
    final boolean previousIndexIsPositive = (cigarOffset - 1 > 0);
    if (onFirstPositionOfElement && previousIndexIsPositive)
        return cigar.getCigarElement(cigarOffset - 1);
    return curElement;
}
public CigarOperator stepForwardOnGenome() {
// we enter this method with readOffset = index of the last processed base on the read
// (-1 if we did not process a single base yet); this can be last matching base, or last base of an insertion
@ -194,7 +199,7 @@ public class LocusIteratorByState extends LocusIterator {
return stepForwardOnGenome();
} else {
if (curElement != null && curElement.getOperator() == CigarOperator.D)
throw new UserException.MalformedBAM(read, "read ends with deletion. Cigar: " + read.getCigarString());
throw new UserException.MalformedBAM(read, "read ends with deletion. Cigar: " + read.getCigarString() + ". This is an indication of a malformed file, but the SAM spec allows reads ending in deletion. If you are sure you want to use this read, re-run your analysis with the extra option: -rf BadCigar");
// Reads that contain indels model the genomeOffset as the following base in the reference. Because
// we fall into this else block only when indels end the read, increment genomeOffset such that the
@ -231,7 +236,7 @@ public class LocusIteratorByState extends LocusIterator {
// we see insertions only once, when we step right onto them; the position on the read is scrolled
// past the insertion right after that
if (eventDelayedFlag > 1)
throw new UserException.MalformedBAM(read, String.format("Adjacent I/D events in read %s -- cigar: %s", read.getReadName(), read.getCigarString()));
throw new UserException.MalformedBAM(read, String.format("Adjacent I/D events in read %s -- cigar: %s. This is an indication of a malformed file, but the SAM spec allows reads with adjacent insertion/deletion. If you are sure you want to use this read, re-run your analysis with the extra option: -rf BadCigar", read.getReadName(), read.getCigarString()));
insertedBases = Arrays.copyOfRange(read.getReadBases(), readOffset + 1, readOffset + 1 + curElement.getLength());
eventLength = curElement.getLength();
eventStart = readOffset;
@ -244,13 +249,13 @@ public class LocusIteratorByState extends LocusIterator {
break;
case D: // deletion w.r.t. the reference
if (readOffset < 0) // we don't want reads starting with deletion, this is a malformed cigar string
throw new UserException.MalformedBAM(read, "Read starting with deletion. Cigar: " + read.getCigarString());
throw new UserException.MalformedBAM(read, "Read starting with deletion. Cigar: " + read.getCigarString() + ". This is an indication of a malformed file, but the SAM spec allows reads starting in deletion. If you are sure you want to use this read, re-run your analysis with the extra option: -rf BadCigar");
if (generateExtendedEvents) {
if (cigarElementCounter == 1) {
// generate an extended event only if we just stepped into the deletion (i.e. don't
// generate the event at every deleted position on the ref, that's what cigarElementCounter==1 is for!)
if (eventDelayedFlag > 1)
throw new UserException.MalformedBAM(read, String.format("Adjacent I/D events in read %s -- cigar: %s", read.getReadName(), read.getCigarString()));
throw new UserException.MalformedBAM(read, String.format("Adjacent I/D events in read %s -- cigar: %s. This is an indication of a malformed file, but the SAM spec allows reads with adjacent insertion/deletion. If you are sure you want to use this read, re-run your analysis with the extra option: -rf BadCigar", read.getReadName(), read.getCigarString()));
eventLength = curElement.getLength();
eventDelayedFlag = 2; // deletion on the ref causes an immediate return, so we have to delay by 1 only
eventStart = readOffset;
@ -401,24 +406,24 @@ public class LocusIteratorByState extends LocusIterator {
while (iterator.hasNext()) {
final SAMRecordState state = iterator.next();
final GATKSAMRecord read = (GATKSAMRecord) state.getRead(); // the actual read
final CigarOperator op = state.getCurrentCigarOperator(); // current cigar operator
final int readOffset = state.getReadOffset(); // the base offset on this read
final int eventStartOffset = state.getReadEventStartOffset(); // this will be -1 if base is not a deletion, or if base is the first deletion in the event. Otherwise, it will give the last base before the deletion began.
final GATKSAMRecord read = (GATKSAMRecord) state.getRead(); // the actual read
final CigarOperator op = state.getCurrentCigarOperator(); // current cigar operator
final int readOffset = state.getReadOffset(); // the base offset on this read
final int eventStartOffset = state.getReadEventStartOffset(); // this will be -1 if base is not a deletion, or if base is the first deletion in the event. Otherwise, it will give the last base before the deletion began.
final int eventLength = state.getEventLength();
if (op == CigarOperator.N) // N's are never added to any pileup
if (op == CigarOperator.N) // N's are never added to any pileup
continue;
if (state.hadIndel()) { // this read has an indel associated with the previous position on the ref
if (state.hadIndel()) { // this read has an indel associated with the previous position on the ref
size++;
ExtendedEventPileupElement pileupElement;
if (state.getEventBases() == null) { // Deletion event
if (state.getEventBases() == null) { // Deletion event
nDeletions++;
maxDeletionLength = Math.max(maxDeletionLength, state.getEventLength());
pileupElement = new ExtendedEventPileupElement(read, eventStartOffset, eventLength);
}
else { // Insertion event
else { // Insertion event
nInsertions++;
pileupElement = new ExtendedEventPileupElement(read, eventStartOffset, eventLength, state.getEventBases());
}
@ -442,10 +447,10 @@ public class LocusIteratorByState extends LocusIterator {
if (indelPile.size() != 0)
fullExtendedEventPileup.put(sample, new ReadBackedExtendedEventPileupImpl(loc, indelPile, size, maxDeletionLength, nInsertions, nDeletions, nMQ0Reads));
}
hasExtendedEvents = false; // we are done with extended events prior to current ref base
hasExtendedEvents = false; // we are done with extended events prior to current ref base
nextAlignmentContext = new AlignmentContext(loc, new ReadBackedExtendedEventPileupImpl(loc, fullExtendedEventPileup), hasBeenSampled);
}
else { // this is a regular event pileup (not extended)
else { // this is a regular event pileup (not extended)
GenomeLoc location = getLocation();
Map<String, ReadBackedPileupImpl> fullPileup = new HashMap<String, ReadBackedPileupImpl>();
boolean hasBeenSampled = false;
@ -454,27 +459,34 @@ public class LocusIteratorByState extends LocusIterator {
List<PileupElement> pile = new ArrayList<PileupElement>(readStates.size(sample));
hasBeenSampled |= location.getStart() <= readStates.getDownsamplingExtent(sample);
size = 0; // number of elements in this sample's pileup
nDeletions = 0; // number of deletions in this sample's pileup
nMQ0Reads = 0; // number of MQ0 reads in this sample's pileup (warning: current implementation includes N bases that are MQ0)
size = 0; // number of elements in this sample's pileup
nDeletions = 0; // number of deletions in this sample's pileup
nMQ0Reads = 0; // number of MQ0 reads in this sample's pileup (warning: current implementation includes N bases that are MQ0)
while (iterator.hasNext()) {
final SAMRecordState state = iterator.next(); // state object with the read/offset information
final GATKSAMRecord read = (GATKSAMRecord) state.getRead(); // the actual read
final CigarOperator op = state.getCurrentCigarOperator(); // current cigar operator
final CigarElement nextElement = state.peekForwardOnGenome(); // next cigar element
final CigarOperator nextOp = nextElement.getOperator();
final int readOffset = state.getReadOffset(); // the base offset on this read
final SAMRecordState state = iterator.next(); // state object with the read/offset information
final GATKSAMRecord read = (GATKSAMRecord) state.getRead(); // the actual read
final CigarOperator op = state.getCurrentCigarOperator(); // current cigar operator
final CigarElement nextElement = state.peekForwardOnGenome(); // next cigar element
final CigarElement lastElement = state.peekBackwardOnGenome(); // last cigar element
final CigarOperator nextOp = nextElement.getOperator(); // next cigar operator
final CigarOperator lastOp = lastElement.getOperator(); // last cigar operator
final int readOffset = state.getReadOffset(); // the base offset on this read
final boolean isBeforeDeletion = nextOp == CigarOperator.DELETION;
final boolean isAfterDeletion = lastOp == CigarOperator.DELETION;
final boolean isBeforeInsertion = nextOp == CigarOperator.INSERTION;
final boolean isAfterInsertion = lastOp == CigarOperator.INSERTION;
final boolean isNextToSoftClip = nextOp == CigarOperator.S || (state.getGenomeOffset() == 0 && read.getSoftStart() != read.getAlignmentStart());
int nextElementLength = nextElement.getLength();
if (op == CigarOperator.N) // N's are never added to any pileup
if (op == CigarOperator.N) // N's are never added to any pileup
continue;
if (op == CigarOperator.D) {
if (readInfo.includeReadsWithDeletionAtLoci()) { // only add deletions to the pileup if we are authorized to do so
pile.add(new PileupElement(read, readOffset, true, nextOp == CigarOperator.D, nextOp == CigarOperator.I, nextOp == CigarOperator.S || (state.getGenomeOffset() == 0 && read.getSoftStart() != read.getAlignmentStart()),
null,nextOp == CigarOperator.D? nextElementLength:-1));
if (readInfo.includeReadsWithDeletionAtLoci()) { // only add deletions to the pileup if we are authorized to do so
pile.add(new PileupElement(read, readOffset, true, isBeforeDeletion, isAfterDeletion, isBeforeInsertion, isAfterInsertion, isNextToSoftClip, null, nextOp == CigarOperator.D ? nextElementLength : -1));
size++;
nDeletions++;
if (read.getMappingQuality() == 0)
@ -484,11 +496,10 @@ public class LocusIteratorByState extends LocusIterator {
else {
if (!filterBaseInRead(read, location.getStart())) {
String insertedBaseString = null;
if (nextOp == CigarOperator.I) {
if (nextOp == CigarOperator.I)
insertedBaseString = new String(Arrays.copyOfRange(read.getReadBases(), readOffset + 1, readOffset + 1 + nextElement.getLength()));
}
pile.add(new PileupElement(read, readOffset, false, nextOp == CigarOperator.D, nextOp == CigarOperator.I, nextOp == CigarOperator.S || (state.getGenomeOffset() == 0 && read.getSoftStart() != read.getAlignmentStart()),
insertedBaseString,nextElementLength));
pile.add(new PileupElement(read, readOffset, false, isBeforeDeletion, isAfterDeletion, isBeforeInsertion, isAfterInsertion, isNextToSoftClip, insertedBaseString, nextElementLength));
size++;
if (read.getMappingQuality() == 0)
nMQ0Reads++;

View File

@ -47,6 +47,14 @@ public class RefMetaDataTracker {
//
// ------------------------------------------------------------------------------------------
/**
 * Creates an empty tracker: no reference context and no bindings.
 * Intended only for testing.
 */
public RefMetaDataTracker() {
    this.ref = null;
    this.map = Collections.emptyMap();
}
public RefMetaDataTracker(final Collection<RODRecordList> allBindings, final ReferenceContext ref) {
this.ref = ref;
@ -418,7 +426,7 @@ public class RefMetaDataTracker {
* with the current site as a RODRecordList List object. If no data track with specified name is available,
* returns defaultValue wrapped as RODRecordList object. NOTE: if defaultValue is null, it will be wrapped up
* with track name set to 'name' and location set to null; otherwise the wrapper object will have name and
* location set to defaultValue.getName() and defaultValue.getLocation(), respectively (use caution,
* location set to defaultValue.getID() and defaultValue.getLocation(), respectively (use caution,
* defaultValue.getLocation() may be not equal to what RODRecordList's location would be expected to be otherwise:
* for instance, on locus traversal, location is usually expected to be a single base we are currently looking at,
* regardless of the presence of "extended" RODs overlapping with that location).

View File

@ -132,7 +132,7 @@ public class FeatureManager {
}
/**
* Return the FeatureDescriptor with getName().equals(name)
* Return the FeatureDescriptor with getID().equals(name)
*
* @param name
* @return A FeatureDescriptor or null if none is found

View File

@ -1,8 +1,32 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.report;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.StingException;
import org.broadinstitute.sting.utils.text.TextFormattingUtils;
import org.broadinstitute.sting.utils.exceptions.UserException;
import java.io.*;
import java.util.Collection;
@ -13,8 +37,12 @@ import java.util.TreeMap;
* Container class for GATK report tables
*/
public class GATKReport {
public static final String GATKREPORT_HEADER_PREFIX = "##:GATKReport.v";
private TreeMap<String, GATKReportTable> tables = new TreeMap<String, GATKReportTable>();
public static final String GATKREPORT_HEADER_PREFIX = "#:GATKReport.";
public static final GATKReportVersion LATEST_REPORT_VERSION = GATKReportVersion.V1_0;
private static final String SEPARATOR = ":";
private GATKReportVersion version = LATEST_REPORT_VERSION;
private final TreeMap<String, GATKReportTable> tables = new TreeMap<String, GATKReportTable>();
/**
* Create a new, empty GATKReport.
@ -24,7 +52,8 @@ public class GATKReport {
/**
* Create a new GATKReport with the contents of a GATKReport on disk.
* @param filename the path to the file to load
*
* @param filename the path to the file to load
*/
public GATKReport(String filename) {
this(new File(filename));
@ -32,114 +61,96 @@ public class GATKReport {
/**
* Create a new GATKReport with the contents of a GATKReport on disk.
* @param file the file to load
*
* @param file the file to load
*/
public GATKReport(File file) {
loadReport(file);
}
/**
* Load a GATKReport file from disk
* @param file the file to load
* Create a new GATK report from GATK report tables
* @param tables Any number of tables that you want to add to the report
*/
private void loadReport(File file) {
try {
BufferedReader reader = new BufferedReader(new FileReader(file));
GATKReportTable table = null;
String[] header = null;
int id = 0;
GATKReportVersion version = null;
List<Integer> columnStarts = null;
String line;
while ( (line = reader.readLine()) != null ) {
if (line.startsWith(GATKREPORT_HEADER_PREFIX)) {
version = GATKReportVersion.fromHeader(line);
line = line.replaceFirst("##:GATKReport." + version.versionString + " ", "");
String[] pieces = line.split(" : ");
String tableName = pieces[0];
String tableDesc = pieces[1];
addTable(tableName, tableDesc);
table = getTable(tableName);
table.setVersion(version);
header = null;
columnStarts = null;
} else if ( line.trim().isEmpty() ) {
// do nothing
} else {
if (table != null) {
String[] splitLine;
switch (version) {
case V0_1:
splitLine = TextFormattingUtils.splitWhiteSpace(line);
break;
case V0_2:
if (header == null) {
columnStarts = TextFormattingUtils.getWordStarts(line);
}
splitLine = TextFormattingUtils.splitFixedWidth(line, columnStarts);
break;
default:
throw new ReviewedStingException("GATK report version parsing not implemented for: " + line);
}
if (header == null) {
header = splitLine;
table.addPrimaryKey("id", false);
for ( String columnName : header ) {
table.addColumn(columnName, "");
}
id = 0;
} else {
for (int columnIndex = 0; columnIndex < header.length; columnIndex++) {
table.set(id, header[columnIndex], splitLine[columnIndex]);
}
id++;
}
}
}
}
} catch (FileNotFoundException e) {
throw new StingException("Cannot read GATKReport: " + e);
} catch (IOException e) {
throw new StingException("Cannot read GATKReport: " + e);
}
public GATKReport(GATKReportTable... tables) {
    // The parameter shadows the field of the same name; addTable() stores
    // each supplied table in the report's own map, keyed by its table name.
    for (int i = 0; i < tables.length; i++) {
        addTable(tables[i]);
    }
}
/**
* Add a new table to the collection
* Load a GATKReport file from disk
*
* @param tableName the name of the table
* @param tableDescription the description of the table
* @param file the file to load
*/
private void loadReport(File file) {
    // Opening can only fail because the path cannot be opened for reading.
    final BufferedReader reader;
    try {
        reader = new BufferedReader(new FileReader(file));
    } catch (FileNotFoundException e) {
        throw new ReviewedStingException("Could not open file : " + file);
    }

    try {
        final String reportHeader;
        try {
            reportHeader = reader.readLine();
        } catch (IOException e) {
            throw new ReviewedStingException("Could not read file : " + file);
        }

        // Read the first line for the version and number of tables.
        version = GATKReportVersion.fromHeader(reportHeader);
        if (version.equals(GATKReportVersion.V0_1) ||
                version.equals(GATKReportVersion.V0_2))
            throw new UserException("The GATK no longer supports reading legacy GATK Reports. Please use v1.0 or newer.");

        // The header has the form <prefix><version>:<number of tables>, so the
        // table count is the third ':'-separated field.
        int nTables = Integer.parseInt(reportHeader.split(":")[2]);

        // Read each table according to the number of tables.
        // NOTE(review): assumes the GATKReportTable(reader, version) constructor
        // finishes reading its table before returning, so the reader is not
        // needed once this loop completes -- confirm against that constructor.
        for (int i = 0; i < nTables; i++) {
            addTable(new GATKReportTable(reader, version));
        }
    } finally {
        // Release the file handle on every path (success, legacy-version
        // rejection, or a parse failure); previously the reader was never closed.
        try {
            reader.close();
        } catch (IOException e) {
            // Closing a read-only stream failed; there is nothing to recover and
            // it must not mask an exception already propagating from above.
        }
    }
}
/**
* Add a new, empty table to the report
*
* @param tableName the name of the table
* @param tableDescription the description of the table
*/
public void addTable(String tableName, String tableDescription) {
    // Two-argument form: delegate with sorting by primary key switched on.
    final boolean sortByPrimaryKey = true;
    addTable(tableName, tableDescription, sortByPrimaryKey);
}
/**
* Add a new, empty table to the report
*
* @param tableName the name of the table
* @param tableDescription the description of the table
* @param sortByPrimaryKey whether to sort the rows by the primary key
*/
public void addTable(String tableName, String tableDescription, boolean sortByPrimaryKey) {
    // Build the empty table and register it under its name; an existing
    // entry with the same name is replaced by the map's put().
    tables.put(tableName, new GATKReportTable(tableName, tableDescription, sortByPrimaryKey));
}
/**
* Adds a table, empty or populated, to the report
*
* @param table the table to add
*/
public void addTable(GATKReportTable table) {
    // The table's own name is the key it is stored under.
    final String key = table.getTableName();
    tables.put(key, table);
}
/**
 * Adds every table in the given list to the report, in list order.
 *
 * @param gatkReportTables the tables to add
 */
public void addTables(List<GATKReportTable> gatkReportTables) {
    for (final GATKReportTable current : gatkReportTables) {
        addTable(current);
    }
}
/**
* Return true if table with a given name exists
*
* @param tableName the name of the table
* @param tableName the name of the table
* @return true if the table exists, false otherwise
*/
public boolean hasTable(String tableName) {
@ -149,8 +160,8 @@ public class GATKReport {
/**
* Return a table with a given name
*
* @param tableName the name of the table
* @return the table object
* @param tableName the name of the table
* @return the table object
*/
public GATKReportTable getTable(String tableName) {
GATKReportTable table = tables.get(tableName);
@ -162,17 +173,164 @@ public class GATKReport {
/**
* Print all tables contained within this container to a PrintStream
*
* @param out the PrintStream to which the tables should be written
* @param out the PrintStream to which the tables should be written
*/
public void print(PrintStream out) {
for (GATKReportTable table : tables.values()) {
if (table.getNumRows() > 0) {
table.write(out);
}
}
out.println(GATKREPORT_HEADER_PREFIX + getVersion().toString() + SEPARATOR + getTables().size());
for (GATKReportTable table : tables.values())
table.write(out);
}
/**
 * Returns the tables held by this report.
 *
 * @return the values view of the internal name-to-table map (not a copy)
 */
public Collection<GATKReportTable> getTables() {
    return this.tables.values();
}
/**
 * This is the main function in charge of gathering the reports. It checks that the reports are compatible and then
 * calls the table gathering functions.
 *
 * @param input another GATKReport of the same format
 */
public void combineWith(GATKReport input) {
    final boolean compatible = this.isSameFormat(input);
    if (!compatible)
        throw new ReviewedStingException("Failed to combine GATKReport, format doesn't match!");

    // Merge each incoming table into the table of the same name in this report.
    for (final String name : input.tables.keySet()) {
        final GATKReportTable mine = tables.get(name);
        final GATKReportTable theirs = input.getTable(name);
        mine.combineWith(theirs);
    }
}
/**
 * Returns the format version of this report.
 *
 * @return the report version
 */
public GATKReportVersion getVersion() {
    return this.version;
}
/**
 * Returns whether or not the two reports have the same format, from columns, to tables, to reports, and everything
 * in between. This does not check if the data inside is the same. This is the check to see if the two reports are
 * gatherable or reduceable.
 *
 * @param report another GATK report
 * @return true if the reports are gatherable
 */
public boolean isSameFormat(GATKReport report) {
    // Cheap checks first: same version, then same set of table names.
    if (!version.equals(report.version) || !tables.keySet().equals(report.tables.keySet()))
        return false;

    // Every pair of same-named tables must agree on format.
    for (final String name : tables.keySet()) {
        final GATKReportTable mine = getTable(name);
        final GATKReportTable theirs = report.getTable(name);
        if (!mine.isSameFormat(theirs))
            return false;
    }
    return true;
}
/**
 * Checks that the reports are exactly the same.
 * <p/>
 * NOTE(review): this takes a GATKReport, so it overloads rather than overrides
 * Object.equals(Object); confirm that callers never rely on the Object version.
 *
 * @param report another GATK report
 * @return true if all fields in the reports, tables, and columns are equal.
 */
public boolean equals(GATKReport report) {
    // Same version and same set of table names are prerequisites.
    if (!version.equals(report.version) || !tables.keySet().equals(report.tables.keySet()))
        return false;

    // Then every pair of same-named tables must be equal.
    for (final String name : tables.keySet()) {
        final GATKReportTable mine = getTable(name);
        final GATKReportTable theirs = report.getTable(name);
        if (!mine.equals(theirs))
            return false;
    }
    return true;
}
/**
* The constructor for a simplified GATK Report. Simplified GATK report are designed for reports that do not need
* the advanced functionality of a full GATK Report.
* <p/>
* A simple GATK Report consists of:
* <p/>
* - A single table
* - No primary key ( it is hidden )
* <p/>
* Optional:
* - Only untyped columns. As long as the data is an Object, it will be accepted.
* - Default column values being empty strings.
* <p/>
* Limitations:
* <p/>
* - A simple GATK report cannot contain multiple tables.
* - It cannot contain typed columns, which prevents arithmetic gathering.
*
* @param tableName The name of your simple GATK report table
* @param columns The names of the columns in your table
* @return a simplified GATK report
*/
public static GATKReport newSimpleReport(String tableName, String... columns) {
    // One table with an "id" primary key (added with its flag set to false).
    final GATKReportTable simpleTable = new GATKReportTable(tableName, "A simplified GATK table report");
    simpleTable.addPrimaryKey("id", false);

    // Every requested column defaults to the empty string.
    for (int i = 0; i < columns.length; i++)
        simpleTable.addColumn(columns[i], "");

    final GATKReport report = new GATKReport();
    report.addTable(simpleTable);
    return report;
}
/**
* This method provides an efficient way to populate a simplified GATK report. This method will only work on reports
* that qualify as simplified GATK reports. See the newSimpleReport() constructor for more information.
*
* @param values the row of data to be added to the table.
* Note: the number of arguments must match the columns in the table.
*/
public void addRow(Object... values) {
    // Must be a simplified GATK Report
    if (!isSimpleReport())
        throw new StingException("Cannot add a Row to a non-Simplified GATK Report");

    final GATKReportTable table = tables.firstEntry().getValue();
    if (table.getColumns().size() != values.length)
        throw new StingException("The number of arguments in addRow() must match the number of columns in the table");

    // The new row's key is one past the current row count; values are
    // assigned to columns in the iteration order of the column map.
    final int rowKey = table.getNumRows() + 1;
    int valueIndex = 0;
    for (final String columnName : table.getColumns().keySet())
        table.set(rowKey, columnName, values[valueIndex++]);
}
/**
 * Checks if the GATK report qualifies as a "simple" GATK report: exactly one
 * table, whose primary key is named "id".
 *
 * @return true if the report is a simplified GATK report
 */
private boolean isSimpleReport() {
    // Short-circuit keeps firstEntry() from being dereferenced unless exactly one table exists.
    return tables.size() == 1
            && tables.firstEntry().getValue().getPrimaryKeyName().equals("id");
}
}

View File

@ -1,38 +1,78 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.report;
import org.apache.commons.lang.math.NumberUtils;
import java.util.*;
import java.util.Arrays;
import java.util.Collection;
import java.util.LinkedHashMap;
/**
* Holds values for a column in a GATK report table
*/
public class GATKReportColumn extends TreeMap<Object, Object> {
public class GATKReportColumn extends LinkedHashMap<Object, Object> {
final private String columnName;
final private Object defaultValue;
final private String format;
final private boolean display;
final private GATKReportDataType dataType;
private GATKReportColumnFormat columnFormat;
private GATKReportColumnFormat.Alignment alignment = GATKReportColumnFormat.Alignment.RIGHT; // default alignment is to the right unless values added ask for a left alignment
private int maxWidth = 0;
/**
* Construct the column object, specifying the column name, default value, and whether or not the column should be displayed
* Construct the column object, specifying the column name, default value, whether or not the column should be
* displayed, and the format string. This cannot be null.
*
* @param columnName the name of the column
* @param defaultValue the default value of the column
* @param display if true, the column will be displayed in the final output
* @param format format string
* @param columnName the name of the column
* @param defaultValue the default value of the column
* @param display if true, the column will be displayed in the final output
* @param format format string
*/
public GATKReportColumn(String columnName, Object defaultValue, boolean display, String format) {
this.columnName = columnName;
this.defaultValue = defaultValue;
this.maxWidth = columnName.length();
this.display = display;
this.format = format == null ? null : (format.equals("") ? null : format);
if ( format.equals("") ) {
this.format = "%s";
this.dataType = GATKReportDataType.Unknown;
this.defaultValue = (defaultValue != null) ? defaultValue : "";
}
else {
this.format = format;
this.dataType = GATKReportDataType.fromFormatString(format);
this.defaultValue = (defaultValue != null) ? defaultValue : dataType.getDefaultValue();
}
}
/**
* Initialize an element in the column with a default value
*
* @param primaryKey the primary key position in the column that should be set
* @param primaryKey the primary key position in the column that should be set
*/
public void initialize(Object primaryKey) {
this.put(primaryKey, defaultValue);
@ -40,11 +80,12 @@ public class GATKReportColumn extends TreeMap<Object, Object> {
/**
* Return an object from the column, but if it doesn't exist, return the default value. This is useful when writing
* tables, as the table gets written properly without having to waste storage for the unset elements (usually the zero
* tables, as the table gets written properly without having to waste storage for the unset elements (usually the
* zero
* values) in the table.
*
* @param primaryKey the primary key position in the column that should be retrieved
* @return the value at the specified position in the column, or the default value if the element is not set
* @param primaryKey the primary key position in the column that should be retrieved
* @return the value at the specified position in the column, or the default value if the element is not set
*/
private Object getWithoutSideEffects(Object primaryKey) {
if (!this.containsKey(primaryKey)) {
@ -57,8 +98,8 @@ public class GATKReportColumn extends TreeMap<Object, Object> {
/**
* Return an object from the column, but if it doesn't exist, return the default value.
*
* @param primaryKey the primary key position in the column that should be retrieved
* @return the string value at the specified position in the column, or the default value if the element is not set
* @param primaryKey the primary key position in the column that should be retrieved
* @return the string value at the specified position in the column, or the default value if the element is not set
*/
public String getStringValue(Object primaryKey) {
return formatValue(getWithoutSideEffects(primaryKey));
@ -68,38 +109,24 @@ public class GATKReportColumn extends TreeMap<Object, Object> {
* Return the displayable property of the column. If true, the column will be displayed in the final output.
* If not, printing will be suppressed for the contents of the table.
*
* @return true if the column will be displayed, false if otherwise
* @return true if the column will be displayed, false if otherwise
*/
public boolean isDisplayable() {
return display;
}
/**
* Get the display width for this column. This allows the entire column to be displayed with the appropriate, fixed width.
* Get the display width for this column. This allows the entire column to be displayed with the appropriate, fixed
* width.
*
* @return the format string for this column
*/
public GATKReportColumnFormat getColumnFormat() {
int maxWidth = columnName.length();
GATKReportColumnFormat.Alignment alignment = GATKReportColumnFormat.Alignment.RIGHT;
if (columnFormat != null)
return columnFormat;
for (Object obj : this.values()) {
if (obj != null) {
String formatted = formatValue(obj);
int width = formatted.length();
if (width > maxWidth) {
maxWidth = width;
}
if (alignment == GATKReportColumnFormat.Alignment.RIGHT) {
if (!isRightAlign(formatted)) {
alignment = GATKReportColumnFormat.Alignment.LEFT;
}
}
}
}
return new GATKReportColumnFormat(maxWidth, alignment);
columnFormat = new GATKReportColumnFormat(maxWidth, alignment);
return columnFormat;
}
private static final Collection<String> RIGHT_ALIGN_STRINGS = Arrays.asList(
@ -112,15 +139,17 @@ public class GATKReportColumn extends TreeMap<Object, Object> {
/**
* Check if the value can be right aligned. Does not trim the values before checking if numeric since it assumes
* the spaces mean that the value is already padded.
*
* @param value to check
* @return true if the value is a right alignable
*/
protected static boolean isRightAlign(String value) {
return value == null || RIGHT_ALIGN_STRINGS.contains(value) || NumberUtils.isNumber(value);
return value == null || RIGHT_ALIGN_STRINGS.contains(value) || NumberUtils.isNumber(value.trim());
}
/**
* Returns a string version of the values.
*
* @param obj The object to convert to a string
* @return The string representation of the column
*/
@ -128,19 +157,76 @@ public class GATKReportColumn extends TreeMap<Object, Object> {
String value;
if (obj == null) {
value = "null";
} else if ( format != null ) {
value = String.format(format, obj);
} else if (obj instanceof Float) {
value = String.format("%.8f", (Float) obj);
} else if (obj instanceof Double) {
value = String.format("%.8f", (Double) obj);
} else {
value = obj.toString();
}
else if ( dataType.equals(GATKReportDataType.Unknown) && (obj instanceof Double || obj instanceof Float) ) {
value = String.format("%.8f", obj);
}
else
value = String.format(format, obj);
return value;
}
/**
 * Returns the data type recorded for this column.
 *
 * @return the data type of this column
 */
public GATKReportDataType getDataType() {
return dataType;
}
/**
 * Checks whether another column has the same layout as this one: same data type, column name, display flag,
 * format string and default value. The stored values are not compared.
 *
 * @param that the column to compare against
 * @return true if both columns share the same format
 */
public boolean isSameFormat(GATKReportColumn that) {
    return (dataType.equals(that.dataType) &&
            columnName.equals(that.columnName) &&
            display == that.display &&
            format.equals(that.format) &&
            // defaultValue may be null when the data type supplies no default, so compare it null-safely
            (defaultValue == null ? that.defaultValue == null : defaultValue.equals(that.defaultValue)));
}
/**
 * Compares the contents of this column with those of another column. Both columns must hold the same set of
 * primary keys, and the values stored under each key must be equal according to this column's data type.
 *
 * @param that the column to compare against
 * @return true if both columns hold equal values under the same keys
 */
boolean equals(GATKReportColumn that) {
    if (this.keySet().equals(that.keySet())) {
        for (Object primaryKey : keySet()) {
            final Object mine = this.get(primaryKey);
            final Object theirs = that.get(primaryKey);

            // let the data type decide how two values of its kind are compared
            if (!dataType.isEqual(mine, theirs))
                return false;
        }
        return true;
    }
    return false;
}
/**
 * Returns the name of this column.
 *
 * @return the column name given at construction
 */
public String getColumnName() {
return columnName;
}
/**
 * Returns the format string of this column, or the empty string when the column's data type is Unknown.
 *
 * @return the format string, or "" for a column of Unknown data type
 */
public String getFormat() {
    return dataType.equals(GATKReportDataType.Unknown) ? "" : format;
}
/**
 * Stores a value in the column and keeps the display bookkeeping (maximum width and alignment) up to date.
 * Null values and values that format to the empty string do not affect width or alignment.
 *
 * @param key   the primary key the value is stored under
 * @param value the value to store
 * @return the value previously stored under the key, as returned by the superclass
 */
@Override
public Object put(Object key, Object value) {
    if (value != null) {
        String formatted = formatValue(value);
        if (!formatted.equals("")) {
            updateMaxWidth(formatted);
            updateFormat(formatted);
            // Width or alignment may have changed, so drop the cached format object;
            // getColumnFormat() rebuilds it from the current values on its next call.
            columnFormat = null;
        }
    }
    return super.put(key, value);
}
/**
 * Widens the recorded maximum width if the given formatted value is longer than anything seen so far.
 *
 * @param formatted the formatted value that was just added
 */
private void updateMaxWidth(String formatted) {
    final int width = formatted.length();
    if (width > maxWidth)
        maxWidth = width;
}
/**
 * Switches the column to left alignment as soon as a value that cannot be right aligned is seen. Once the
 * column is left aligned it stays that way.
 *
 * @param formatted the formatted value that was just added
 */
private void updateFormat(String formatted) {
    if (alignment == GATKReportColumnFormat.Alignment.RIGHT && !isRightAlign(formatted))
        alignment = GATKReportColumnFormat.Alignment.LEFT;
}
}

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2011, The Broad Institute
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
@ -29,8 +29,8 @@ package org.broadinstitute.sting.gatk.report;
*/
public class GATKReportColumnFormat {
public static enum Alignment { LEFT, RIGHT }
public int width;
public Alignment alignment;
private final int width;
private final Alignment alignment;
public GATKReportColumnFormat(int width, Alignment alignment) {
this.width = width;

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2011, The Broad Institute
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
@ -24,7 +24,7 @@
package org.broadinstitute.sting.gatk.report;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.*;
@ -32,10 +32,11 @@ import java.util.*;
* Tracks a linked list of GATKReportColumn in order by name.
*/
public class GATKReportColumns extends LinkedHashMap<String, GATKReportColumn> implements Iterable<GATKReportColumn> {
private List<String> columnNames = new ArrayList<String>();
private final List<String> columnNames = new ArrayList<String>();
/**
* Returns the column by index
*
* @param i the index
* @return The column
*/
@ -44,9 +45,12 @@ public class GATKReportColumns extends LinkedHashMap<String, GATKReportColumn> i
}
@Override
public GATKReportColumn remove(Object key) {
columnNames.remove(key);
return super.remove(key);
public GATKReportColumn remove(Object columnName) {
if ( !(columnName instanceof String) ) {
throw new ReviewedStingException("The column name must be a String!");
}
columnNames.remove(columnName.toString());
return super.remove(columnName);
}
@Override
@ -59,9 +63,44 @@ public class GATKReportColumns extends LinkedHashMap<String, GATKReportColumn> i
public Iterator<GATKReportColumn> iterator() {
return new Iterator<GATKReportColumn>() {
int offset = 0;
public boolean hasNext() { return offset < columnNames.size() ; }
public GATKReportColumn next() { return getByIndex(offset++); }
public void remove() { throw new UnsupportedOperationException("Cannot remove from a GATKReportColumn iterator"); }
public boolean hasNext() {
return offset < columnNames.size();
}
public GATKReportColumn next() {
return getByIndex(offset++);
}
public void remove() {
throw new UnsupportedOperationException("Cannot remove from a GATKReportColumn iterator");
}
};
}
/**
 * Checks whether another set of columns has the same layout as this one: the same column names in the same
 * order, with every pair of same-named columns sharing the same format.
 *
 * @param that the columns to compare against
 * @return true if both sets of columns share the same format
 */
public boolean isSameFormat(GATKReportColumns that) {
    if (columnNames.equals(that.columnNames)) {
        for (final String name : columnNames) {
            final GATKReportColumn mine = this.get(name);
            final GATKReportColumn theirs = that.get(name);
            if (!mine.isSameFormat(theirs))
                return false;
        }
        return true;
    }
    return false;
}
/**
 * Compares the contents of every column in this set with the same-named column of another set. The number of
 * columns is not checked here; it is expected to have been verified with isSameFormat() beforehand.
 *
 * @param that the columns to compare against
 * @return true if every column equals its counterpart
 */
boolean equals(GATKReportColumns that) {
    for (Map.Entry<String, GATKReportColumn> entry : entrySet()) {
        final String name = entry.getKey();
        final GATKReportColumn mine = get(name);
        final GATKReportColumn theirs = that.get(name);
        if (!mine.equals(theirs))
            return false;
    }
    return true;
}
}

View File

@ -0,0 +1,235 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.report;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.Map;
/**
 * The gatherable data types acceptable in a GATK report column.
 *
 * Each constant carries a regular expression (or, for Null and Unknown, a plain name) that is matched against a
 * column's format string to decide which data type the column holds.
 */
public enum GATKReportDataType {
    /**
     * The null type should not be used.
     */
    Null("Null"),

    /**
     * The default value when a format string is not present
     */
    Unknown("Unknown"),

    /**
     * Used for boolean values. Will display as true or false in the table.
     */
    Boolean("%[Bb]"),

    /**
     * Used for char values. Will display as a char so use printable values!
     */
    Character("%[Cc]"),

    /**
     * Used for float and double values. Will output a decimal with format %.8f unless otherwise specified.
     */
    Decimal("%.*[EeFf]"),

    /**
     * Used for int, byte, short, and long values. Will display the full number by default.
     */
    Integer("%[Dd]"),

    /**
     * Used for string values. Displays the string itself.
     */
    String("%[Ss]");

    /** The pattern a format string must fully match to be recognized as this data type. */
    private final String dataTypeString;

    private GATKReportDataType(String dataTypeString) {
        this.dataTypeString = dataTypeString;
    }

    @Override
    public String toString() {
        return this.dataTypeString;
    }

    /**
     * Returns a GATK report data type from the Object specified. It looks through the list of acceptable classes and
     * returns the appropriate data type.
     *
     * @param object the object to derive the data type from
     * @return the appropriate data type, or Unknown if the object's class is not one of the recognized ones
     */
    public static GATKReportDataType fromObject(Object object) {
        GATKReportDataType value;
        if (object instanceof Boolean) {
            value = GATKReportDataType.Boolean;
        } else if (object instanceof Character) {
            value = GATKReportDataType.Character;
        } else if (object instanceof Float ||
                object instanceof Double) {
            value = GATKReportDataType.Decimal;
        } else if (object instanceof Integer ||
                object instanceof Long ||
                object instanceof Short ||
                object instanceof Byte) {
            value = GATKReportDataType.Integer;
        } else if (object instanceof String) {
            value = GATKReportDataType.String;
        } else {
            // Unrecognized classes (including null) are reported as Unknown rather than rejected
            value = GATKReportDataType.Unknown;
        }
        return value;
    }

    /**
     * Returns a GATK report data type from the format string specified. It uses regex matching from the enumerated
     * Strings.
     *
     * @param format the format string to derive the data type from; null and "" both yield Unknown
     * @return the appropriate data type, or Unknown if no pattern matches
     */
    public static GATKReportDataType fromFormatString(String format) {
        if (format == null || format.equals(""))
            return Unknown;

        // Iterate in declaration order so the result never depends on hash ordering
        for (GATKReportDataType type : values()) {
            if (format.matches(type.toString()))
                return type;
        }
        return Unknown;
    }

    /**
     * Returns the default value of the data type. It returns an object that matches the class of the data type.
     *
     * @return an object that matches the data type, or null for the Null and Unknown types
     */
    public Object getDefaultValue() {
        switch (this) {
            case Decimal:
                return 0.0D;
            case Boolean:
                return false;
            case Character:
                return '0';
            case Integer:
                return 0L;
            case String:
                return "";
            default:
                return null;
        }
    }

    /**
     * Checks if the two objects are equal using the appropriate test for the data type.
     *
     * For the Null type every pair is considered equal. For all other types a null value is only equal to another
     * null value. Decimal, Boolean and Integer values are compared through their string representations, so values
     * of different numeric classes that print identically are equal.
     *
     * @param a an object (may be null)
     * @param b another object to check if equal (may be null)
     * @return true - the objects are equal, false - the objects are not equal
     */
    public boolean isEqual(Object a, Object b) {
        if (this == Null)
            return true;

        // Unset cells: avoid dereferencing null below
        if (a == null || b == null)
            return a == b;

        switch (this) {
            case Decimal:
            case Boolean:
            case Integer:
                return a.toString().equals(b.toString());
            case Character:
            case String:
            default:
                return a.equals(b);
        }
    }

    /**
     * Converts an input String to the appropriate type using the data type. Used when loading a GATK report from
     * file.
     *
     * Decimal and Integer inputs that are not parseable numbers raise NumberFormatException; an empty string for
     * the Character type raises ArrayIndexOutOfBoundsException.
     *
     * @param obj The input string
     * @return an object that matches the data type, or null if the input is not a String
     */
    Object Parse(Object obj) {
        if (obj instanceof String) {
            String str = obj.toString();
            switch (this) {
                case Decimal:
                    return Double.parseDouble(str);
                case Boolean:
                    // fully qualified because the enum constant named Boolean obscures the class name here
                    return java.lang.Boolean.parseBoolean(str);
                case Integer:
                    return Long.parseLong(str);
                case String:
                    return str;
                case Character:
                    return str.toCharArray()[0];
                default:
                    return str;
            }
        } else
            return null;
    }

    /**
     * Returns the default printf format string for the data type.
     *
     * @return The printf format string used for values of this data type.
     */
    public String getDefaultFormatString() {
        switch (this) {
            case Decimal:
                return "%.8f";
            case Boolean:
                return "%b";
            case Integer:
                return "%d";
            case String:
                return "%s";
            case Character:
                return "%c";
            case Null:
            default:
                return "%s";
        }
    }
}

View File

@ -0,0 +1,63 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.report;
import org.broadinstitute.sting.commandline.Gatherer;
import org.broadinstitute.sting.utils.exceptions.UserException;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.PrintStream;
import java.util.List;
/**
 * Gatherer that combines several GATK report files into a single report file.
 */
public class GATKReportGatherer extends Gatherer {
    /**
     * Reads every input file as a GATKReport, combines them in order and prints the result to the output file.
     *
     * @param inputs the report files to combine, in the order they should be combined
     * @param output the file the combined report is written to
     * @throws UserException if the output file cannot be opened for writing
     */
    @Override
    public void gather(List<File> inputs, File output) {
        final PrintStream o;
        try {
            o = new PrintStream(output);
        } catch (FileNotFoundException e) {
            throw new UserException("File to be output by GATKReport Gather function was not found: " + output);
        }

        // Always release the output stream, even when reading or combining an input fails
        try {
            GATKReport current = new GATKReport();
            boolean isFirst = true;
            for (File input : inputs) {
                if (isFirst) {
                    // The first input becomes the report that all later inputs are combined into
                    current = new GATKReport(input);
                    isFirst = false;
                } else {
                    GATKReport toAdd = new GATKReport(input);
                    current.combineWith(toAdd);
                }
            }

            current.print(o);
        } finally {
            o.close();
        }
    }
}

View File

@ -1,103 +1,52 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.report;
import org.apache.commons.lang.ObjectUtils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.text.TextFormattingUtils;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.PrintStream;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* A data structure that allows data to be collected over the course of a walker's computation, then have that data
* written to a PrintStream such that it's human-readable, AWK-able, and R-friendly (given that you load it using the
* GATKReport loader module).
*
* The goal of this object is to use the same data structure for both accumulating data during a walker's computation
* and emitting that data to a file for easy analysis in R (or any other program/language that can take in a table of
* results). Thus, all of the infrastructure below is designed simply to make printing the following as easy as
* possible:
*
* ##:GATKReport.v0.1 ErrorRatePerCycle : The error rate per sequenced position in the reads
* cycle errorrate.61PA8.7 qualavg.61PA8.7
* 0 0.007451835696110506 25.474613284804366
* 1 0.002362777171937477 29.844949954504095
* 2 9.087604507451836E-4 32.87590975254731
* 3 5.452562704471102E-4 34.498999090081895
* 4 9.087604507451836E-4 35.14831665150137
* 5 5.452562704471102E-4 36.07223435225619
* 6 5.452562704471102E-4 36.1217248908297
* 7 5.452562704471102E-4 36.1910480349345
* 8 5.452562704471102E-4 36.00345705967977
*
* Here, we have a GATKReport table - a well-formatted, easy to read representation of some tabular data. Every single
* table has this same GATKReport.v0.1 header, which permits multiple files from different sources to be cat-ed
* together, which makes it very easy to pull tables from different programs into R via a single file.
*
* ------------
* Definitions:
*
* Table info:
* The first line, structured as
* ##:<report version> <table name> : <table description>
*
* Table header:
* The second line, specifying a unique name for each column in the table.
*
* The first column mentioned in the table header is the "primary key" column - a column that provides the unique
* identifier for each row in the table. Once this column is created, any element in the table can be referenced by
* the row-column coordinate, i.e. "primary key"-"column name" coordinate.
*
* When a column is added to a table, a default value must be specified (usually 0). This is the initial value for
* an element in a column. This permits operations like increment() and decrement() to work properly on columns that
* are effectively counters for a particular event.
*
* Finally, the display property for each column can be set during column creation. This is useful when a given
* column stores an intermediate result that will be used later on, perhaps to calculate the value of another column.
* In these cases, it's obviously necessary to store the value required for further computation, but it's not
* necessary to actually print the intermediate column.
*
* Table body:
* The values of the table itself.
*
* ---------------
* Implementation:
*
* The implementation of this table has two components:
* 1. A TreeSet<Object> that stores all the values ever specified for the primary key. Any get() operation that
* refers to an element where the primary key object does not exist will result in its implicit creation. I
* haven't yet decided if this is a good idea...
*
* 2. A HashMap<String, GATKReportColumn> that stores a mapping from column name to column contents. Each
* GATKReportColumn is effectively a map (in fact, GATKReportColumn extends TreeMap<Object, Object>) between
* primary key and the column value. This means that, given N columns, the primary key information is stored
* N+1 times. This is obviously wasteful and can likely be handled much more elegantly in future implementations.
*
* ------------------------------
* Element and column operations:
*
* In addition to simply getting and setting values, this object also permits some simple operations to be applied to
* individual elements or to whole columns. For instance, an element can be easily incremented without the hassle of
* calling get(), incrementing the obtained value by 1, and then calling set() with the new value. Also, some vector
* operations are supported. For instance, two whole columns can be divided and have the result be set to a third
* column. This is especially useful when aggregating counts in two intermediate columns that will eventually need to
* be manipulated row-by-row to compute the final column.
*
* Note: I've made no attempt whatsoever to make these operations efficient. Right now, some of the methods check the
* type of the stored object using an instanceof call and attempt to do the right thing. Others cast the contents of
* the cell to a Number, call the Number.toDouble() method and compute a result. This is clearly not the ideal design,
* but at least the prototype contained herein works.
*
* @author Kiran Garimella
* @author Khalid Shakir
*/
public class GATKReportTable {
/** REGEX that matches any table with an invalid name */
public final static String INVALID_TABLE_NAME_REGEX = "[^a-zA-Z0-9_\\-\\.]";
private static final GATKReportVersion LATEST_REPORT_VERSION = GATKReportVersion.V0_2;
/**
* REGEX that matches any table with an invalid name
*/
public static final String INVALID_TABLE_NAME_REGEX = "[^a-zA-Z0-9_\\-\\.]";
private static final String GATKTABLE_HEADER_PREFIX = "#:GATKTable";
private static final String SEPARATOR = ":";
private static final String ENDLINE = ":;";
private String tableName;
private String tableDescription;
private GATKReportVersion version = LATEST_REPORT_VERSION;
private String primaryKeyName;
private Collection<Object> primaryKeyColumn;
@ -106,11 +55,118 @@ public class GATKReportTable {
private GATKReportColumns columns;
private static final String COULD_NOT_READ_HEADER = "Could not read the header of this file -- ";
private static final String COULD_NOT_READ_COLUMN_NAMES = "Could not read the column names of this file -- ";
private static final String COULD_NOT_READ_DATA_LINE = "Could not read a data line of this table -- ";
private static final String COULD_NOT_READ_EMPTY_LINE = "Could not read the last empty line of this table -- ";
private static final String OLD_GATK_TABLE_VERSION = "We no longer support older versions of the GATK Tables";
private static final String NUMBER_CONVERSION_EXCEPTION = "String is a number but is not a long or a double: ";
/**
 * Construct a GATK report table by parsing it from a reader.
 *
 * For version V1_0 the reader is consumed in this order: two header lines, one line of column names, the number
 * of data lines announced in the first header, and one trailing line. Any other version is rejected.
 *
 * The first header line is split on ':' and read positionally: field 2 is the primary-key display flag, field 3
 * the number of columns, field 4 the number of rows, and fields 5 onwards the format string of each column. The
 * second header line is split the same way: field 2 is the table name and field 3, when present, the description.
 *
 * @param reader  the reader positioned at the first header line of the table
 * @param version the report version that decides how the table is parsed
 * @throws ReviewedStingException if a line cannot be read or the version is not supported
 */
public GATKReportTable(BufferedReader reader, GATKReportVersion version) {
// Row key used when the primary key is not displayed; set to 1 below in that case
int counter = 0;
switch (version) {
case V1_0:
int nHeaders = 2;
String[] tableHeaders = new String[nHeaders];
// Read in the headers
// NOTE(review): readLine() returns null at end of stream; a truncated file would fail on split() below - confirm callers guarantee complete input
for (int i = 0; i < nHeaders; i++) {
try {
tableHeaders[i] = reader.readLine();
} catch (IOException e) {
throw new ReviewedStingException(COULD_NOT_READ_HEADER + e.getMessage());
}
}
String[] tableData = tableHeaders[0].split(":");
String[] userData = tableHeaders[1].split(":");
// Fill in the fields
tableName = userData[2];
tableDescription = (userData.length <= 3) ? "" : userData[3]; // table may have no description! (and that's okay)
primaryKeyDisplay = Boolean.parseBoolean(tableData[2]);
columns = new GATKReportColumns();
int nColumns = Integer.parseInt(tableData[3]);
int nRows = Integer.parseInt(tableData[4]);
// Read column names
String columnLine;
try {
columnLine = reader.readLine();
} catch (IOException e) {
// NOTE(review): unlike the other read failures, this message does not append e.getMessage()
throw new ReviewedStingException(COULD_NOT_READ_COLUMN_NAMES);
}
// The column-name line defines the fixed-width layout that every data line is split with
List<Integer> columnStarts = TextFormattingUtils.getWordStarts(columnLine);
String[] columnNames = TextFormattingUtils.splitFixedWidth(columnLine, columnStarts);
if (primaryKeyDisplay) {
// The first printed column is the primary key
addPrimaryKey(columnNames[0]);
} else {
// No primary key column in the file: use a hidden "id" key and number the rows starting at 1
sortByPrimaryKey = true;
addPrimaryKey("id", false);
counter = 1;
}
// Put in columns using the format string from the header
for (int i = 0; i < nColumns; i++) {
String format = tableData[5 + i];
// With a displayed primary key the data columns start one position later in columnNames
if (primaryKeyDisplay)
addColumn(columnNames[i + 1], true, format);
else
addColumn(columnNames[i], true, format);
}
for (int i = 0; i < nRows; i++) {
// read line
String dataLine;
try {
dataLine = reader.readLine();
} catch (IOException e) {
throw new ReviewedStingException(COULD_NOT_READ_DATA_LINE + e.getMessage());
}
List<String> lineSplits = Arrays.asList(TextFormattingUtils.splitFixedWidth(dataLine, columnStarts));
for (int columnIndex = 0; columnIndex < nColumns; columnIndex++) {
//Input all the remaining values
// Each cell is converted from text using the data type of its column
GATKReportDataType type = getColumns().getByIndex(columnIndex).getDataType();
if (primaryKeyDisplay) {
String columnName = columnNames[columnIndex + 1];
String primaryKey = lineSplits.get(0);
set(primaryKey, columnName, type.Parse(lineSplits.get(columnIndex + 1)));
} else {
String columnName = columnNames[columnIndex];
set(counter, columnName, type.Parse(lineSplits.get(columnIndex)));
}
}
counter++;
}
// Consume the line that follows the last data row; its content is discarded
try {
reader.readLine();
} catch (IOException e) {
throw new ReviewedStingException(COULD_NOT_READ_EMPTY_LINE + e.getMessage());
}
break;
default:
throw new ReviewedStingException(OLD_GATK_TABLE_VERSION);
}
}
/**
* Verifies that a table or column name has only alphanumeric characters - no spaces or special characters allowed
*
* @param name the name of the table or column
* @return true if the name is valid, false if otherwise
* @param name the name of the table or column
* @return true if the name is valid, false if otherwise
*/
private boolean isValidName(String name) {
Pattern p = Pattern.compile(INVALID_TABLE_NAME_REGEX);
@ -122,8 +178,8 @@ public class GATKReportTable {
/**
* Verifies that a table or column name has only alphanumeric characters - no spaces or special characters allowed
*
* @param description the name of the table or column
* @return true if the name is valid, false if otherwise
* @param description the name of the table or column
* @return true if the name is valid, false if otherwise
*/
private boolean isValidDescription(String description) {
Pattern p = Pattern.compile("\\r|\\n");
@ -135,15 +191,23 @@ public class GATKReportTable {
/**
* Construct a new GATK report table with the specified name and description
*
* @param tableName the name of the table
* @param tableDescription the description of the table
* @param tableName the name of the table
* @param tableDescription the description of the table
*/
public GATKReportTable(String tableName, String tableDescription) {
this(tableName, tableDescription, true);
}
/**
* Construct a new GATK report table with the specified name and description and whether to sort rows by the primary
* key
*
* @param tableName the name of the table
* @param tableDescription the description of the table
* @param sortByPrimaryKey whether to sort rows by the primary key (instead of order added)
*/
public GATKReportTable(String tableName, String tableDescription, boolean sortByPrimaryKey) {
if (!isValidName(tableName)) {
if (!isValidName(tableName)) {
throw new ReviewedStingException("Attempted to set a GATKReportTable name of '" + tableName + "'. GATKReportTable names must be purely alphanumeric - no spaces or special characters are allowed.");
}
@ -158,28 +222,21 @@ public class GATKReportTable {
columns = new GATKReportColumns();
}
public GATKReportVersion getVersion() {
return version;
}
protected void setVersion(GATKReportVersion version) {
this.version = version;
}
/**
* Add a primary key column. This becomes the unique identifier for every row in the table.
*
* @param primaryKeyName the name of the primary key column
* @param primaryKeyName the name of the primary key column
*/
public void addPrimaryKey(String primaryKeyName) {
addPrimaryKey(primaryKeyName, true);
}
/**
* Add an optionally visible primary key column. This becomes the unique identifier for every column in the table, and will always be printed as the first column.
* Add an optionally visible primary key column. This becomes the unique identifier for every column in the table,
* and will always be printed as the first column.
*
* @param primaryKeyName the name of the primary key column
* @param display should this primary key be displayed?
* @param primaryKeyName the name of the primary key column
* @param display should this primary key be displayed?
*/
public void addPrimaryKey(String primaryKeyName, boolean display) {
if (!isValidName(primaryKeyName)) {
@ -193,49 +250,40 @@ public class GATKReportTable {
}
/**
* Returns the first primary key matching the dotted column values.
* Ex: dbsnp.eval.called.all.novel.all
* @param dottedColumnValues Period concatenated values.
* Returns the first primary key matching the column values.
* Ex: "CountVariants", "dbsnp", "eval", "called", "all", "novel", "all"
* @param columnValues column values.
* @return The first primary key matching the column values or throws an exception.
*/
public Object getPrimaryKey(String dottedColumnValues) {
Object key = findPrimaryKey(dottedColumnValues);
public Object getPrimaryKeyByData(Object... columnValues) {
Object key = findPrimaryKeyByData(columnValues);
if (key == null)
throw new ReviewedStingException("Attempted to get non-existent GATKReportTable key for values: " + dottedColumnValues);
throw new ReviewedStingException("Attempted to get non-existent GATKReportTable key for values: " + Arrays.asList(columnValues));
return key;
}
/**
* Returns true if there is at least on row with the dotted column values.
* Ex: dbsnp.eval.called.all.novel.all
* @param dottedColumnValues Period concatenated values.
* @return true if there is at least one row matching the columns.
*/
public boolean containsPrimaryKey(String dottedColumnValues) {
return findPrimaryKey(dottedColumnValues) != null;
}
/**
* Returns the first primary key matching the dotted column values.
* Ex: dbsnp.eval.called.all.novel.all
* @param dottedColumnValues Period concatenated values.
* @return The first primary key matching the column values or null.
*/
private Object findPrimaryKey(String dottedColumnValues) {
return findPrimaryKey(dottedColumnValues.split("\\."));
}
/**
* Returns the first primary key matching the column values.
* Ex: new String[] { "dbsnp", "eval", "called", "all", "novel", "all" }
* Ex: "CountVariants", "dbsnp", "eval", "called", "all", "novel", "all"
*
* @param columnValues column values.
* @return The first primary key matching the column values.
* @return The first primary key matching the column values or null if the key does not exist.
*/
private Object findPrimaryKey(Object[] columnValues) {
public Object findPrimaryKeyByData(Object... columnValues) {
if (columnValues == null)
throw new NullPointerException("Column values is null");
if (columnValues.length == 0)
throw new IllegalArgumentException("Column values is empty");
int columnCount = columns.size();
for (Object primaryKey : primaryKeyColumn) {
boolean matching = true;
for (int i = 0; matching && i < columnValues.length; i++) {
matching = ObjectUtils.equals(columnValues[i], get(primaryKey, i+1));
// i --> index into columnValues parameter
// j --> index into columns collection
for (int i = 0, j = 0; matching && i < columnValues.length && j < columnCount; j++) {
if (!columns.getByIndex(j).isDisplayable())
continue;
matching = ObjectUtils.equals(columnValues[i], get(primaryKey, i));
i++;
}
if (matching)
return primaryKey;
@ -244,29 +292,65 @@ public class GATKReportTable {
}
/**
* Add a column to the report and specify the default value that should be supplied if a given position in the table is never explicitly set.
* Add a column to the report and specify the default value that should be supplied if a given position in the table
* is never explicitly set.
*
* @param columnName the name of the column
* @param defaultValue the default value for the column
* @param columnName the name of the column
* @param defaultValue the default value for the column
*/
public void addColumn(String columnName, Object defaultValue) {
addColumn(columnName, defaultValue, null);
addColumn(columnName, defaultValue, true);
}
/**
* Add a column to the report, specify the default column value, and specify whether the column should be displayed
* in the final output (useful when intermediate columns are necessary for later calculations, but are not required
* to be in the output file).
*
* @param columnName the name of the column
* @param defaultValue the default value of the column
* @param display if true - the column will be displayed; if false - the column will be hidden
*/
public void addColumn(String columnName, Object defaultValue, boolean display) {
addColumn(columnName, defaultValue, display, "");
}
/**
* Add a displayed column to the report, specifying the default column value and the format string used to
* display the column's data. Delegates to the four-argument overload with display set to true.
*
* @param columnName the name of the column
* @param defaultValue the default value of the column
* @param format the format string used to display data
*/
public void addColumn(String columnName, Object defaultValue, String format) {
addColumn(columnName, defaultValue, true, format);
}
/**
* Add a column to the report, specify the default column value, and specify whether the column should be displayed in the final output (useful when intermediate columns are necessary for later calculations, but are not required to be in the output file.
* Add a column to the report, specify whether the column should be displayed in the final output (useful when
* intermediate columns are necessary for later calculations, but are not required to be in the output file), and the
* format string used to display the data.
*
* @param columnName the name of the column
* @param defaultValue the default value of the column
* @param display if true - the column will be displayed; if false - the column will be hidden
* @param columnName the name of the column
* @param display if true - the column will be displayed; if false - the column will be hidden
* @param format the format string used to display data
*/
public void addColumn(String columnName, Object defaultValue, boolean display) {
addColumn(columnName, defaultValue, display, null);
public void addColumn(String columnName, boolean display, String format) {
addColumn(columnName, null, display, format);
}
/**
* Add a column to the report, specify the default column value, whether the column should be displayed in the final
* output (useful when intermediate columns are necessary for later calculations, but are not required to be in the
* output file), and the format string used to display the data.
*
* @param columnName the name of the column
* @param defaultValue the default value of the column
* @param display if true - the column will be displayed; if false - the column will be hidden
* @param format the format string used to display data
*/
public void addColumn(String columnName, Object defaultValue, boolean display, String format) {
if (!isValidName(columnName)) {
throw new ReviewedStingException("Attempted to set a GATKReportTable column name of '" + columnName + "'. GATKReportTable column names must be purely alphanumeric - no spaces or special characters are allowed.");
@ -277,8 +361,8 @@ public class GATKReportTable {
/**
* Check if the requested element exists, and if not, create it.
*
* @param primaryKey the primary key value
* @param columnName the name of the column
* @param primaryKey the primary key value
* @param columnName the name of the column
*/
private void verifyEntry(Object primaryKey, String columnName) {
if (!columns.containsKey(columnName)) {
@ -303,26 +387,67 @@ public class GATKReportTable {
/**
* Set the value for a given position in the table
*
* @param primaryKey the primary key value
* @param columnName the name of the column
* @param value the value to set
* @param primaryKey the primary key value
* @param columnName the name of the column
* @param value the value to set
*/
public void set(Object primaryKey, String columnName, Object value) {
verifyEntry(primaryKey, columnName);
GATKReportColumn column = columns.get(columnName);
//todo -- Check if value is of same type as column
columns.get(columnName).put(primaryKey, value);
// We do not accept internal null values
if (value == null)
value = "null";
// This code below is bs. Why am do I have to conform to bad code
// Below is some code to convert a string into its appropriate type.
// I second Roger's rant!
// If we got a string but the column is not a String type
Object newValue = null;
if (value instanceof String && !column.getDataType().equals(GATKReportDataType.String)) {
// Integer case
if (column.getDataType().equals(GATKReportDataType.Integer)) {
try {
newValue = Long.parseLong((String) value);
} catch (Exception e) {
/** do nothing */
}
}
if (column.getDataType().equals(GATKReportDataType.Decimal)) {
try {
newValue = Double.parseDouble((String) value);
} catch (Exception e) {
/** do nothing */
}
}
if (column.getDataType().equals(GATKReportDataType.Character) && ((String) value).length() == 1) {
newValue = ((String) value).charAt(0);
}
}
if (newValue != null)
value = newValue;
// todo -- Types have to be more flexible. For example, %d should accept Integers, Shorts and Bytes.
if (column.getDataType().equals(GATKReportDataType.fromObject(value)) || column.getDataType().equals(GATKReportDataType.Unknown) )
columns.get(columnName).put(primaryKey, value);
else
throw new ReviewedStingException(String.format("Tried to add an object of type: %s to a column of type: %s", GATKReportDataType.fromObject(value).name(), column.getDataType().name()));
}
/**
* Get a value from the given position in the table
*
* @param primaryKey the primary key value
* @param columnName the name of the column
* @return the value stored at the specified position in the table
* @param primaryKey the primary key value
* @param columnName the name of the column
* @return the value stored at the specified position in the table
*/
public Object get(Object primaryKey, String columnName) {
verifyEntry(primaryKey, columnName);
return columns.get(columnName).get(primaryKey);
}
@ -331,7 +456,7 @@ public class GATKReportTable {
*
* @param primaryKey the primary key value
* @param columnIndex the index of the column
* @return the value stored at the specified position in the table
* @return the value stored at the specified position in the table
*/
private Object get(Object primaryKey, int columnIndex) {
return columns.getByIndex(columnIndex).get(primaryKey);
@ -340,8 +465,8 @@ public class GATKReportTable {
/**
* Increment an element in the table. This implementation is awful - a functor would probably be better.
*
* @param primaryKey the primary key value
* @param columnName the name of the column
* @param primaryKey the primary key value
* @param columnName the name of the column
*/
public void increment(Object primaryKey, String columnName) {
Object oldValue = get(primaryKey, columnName);
@ -369,8 +494,8 @@ public class GATKReportTable {
/**
* Decrement an element in the table. This implementation is awful - a functor would probably be better.
*
* @param primaryKey the primary key value
* @param columnName the name of the column
* @param primaryKey the primary key value
* @param columnName the name of the column
*/
public void decrement(Object primaryKey, String columnName) {
Object oldValue = get(primaryKey, columnName);
@ -398,9 +523,9 @@ public class GATKReportTable {
/**
* Add the specified value to an element in the table
*
* @param primaryKey the primary key value
* @param columnName the name of the column
* @param valueToAdd the value to add
* @param primaryKey the primary key value
* @param columnName the name of the column
* @param valueToAdd the value to add
*/
public void add(Object primaryKey, String columnName, Object valueToAdd) {
Object oldValue = get(primaryKey, columnName);
@ -428,8 +553,8 @@ public class GATKReportTable {
/**
* Subtract the specified value from an element in the table
*
* @param primaryKey the primary key value
* @param columnName the name of the column
* @param primaryKey the primary key value
* @param columnName the name of the column
* @param valueToSubtract the value to subtract
*/
public void subtract(Object primaryKey, String columnName, Object valueToSubtract) {
@ -458,9 +583,9 @@ public class GATKReportTable {
/**
* Multiply the specified value to an element in the table
*
* @param primaryKey the primary key value
* @param columnName the name of the column
* @param valueToMultiply the value to multiply by
* @param primaryKey the primary key value
* @param columnName the name of the column
* @param valueToMultiply the value to multiply by
*/
public void multiply(Object primaryKey, String columnName, Object valueToMultiply) {
Object oldValue = get(primaryKey, columnName);
@ -488,9 +613,9 @@ public class GATKReportTable {
/**
* Divide the specified value from an element in the table
*
* @param primaryKey the primary key value
* @param columnName the name of the column
* @param valueToDivide the value to divide by
* @param primaryKey the primary key value
* @param columnName the name of the column
* @param valueToDivide the value to divide by
*/
public void divide(Object primaryKey, String columnName, Object valueToDivide) {
Object oldValue = get(primaryKey, columnName);
@ -518,9 +643,9 @@ public class GATKReportTable {
/**
* Add two columns to each other and set the results to a third column
*
* @param columnToSet the column that should hold the results
* @param augend the column that shall be the augend
* @param addend the column that shall be the addend
* @param columnToSet the column that should hold the results
* @param augend the column that shall be the augend
* @param addend the column that shall be the addend
*/
public void addColumns(String columnToSet, String augend, String addend) {
for (Object primaryKey : primaryKeyColumn) {
@ -536,8 +661,8 @@ public class GATKReportTable {
/**
* Subtract one column from another and set the results to a third column
*
* @param columnToSet the column that should hold the results
* @param minuend the column that shall be the minuend (the a in a - b)
* @param columnToSet the column that should hold the results
* @param minuend the column that shall be the minuend (the a in a - b)
* @param subtrahend the column that shall be the subtrahend (the b in a - b)
*/
public void subtractColumns(String columnToSet, String minuend, String subtrahend) {
@ -555,8 +680,8 @@ public class GATKReportTable {
* Multiply two columns by each other and set the results to a third column
*
* @param columnToSet the column that should hold the results
* @param multiplier the column that shall be the multiplier
* @param multiplicand the column that shall be the multiplicand
* @param multiplier the column that shall be the multiplier
* @param multiplicand the column that shall be the multiplicand
*/
public void multiplyColumns(String columnToSet, String multiplier, String multiplicand) {
for (Object primaryKey : primaryKeyColumn) {
@ -572,9 +697,9 @@ public class GATKReportTable {
/**
* Divide two columns by each other and set the results to a third column
*
* @param columnToSet the column that should hold the results
* @param numeratorColumn the column that shall be the numerator
* @param denominatorColumn the column that shall be the denominator
* @param columnToSet the column that should hold the results
* @param numeratorColumn the column that shall be the numerator
* @param denominatorColumn the column that shall be the denominator
*/
public void divideColumns(String columnToSet, String numeratorColumn, String denominatorColumn) {
for (Object primaryKey : primaryKeyColumn) {
@ -589,10 +714,11 @@ public class GATKReportTable {
/**
* Return the print width of the primary key column
* @return the width of the primary key column
*
* @return the width of the primary key column
*/
public int getPrimaryKeyColumnWidth() {
int maxWidth = primaryKeyName.length();
int getPrimaryKeyColumnWidth() {
int maxWidth = getPrimaryKeyName().length();
for (Object primaryKey : primaryKeyColumn) {
int width = primaryKey.toString().length();
@ -608,30 +734,47 @@ public class GATKReportTable {
/**
* Write the table to the PrintStream, formatted nicely to be human-readable, AWK-able, and R-friendly.
*
* @param out the PrintStream to which the table should be written
* @param out the PrintStream to which the table should be written
*/
public void write(PrintStream out) {
void write(PrintStream out) {
/*
* Table header:
* #:GATKTable:nColumns:nRows:(DataType for each column):;
* #:GATKTable:TableName:Description :;
* key colA colB
* row1 xxxx xxxxx
*/
// Get the column widths for everything
HashMap<String, GATKReportColumnFormat> columnFormats = new HashMap<String, GATKReportColumnFormat>();
for (String columnName : columns.keySet()) {
columnFormats.put(columnName, columns.get(columnName).getColumnFormat());
}
String primaryKeyFormat = "%-" + getPrimaryKeyColumnWidth() + "s";
// Emit the table definition
out.printf("##:GATKReport.%s %s : %s%n", LATEST_REPORT_VERSION.versionString, tableName, tableDescription);
String formatHeader = String.format(GATKTABLE_HEADER_PREFIX + ":%b:%d:%d", primaryKeyDisplay, getColumns().size(), getNumRows());
// Add all the formats for all the columns
for (GATKReportColumn column : getColumns()) {
if (column.isDisplayable())
formatHeader += (SEPARATOR + column.getFormat());
}
out.println(formatHeader + ENDLINE);
out.printf(GATKTABLE_HEADER_PREFIX + ":%s:%s\n", tableName, tableDescription);
//out.printf("#:GATKTable:%s:%s", Algorithm);
// Emit the table header, taking into account the padding requirement if the primary key is a hidden column
boolean needsPadding = false;
if (primaryKeyDisplay) {
out.printf(primaryKeyFormat, primaryKeyName);
out.printf(primaryKeyFormat, getPrimaryKeyName());
needsPadding = true;
}
for (String columnName : columns.keySet()) {
if (columns.get(columnName).isDisplayable()) {
if (needsPadding) { out.printf(" "); }
out.printf(columnFormats.get(columnName).getNameFormat(), columnName);
if (needsPadding) {
out.printf(" ");
}
out.printf(columns.get(columnName).getColumnFormat().getNameFormat(), columnName);
needsPadding = true;
}
@ -640,28 +783,31 @@ public class GATKReportTable {
out.printf("%n");
// Emit the table body
for (Object primaryKey : primaryKeyColumn) {
for (final Object primaryKey : primaryKeyColumn) {
needsPadding = false;
if (primaryKeyDisplay) {
out.printf(primaryKeyFormat, primaryKey);
needsPadding = true;
}
for (String columnName : columns.keySet()) {
if (columns.get(columnName).isDisplayable()) {
if (needsPadding) { out.printf(" "); }
String value = columns.get(columnName).getStringValue(primaryKey);
out.printf(columnFormats.get(columnName).getValueFormat(), value);
for (final Map.Entry<String, GATKReportColumn> entry : columns.entrySet()) {
final GATKReportColumn column = entry.getValue();
if (column.isDisplayable()) {
if (needsPadding) {
out.print(" ");
}
final String value = column.getStringValue(primaryKey);
out.printf(column.getColumnFormat().getValueFormat(), value);
needsPadding = true;
}
}
out.printf("%n");
out.println();
}
// Close the table
out.printf("%n");
out.println();
}
public int getNumRows() {
@ -679,4 +825,90 @@ public class GATKReportTable {
public GATKReportColumns getColumns() {
return columns;
}
/**
 * Merges another GATK report table into this one. The two tables are considered compatible when they have
 * the same set of column names and the same primary key name; compatible tables are merged by copying the
 * rows of the argument into this table.
 *
 * @param input the table to merge into this one
 * @throws ReviewedStingException if the column names or the primary key name of the two tables differ
 */
void combineWith(GATKReportTable input) {
    /*
     * Kept separate from addRowsFrom because other combining strategies (sum, average, etc.) are planned.
     * TODO: Add other combining algorithms
     */

    // Both the column names AND the primary key name have to agree
    final boolean compatible = input.getColumns().keySet().equals(this.getColumns().keySet())
            && input.getPrimaryKeyName().equals(this.getPrimaryKeyName());

    if (!compatible) {
        throw new ReviewedStingException("Failed to combine GATKReportTable, columns don't match!");
    }

    this.addRowsFrom(input);
}
/**
 * The default gather algorithm: copies every value of the given table into this table, one column at a time.
 * When a row key is already present in the matching column of this table, the value is overwritten and a
 * notice is printed to standard output.
 *
 * @param input the table whose rows are copied into this one
 */
private void addRowsFrom(GATKReportTable input) {
    for (String columnName : input.getColumns().keySet()) {
        GATKReportColumn target = this.getColumns().get(columnName);
        GATKReportColumn source = input.getColumns().get(columnName);

        // A column maps each row's primary key to that row's value
        for (Object primaryKey : source.keySet()) {
            // Has to be checked before set() runs, because set() creates missing entries
            final boolean replacesExisting = target.containsKey(primaryKey);

            this.set(primaryKey, columnName, source.get(primaryKey));

            if (replacesExisting) {
                System.out.printf("OVERWRITING Row with PK: %s \n", primaryKey);
            }
        }
    }
}
/**
* Returns the name of this table's primary key column.
*
* @return the primary key column name
*/
public String getPrimaryKeyName() {
return primaryKeyName;
}
/**
 * Tells whether this table and the given table share the same format: matching column formats, the same
 * primary key name and visibility, and the same table name and description. The data stored in the tables
 * is not compared. Two tables with the same format can be gathered (reduced) together.
 *
 * @param table another GATK table
 * @return true if the tables are gatherable
 */
public boolean isSameFormat(GATKReportTable table) {
    // Open question: should sortByPrimaryKey be part of this check as well?
    if (!columns.isSameFormat(table.columns)) {
        return false;
    }
    if (primaryKeyDisplay != table.primaryKeyDisplay) {
        return false;
    }
    if (!primaryKeyName.equals(table.primaryKeyName)) {
        return false;
    }
    if (!tableName.equals(table.tableName)) {
        return false;
    }
    return tableDescription.equals(table.tableDescription);
}
/**
 * Checks that two tables are exactly the same: same format (see isSameFormat), equal columns, equal primary
 * key column, and the same sortByPrimaryKey setting.
 *
 * Note that the parameter type is GATKReportTable, so this method overloads rather than overrides
 * Object.equals(Object).
 *
 * @param table another GATK report table
 * @return true if all fields of the two tables and their columns are equal
 */
public boolean equals(GATKReportTable table) {
    if (!isSameFormat(table)) {
        return false;
    }
    if (!columns.equals(table.columns)) {
        return false;
    }
    if (!primaryKeyColumn.equals(table.primaryKeyColumn)) {
        return false;
    }
    return sortByPrimaryKey == table.sortByPrimaryKey;
}
}

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2011, The Broad Institute
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
@ -31,7 +31,7 @@ public enum GATKReportVersion {
* Differences between other versions:
* - Does not allow spaces in cells.
* - Mostly fixed width but has a bug where the string width of floating point
* values was not measured correctly leading to columns that aren't aligned
* values was not measured correctly leading to columns that aren't aligned
*/
V0_1("v0.1"),
@ -40,9 +40,17 @@ public enum GATKReportVersion {
* - Spaces allowed in cells, for example in sample names with spaces in them ex: "C507/FG-CR 6".
* - Fixed width fixed for floating point values
*/
V0_2("v0.2");
V0_2("v0.2"),
public final String versionString;
/*
* Differences between v0.x
* - Added table and report headers
* - Headers changed format, include the number of tables, rows, and metadata for gathering
* - IS GATHERABLE
*/
V1_0("v1.0");
private final String versionString;
private GATKReportVersion(String versionString) {
this.versionString = versionString;
@ -53,8 +61,13 @@ public enum GATKReportVersion {
return versionString;
}
/**
 * Compares this report version with another one by version string.
 *
 * @param that the version to compare against; may be null
 * @return true if that is non-null and has the same version string as this version, false otherwise
 */
public boolean equals(GATKReportVersion that) {
    // A null argument compares as "not equal" instead of raising a NullPointerException
    return that != null && versionString.equals(that.versionString);
}
/**
* Returns the GATK Report Version from the file header.
*
* @param header Header from the file starting with ##:GATKReport.v[version]
* @return The version as an enum.
*/
@ -65,6 +78,9 @@ public enum GATKReportVersion {
if (header.startsWith("##:GATKReport.v0.2 "))
return GATKReportVersion.V0_2;
if (header.startsWith("#:GATKReport.v1.0"))
return GATKReportVersion.V1_0;
throw new ReviewedStingException("Unknown GATK report version in header: " + header);
}
}

View File

@ -235,4 +235,14 @@ public class SampleDB {
}
return children;
}
/**
 * Collects the IDs of all founder samples, i.e. the samples returned by getSamples() whose parent
 * collection is empty.
 *
 * @return a new set holding the ID of every sample without parents
 */
public Set<String> getFounderIds() {
    final Set<String> founderIds = new HashSet<String>();
    for (final Sample candidate : getSamples()) {
        // Anything with at least one parent is not a founder
        if (candidate.getParents().size() >= 1) {
            continue;
        }
        founderIds.add(candidate.getID());
    }
    return founderIds;
}
}

View File

@ -10,6 +10,7 @@ import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
import org.broadinstitute.sting.utils.activeregion.ActivityProfile;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
@ -42,38 +43,32 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
logger.debug(String.format("TraverseActiveRegion.traverse: Shard is %s", dataProvider));
final LocusView locusView = getLocusView( walker, dataProvider );
final GenomeLocSortedSet initialIntervals = engine.getIntervals(); // BUGBUG: unfortunate inefficiency that needs to be removed
final GenomeLocSortedSet initialIntervals = engine.getIntervals();
final LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider );
final int activeRegionExtension = walker.getClass().getAnnotation(ActiveRegionExtension.class).extension();
final int maxRegionSize = walker.getClass().getAnnotation(ActiveRegionExtension.class).maxRegion();
if ( locusView.hasNext() ) { // trivial optimization to avoid unnecessary processing when there's nothing here at all
int minStart = Integer.MAX_VALUE;
final ArrayList<Double> isActiveList = new ArrayList<Double>();
GenomeLoc firstIsActiveStart = null;
ActivityProfile profile = new ActivityProfile(engine.getGenomeLocParser(), walker.hasPresetActiveRegions() );
//ReferenceOrderedView referenceOrderedDataView = new ReferenceOrderedView( dataProvider );
ReferenceOrderedView referenceOrderedDataView = null;
if ( WalkerManager.getWalkerDataSource(walker) != DataSource.REFERENCE_ORDERED_DATA )
referenceOrderedDataView = new ManagingReferenceOrderedView( dataProvider );
else
referenceOrderedDataView = (RodLocusView)locusView;
ReferenceOrderedView referenceOrderedDataView = getReferenceOrderedView(walker, dataProvider, locusView);
// We keep processing while the next reference location is within the interval
GenomeLoc prevLoc = null;
while( locusView.hasNext() ) {
final AlignmentContext locus = locusView.next();
GenomeLoc location = locus.getLocation();
if(prevLoc != null) {
for(int iii = prevLoc.getStart() + 1; iii < location.getStart(); iii++ ) {
// fill in the active / inactive labels from the stop of the previous location to the start of this location
// TODO refactor to separate function
for(int iii = prevLoc.getStop() + 1; iii < location.getStart(); iii++ ) {
final GenomeLoc fakeLoc = engine.getGenomeLocParser().createGenomeLoc(prevLoc.getContig(), iii, iii);
if( initialIntervals == null || initialIntervals.overlaps( fakeLoc ) ) {
final double isActiveProb = ( walker.presetActiveRegions == null ? 0.0 : ( walker.presetActiveRegions.overlaps(fakeLoc) ? 1.0 : 0.0 ) );
isActiveList.add( isActiveProb );
if( firstIsActiveStart == null ) {
firstIsActiveStart = fakeLoc;
}
final double isActiveProb = ( walker.hasPresetActiveRegions() && walker.presetActiveRegions.overlaps(fakeLoc) ? 1.0 : 0.0 );
profile.add(fakeLoc, isActiveProb);
}
}
}
@ -89,12 +84,8 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
// Call the walkers isActive function for this locus and add them to the list to be integrated later
if( initialIntervals == null || initialIntervals.overlaps( location ) ) {
final double isActiveProb = ( walker.presetActiveRegions == null ? walker.isActive( tracker, refContext, locus )
: ( walker.presetActiveRegions.overlaps(location) ? 1.0 : 0.0 ) );
isActiveList.add( isActiveProb );
if( firstIsActiveStart == null ) {
firstIsActiveStart = location;
}
final double isActiveProb = walkerActiveProb(walker, tracker, refContext, locus, location);
profile.add(location, isActiveProb);
}
// Grab all the previously unseen reads from this pileup and add them to the massive read list
@ -103,52 +94,100 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
if( !myReads.contains(read) ) {
myReads.add(read);
}
// If this is the last pileup for this shard calculate the minimum alignment start so that we know
// which active regions in the work queue are now safe to process
minStart = Math.min(minStart, read.getAlignmentStart());
}
// If this is the last pileup for this shard calculate the minimum alignment start so that we know
// which active regions in the work queue are now safe to process
if( !locusView.hasNext() ) {
for( final PileupElement p : locus.getBasePileup() ) {
final GATKSAMRecord read = p.getRead();
if( !myReads.contains(read) ) {
myReads.add(read);
}
if( read.getAlignmentStart() < minStart ) { minStart = read.getAlignmentStart(); }
}
}
prevLoc = location;
printProgress(dataProvider.getShard(), locus.getLocation());
}
// Take the individual isActive calls and integrate them into contiguous active regions and
// add these blocks of work to the work queue
final ArrayList<ActiveRegion> activeRegions = integrateActiveList( isActiveList, firstIsActiveStart, activeRegionExtension, walker.presetActiveRegions != null );
logger.debug("Integrated " + isActiveList.size() + " isActive calls into " + activeRegions.size() + " regions." );
if( walker.activeRegionOutStream == null ) {
workQueue.addAll( activeRegions );
} else { // Just want to output the active regions to a file, not actually process them
for( final ActiveRegion activeRegion : activeRegions ) {
if( activeRegion.isActive ) {
walker.activeRegionOutStream.println( activeRegion.getLocation() );
}
}
}
// band-pass filter the list of isActive probabilities and turn into active regions
final ActivityProfile bandPassFiltered = profile.bandPassFilter();
final List<ActiveRegion> activeRegions = bandPassFiltered.createActiveRegions( activeRegionExtension, maxRegionSize );
// Since we've traversed sufficiently past this point (or this contig!) in the workQueue we can unload those regions and process them
while( workQueue.peek() != null && (workQueue.peek().getExtendedLoc().getStop() < minStart || !workQueue.peek().getExtendedLoc().getContig().equals(dataProvider.getLocus().getContig())) ) {
final ActiveRegion activeRegion = workQueue.remove();
sum = processActiveRegion( activeRegion, myReads, workQueue, sum, walker );
}
// add active regions to queue of regions to process
workQueue.addAll( activeRegions );
logger.debug("Integrated " + profile.size() + " isActive calls into " + activeRegions.size() + " regions." );
// now go and process all of the active regions
sum = processActiveRegions(walker, sum, minStart, dataProvider.getLocus().getContig());
}
return sum;
}
// Special function called in LinearMicroScheduler to empty out the work queue. Ugly for now but will be cleaned up when we push this functionality more into the engine
public T endTraversal( final Walker<M,T> walker, T sum) {
// --------------------------------------------------------------------------------
//
// simple utility functions
//
// --------------------------------------------------------------------------------
/**
 * Computes the activity probability for a single locus.
 *
 * When the walker has preset active regions, the result is 1.0 if the location overlaps one of them and
 * 0.0 otherwise; the walker's isActive function is not consulted. Without preset regions the walker's
 * isActive function decides.
 *
 * @param walker     the active region walker
 * @param tracker    the reference metadata tracker passed on to the walker's isActive
 * @param refContext the reference context passed on to the walker's isActive
 * @param locus      the alignment context passed on to the walker's isActive
 * @param location   the genomic location tested against the preset active regions
 * @return the activity probability for this locus
 */
private final double walkerActiveProb(final ActiveRegionWalker<M,T> walker,
                                      final RefMetaDataTracker tracker, final ReferenceContext refContext,
                                      final AlignmentContext locus, final GenomeLoc location) {
    if ( !walker.hasPresetActiveRegions() ) {
        return walker.isActive( tracker, refContext, locus );
    }

    final boolean insidePresetRegion = walker.presetActiveRegions.overlaps(location);
    return insidePresetRegion ? 1.0 : 0.0;
}
/**
 * Picks the reference-ordered view to use for this walker.
 *
 * If the walker's data source is {@code DataSource.REFERENCE_ORDERED_DATA},
 * the supplied locus view is reused (cast to {@code RodLocusView}); for any
 * other data source a new {@code ManagingReferenceOrderedView} is built over
 * the data provider.
 *
 * @param walker       the walker whose data source decides the view type
 * @param dataProvider the shard data provider backing a newly created view
 * @param locusView    the locus view already created for this shard
 * @return the reference-ordered view to query during traversal
 */
private ReferenceOrderedView getReferenceOrderedView( final ActiveRegionWalker<M,T> walker,
                                                      final LocusShardDataProvider dataProvider,
                                                      final LocusView locusView) {
    if ( WalkerManager.getWalkerDataSource(walker) == DataSource.REFERENCE_ORDERED_DATA ) {
        // ROD-driven traversal: the locus view doubles as the ROD view.
        return (RodLocusView)locusView;
    }
    return new ManagingReferenceOrderedView( dataProvider );
}
// --------------------------------------------------------------------------------
//
// code to handle processing active regions
//
// --------------------------------------------------------------------------------
/**
 * Dispatches the queued active regions either to the output stream or to the walker.
 *
 * If the walker has an {@code activeRegionOutStream}, the regions are only
 * written out and {@code sum} is returned unchanged. Otherwise the regions are
 * handed to {@code callWalkerMapOnActiveRegions} and its result is returned.
 *
 * @param walker        the active region walker being driven
 * @param sum           the running reduce value
 * @param minStart      forwarded to {@code callWalkerMapOnActiveRegions}
 * @param currentContig forwarded to {@code callWalkerMapOnActiveRegions}
 * @return the (possibly updated) reduce value
 */
private T processActiveRegions( final ActiveRegionWalker<M,T> walker, T sum, final int minStart, final String currentContig ) {
    final boolean writeRegionsOnly = walker.activeRegionOutStream != null;
    if ( !writeRegionsOnly ) {
        return callWalkerMapOnActiveRegions(walker, sum, minStart, currentContig);
    }

    // Output-only mode: emit the regions, leave the reduce value untouched.
    writeActiveRegionsToStream(walker);
    return sum;
}
/**
 * Write out each active region currently in the work queue to the walker's
 * activeRegionOutStream, removing the regions from the queue as they are handled.
 *
 * Only regions flagged {@code isActive} are printed; inactive regions are
 * dropped silently. The queue is drained rather than merely iterated because
 * this method is invoked every time new regions are added to the queue (and
 * once more at the end of the traversal): leaving the regions queued would
 * print the same locations again on every later invocation and would keep
 * every region in memory for the whole run.
 *
 * @param walker the walker whose activeRegionOutStream receives the locations;
 *               the stream must be non-null
 */
private void writeActiveRegionsToStream( final ActiveRegionWalker<M,T> walker ) {
    // Just want to output the active regions to a file, not actually process them
    while( workQueue.peek() != null ) {
        final ActiveRegion activeRegion = workQueue.remove();
        if( activeRegion.isActive ) {
            walker.activeRegionOutStream.println( activeRegion.getLocation() );
        }
    }
}
private T callWalkerMapOnActiveRegions( final ActiveRegionWalker<M,T> walker, T sum, final int minStart, final String currentContig ) {
// Since we've traversed sufficiently past this point (or this contig!) in the workQueue we can unload those regions and process them
// TODO can implement parallel traversal here
while( workQueue.peek() != null ) {
final ActiveRegion activeRegion = workQueue.remove();
sum = processActiveRegion( activeRegion, myReads, workQueue, sum, (ActiveRegionWalker<M,T>) walker );
final GenomeLoc extendedLoc = workQueue.peek().getExtendedLoc();
if ( extendedLoc.getStop() < minStart || (currentContig != null && !workQueue.peek().getExtendedLoc().getContig().equals(currentContig))) {
final ActiveRegion activeRegion = workQueue.remove();
sum = processActiveRegion( activeRegion, myReads, workQueue, sum, walker );
} else {
break;
}
}
return sum;
@ -193,6 +232,12 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
return walker.reduce( x, sum );
}
// --------------------------------------------------------------------------------
//
// engine interaction code
//
// --------------------------------------------------------------------------------
/**
* Gets the best view of loci for this walker given the available data.
* @param walker walker to interrogate.
@ -211,48 +256,11 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
throw new UnsupportedOperationException("Unsupported traversal type: " + dataSource);
}
// band-pass filter the list of isActive probabilities and turn into active regions
private ArrayList<ActiveRegion> integrateActiveList( final ArrayList<Double> activeList, final GenomeLoc firstIsActiveStart, final int activeRegionExtension, final boolean presetRegions ) {
final double ACTIVE_PROB_THRESHOLD = 0.2; // BUGBUG: needs to be set-able by the walker author
final ArrayList<ActiveRegion> returnList = new ArrayList<ActiveRegion>();
if( activeList.size() == 0 ) {
return returnList;
} else if( activeList.size() == 1 ) {
returnList.add( new ActiveRegion( engine.getGenomeLocParser().createGenomeLoc(firstIsActiveStart.getContig(), firstIsActiveStart.getStart(), firstIsActiveStart.getStart()),
activeList.get(0) > ACTIVE_PROB_THRESHOLD, engine.getGenomeLocParser(), activeRegionExtension ) );
return returnList;
} else {
final Double[] activeProbArray = activeList.toArray(new Double[activeList.size()]);
final double[] filteredProbArray = new double[activeProbArray.length];
final int FILTER_SIZE = ( presetRegions ? 0 : 50 ); // BUGBUG: needs to be set-able by the walker author
final int MAX_ACTIVE_REGION = ( presetRegions ? 16001 : 425 ); // BUGBUG: needs to be set-able by the walker author
for( int iii = 0; iii < activeProbArray.length; iii++ ) {
double maxVal = 0;
for( int jjj = Math.max(0, iii-FILTER_SIZE); jjj < Math.min(activeList.size(), iii+FILTER_SIZE+1); jjj++ ) {
if( activeProbArray[jjj] > maxVal ) { maxVal = activeProbArray[jjj]; }
}
filteredProbArray[iii] = maxVal;
}
boolean curStatus = filteredProbArray[0] > ACTIVE_PROB_THRESHOLD;
int curStart = 0;
for(int iii = 1; iii < filteredProbArray.length; iii++ ) {
final boolean thisStatus = filteredProbArray[iii] > ACTIVE_PROB_THRESHOLD;
if( curStatus != thisStatus || (iii-curStart) > MAX_ACTIVE_REGION ) {
returnList.add( new ActiveRegion(
engine.getGenomeLocParser().createGenomeLoc(firstIsActiveStart.getContig(), firstIsActiveStart.getStart() + curStart, firstIsActiveStart.getStart() + (iii-1)),
curStatus, engine.getGenomeLocParser(), activeRegionExtension ) );
curStatus = thisStatus;
curStart = iii;
}
}
if( curStart != filteredProbArray.length-1 ) {
returnList.add( new ActiveRegion(
engine.getGenomeLocParser().createGenomeLoc(firstIsActiveStart.getContig(), firstIsActiveStart.getStart() + curStart, firstIsActiveStart.getStart() + (filteredProbArray.length-1)),
curStatus, engine.getGenomeLocParser(), activeRegionExtension ) );
}
return returnList;
}
/**
 * Flushes whatever is still sitting in the work queue once traversal is over.
 * Called from LinearMicroScheduler; ugly for now, to be cleaned up when this
 * functionality is pushed further into the engine.
 *
 * The queue is processed with {@code Integer.MAX_VALUE} as the minimum start
 * and a {@code null} contig.
 *
 * @param walker the walker being traversed; it is cast to an ActiveRegionWalker
 * @param sum    the running reduce value
 * @return the reduce value after the remaining regions have been processed
 */
public T endTraversal( final Walker<M,T> walker, T sum) {
    final ActiveRegionWalker<M,T> regionWalker = (ActiveRegionWalker<M,T>)walker;
    return processActiveRegions(regionWalker, sum, Integer.MAX_VALUE, null);
}
}

View File

@ -16,4 +16,5 @@ import java.lang.annotation.RetentionPolicy;
/**
 * Annotation carrying the active-region sizing parameters of a walker.
 * NOTE(review): presumably both values are measured in reference bases -- confirm with the traversal code.
 */
public @interface ActiveRegionExtension {
// Amount by which each active region is extended; defaults to 0 (no extension).
public int extension() default 0;
// Upper bound on the size of a single active region; defaults to 1500.
public int maxRegion() default 1500;
}

View File

@ -7,10 +7,7 @@ import org.broadinstitute.sting.commandline.IntervalBinding;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.filters.DuplicateReadFilter;
import org.broadinstitute.sting.gatk.filters.FailsVendorQualityCheckFilter;
import org.broadinstitute.sting.gatk.filters.NotPrimaryAlignmentFilter;
import org.broadinstitute.sting.gatk.filters.UnmappedReadFilter;
import org.broadinstitute.sting.gatk.filters.*;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
@ -33,8 +30,8 @@ import java.util.List;
@By(DataSource.READS)
@Requires({DataSource.READS, DataSource.REFERENCE_BASES})
@PartitionBy(PartitionType.READ)
@ActiveRegionExtension(extension=50)
@ReadFilters({UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class})
@ActiveRegionExtension(extension=50,maxRegion=1500)
@ReadFilters({UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class, MappingQualityUnavailableFilter.class})
public abstract class ActiveRegionWalker<MapType, ReduceType> extends Walker<MapType, ReduceType> {
@Output(fullName="activeRegionOut", shortName="ARO", doc="Output the active region to this interval list file", required = false)
@ -45,6 +42,10 @@ public abstract class ActiveRegionWalker<MapType, ReduceType> extends Walker<Map
public GenomeLocSortedSet presetActiveRegions = null;
/**
 * Tells whether a set of preset active regions has been installed on this walker.
 *
 * @return true when {@code presetActiveRegions} is non-null, false otherwise
 */
public boolean hasPresetActiveRegions() {
    final boolean presetMissing = ( presetActiveRegions == null );
    return !presetMissing;
}
@Override
public void initialize() {
if( activeRegionBindings == null ) { return; }

View File

@ -35,6 +35,7 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
@ -65,8 +66,8 @@ public class PileupWalker extends LocusWalker<Integer, Integer> implements TreeR
@Output
PrintStream out;
@Argument(fullName="showIndelPileups",shortName="show_indels",doc="In addition to base pileups, generate pileups of extended indel events")
public boolean SHOW_INDEL_PILEUPS = false;
@Argument(fullName="showVerbose",shortName="verbose",doc="Add an extra verbose section to the pileup output")
public boolean SHOW_VERBOSE = false;
@Input(fullName="metadata",shortName="metadata",doc="Add these ROD bindings to the output Pileup", required=false)
public List<RodBinding<Feature>> rods = Collections.emptyList();
@ -74,28 +75,18 @@ public class PileupWalker extends LocusWalker<Integer, Integer> implements TreeR
public void initialize() {
}
public boolean generateExtendedEvents() { return SHOW_INDEL_PILEUPS; }
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
String rods = getReferenceOrderedData( tracker );
if ( context.hasBasePileup() ) {
ReadBackedPileup basePileup = context.getBasePileup();
out.printf("%s %s%n", basePileup.getPileupString(ref.getBaseAsChar()), rods);
out.printf("%s %s", basePileup.getPileupString((char)ref.getBase()), rods);
if ( SHOW_VERBOSE )
out.printf(" %s", createVerboseOutput(basePileup));
out.println();
}
if ( context.hasExtendedEventPileup() ) {
ReadBackedExtendedEventPileup indelPileup = context.getExtendedEventPileup();
List<Pair<String,Integer>> eventCounts = indelPileup.getEventStringsWithCounts(ref.getBases());
out.printf("%s %s ", indelPileup.getShortPileupString(), rods);
int i = 0;
for ( ; i < eventCounts.size() - 1 ; i++ ) {
out.printf("%s:%d,",eventCounts.get(i).first,eventCounts.get(i).second);
}
out.printf("%s:%d%n",eventCounts.get(i).first,eventCounts.get(i).second);
}
return 1;
}
@ -126,6 +117,31 @@ public class PileupWalker extends LocusWalker<Integer, Integer> implements TreeR
return rodString;
}
private static final String verboseDelimiter = "@"; // it's ugly to use "@" but it's literally the only usable character not allowed in read names
/**
 * Builds the extra "verbose" section appended to a pileup output line.
 *
 * The result starts with the pileup's number of deletions and a single space,
 * followed by one comma-separated entry per pileup element. Each entry is
 * read name, offset, read length and mapping quality, joined with
 * {@code verboseDelimiter}.
 *
 * @param pileup the base pileup to describe
 * @return the verbose description; for an empty pileup only the deletion count and the space
 */
private static String createVerboseOutput(final ReadBackedPileup pileup) {
    final StringBuilder verbose = new StringBuilder();
    verbose.append(pileup.getNumberOfDeletions()).append(" ");

    // Empty before the first element, "," before every later one.
    String separator = "";
    for ( final PileupElement element : pileup ) {
        verbose.append(separator);
        separator = ",";

        verbose.append(element.getRead().getReadName()).append(verboseDelimiter);
        verbose.append(element.getOffset()).append(verboseDelimiter);
        verbose.append(element.getRead().getReadLength()).append(verboseDelimiter);
        verbose.append(element.getRead().getMappingQuality());
    }

    return verbose.toString();
}
@Override
public void onTraversalDone(Integer result) {
out.println("[REDUCE RESULT] Traversal result is: " + result);

View File

@ -27,7 +27,6 @@ package org.broadinstitute.sting.gatk.walkers;
import net.sf.samtools.SAMFileWriter;
import net.sf.samtools.SAMReadGroupRecord;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
@ -91,7 +90,7 @@ import java.util.TreeSet;
*/
@BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = BAQ.ApplicationTime.ON_OUTPUT)
@Requires({DataSource.READS, DataSource.REFERENCE})
public class PrintReadsWalker extends ReadWalker<SAMRecord, SAMFileWriter> {
public class PrintReadsWalker extends ReadWalker<GATKSAMRecord, SAMFileWriter> {
@Output(doc="Write output to this BAM filename instead of STDOUT")
SAMFileWriter out;
@ -129,6 +128,13 @@ public class PrintReadsWalker extends ReadWalker<SAMRecord, SAMFileWriter> {
@Argument(fullName="sample_name", shortName="sn", doc="Sample name to be included in the analysis. Can be specified multiple times.", required=false)
public Set<String> sampleNames = new TreeSet<String>();
/**
* Erase all extra attributes in the read but keep the read group information
*/
@Argument(fullName="simplify", shortName="s", doc="Simplify all reads.", required=false)
public boolean simplifyReads = false;
private TreeSet<String> samplesToChoose = new TreeSet<String>();
private boolean SAMPLES_SPECIFIED = false;
@ -162,7 +168,7 @@ public class PrintReadsWalker extends ReadWalker<SAMRecord, SAMFileWriter> {
* The reads filter function.
*
* @param ref the reference bases that correspond to our read, if a reference was provided
* @param read the read itself, as a SAMRecord
* @param read the read itself, as a GATKSAMRecord
* @return true if the read passes the filter, false if it doesn't
*/
public boolean filter(ReferenceContext ref, GATKSAMRecord read) {
@ -208,11 +214,11 @@ public class PrintReadsWalker extends ReadWalker<SAMRecord, SAMFileWriter> {
* The reads map function.
*
* @param ref the reference bases that correspond to our read, if a reference was provided
* @param read the read itself, as a SAMRecord
* @param read the read itself, as a GATKSAMRecord
* @return the read itself
*/
public SAMRecord map( ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker ) {
return read;
public GATKSAMRecord map( ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker ) {
return simplifyReads ? read.simplify() : read;
}
/**
@ -232,7 +238,7 @@ public class PrintReadsWalker extends ReadWalker<SAMRecord, SAMFileWriter> {
* @param output the output source
* @return the SAMFileWriter, so that the next reduce can emit to the same source
*/
public SAMFileWriter reduce( SAMRecord read, SAMFileWriter output ) {
public SAMFileWriter reduce( GATKSAMRecord read, SAMFileWriter output ) {
output.addAlignment(read);
return output;
}

View File

@ -114,35 +114,6 @@ public abstract class Walker<MapType, ReduceType> {
return false;
}
/**
* This method states whether you want to see pileups of "extended events" (currently, indels only)
* at every locus that has at least one indel associated with it. Consider the following situation:
*
* ref: AT--CTGA (note that we expanded the ref here with -- to accommodate insertion in read3)
* read1: AT--CTGA (perfectly matches the ref)
* read2: AT----GA (deletion -CT w.r.t. the ref)
* read3: ATGGCTGA (insertion +GG w.r.t the ref)
*
* Normally, the locus iterator only returns read base pileups over reference bases, optionally with deleted bases
* included (see #includeReadsWithDeletionAtLoci()). In other words, the pileup over the second reference base (T)
* will be [T,T,T] (all reads count), for the next reference base (C) the pileup will be [C,C] (or [C,-,C] if
* #includeReadsWithDeletionAtLoci() is true), next pileup generated over the next reference
* base (T) will be either [T,T], or [T,'-',T], etc. In this default mode, a) insertions are not seen by a walker at all, and
* b) deletions are (optionally) seen only on a base-by-base basis (as the step-by-step traversal over the reference
* bases is performed). In the extended event mode, however, if there is at least one indel associated with a reference
* locus, the engine will generate an <i>additional</i> call to the walker's map() method, with a pileup of
* full-length extended indel/noevent calls. This call will be made <i>after</i> the conventional base pileup call
* at that locus. Thus, in the example above, a conventional call will be first made at the second reference base (T),
* with the [T,T,T] pileup of read bases, then an extended event call will be made at the <i>same</i> locus with
* pileup [no_event, -CT, +GG] (i.e. extended events associated with that reference base). After that, the traversal
* engine will move to the next reference base.
*
* @return false if you do not want to receive extra pileups with extended events, or true if you do.
*/
public boolean generateExtendedEvents() {
return false;
}
public void initialize() { }
/**

View File

@ -30,10 +30,12 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine;
import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.GenotypesContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
@ -49,72 +51,101 @@ import java.util.Map;
*/
public class AlleleBalance extends InfoFieldAnnotation {
char[] BASES = {'A','C','G','T'};
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {
if ( stratifiedContexts.size() == 0 )
return null;
if ( !vc.isBiallelic() )
return null;
final GenotypesContext genotypes = vc.getGenotypes();
if ( !vc.hasGenotypes() )
return null;
double ratio = 0.0;
double totalWeights = 0.0;
double ratioHom = 0.0;
double ratioHet = 0.0;
double weightHom = 0.0;
double weightHet = 0.0;
double overallNonDiploid = 0.0;
for ( Genotype genotype : genotypes ) {
// we care only about het calls
if ( !genotype.isHet() )
continue;
AlignmentContext context = stratifiedContexts.get(genotype.getSampleName());
if ( context == null )
if ( context == null || !context.hasBasePileup() )
continue;
if ( vc.isSNP() && context.hasBasePileup() ) {
final String bases = new String(context.getBasePileup().getBases());
final ReadBackedPileup pileup = context.getBasePileup();
if ( vc.isSNP() ) {
final String bases = new String(pileup.getBases());
if ( bases.length() == 0 )
return null;
char refChr = vc.getReference().toString().charAt(0);
char altChr = vc.getAlternateAllele(0).toString().charAt(0);
int refCount = MathUtils.countOccurrences(refChr, bases);
int altCount = MathUtils.countOccurrences(altChr, bases);
double pTrue = 1.0 - Math.pow(10.0,genotype.getLog10PError());
if ( genotype.isHet() ) {
final char refChr = vc.getReference().toString().charAt(0);
final char altChr = vc.getAlternateAllele(0).toString().charAt(0);
// sanity check
if ( refCount + altCount == 0 )
continue;
final int refCount = MathUtils.countOccurrences(refChr, bases);
final int altCount = MathUtils.countOccurrences(altChr, bases);
final int otherCount = bases.length()-refCount-altCount;
// weight the allele balance by genotype quality so that e.g. mis-called homs don't affect the ratio too much
ratio += genotype.getLog10PError() * ((double)refCount / (double)(refCount + altCount));
totalWeights += genotype.getLog10PError();
} else if ( vc.isIndel() && context.hasExtendedEventPileup() ) {
final ReadBackedExtendedEventPileup indelPileup = context.getExtendedEventPileup();
if ( indelPileup == null ) {
continue;
// sanity check
if ( refCount + altCount == 0 )
continue;
// weight the allele balance by genotype quality so that e.g. mis-called homs don't affect the ratio too much
ratioHet += pTrue * ((double)refCount / (double)(refCount + altCount));
weightHet += pTrue;
overallNonDiploid += ( (double) otherCount )/(bases.length()*genotypes.size());
} else if ( genotype.isHom() ) {
char alleleChr;
if ( genotype.isHomRef() ) {
alleleChr = vc.getReference().toString().charAt(0);
} else {
alleleChr = vc.getAlternateAllele(0).toString().charAt(0);
}
final int alleleCount = MathUtils.countOccurrences(alleleChr,bases);
int bestOtherCount = 0;
for ( char b : BASES ) {
if ( b == alleleChr )
continue;
int count = MathUtils.countOccurrences(b,bases);
if ( count > bestOtherCount )
bestOtherCount = count;
}
final int otherCount = bases.length() - alleleCount;
ratioHom += pTrue*( (double) alleleCount)/(alleleCount+bestOtherCount);
weightHom += pTrue;
overallNonDiploid += ((double ) otherCount)/(bases.length()*genotypes.size());
}
// todo -- actually care about indel length from the pileup (agnostic at the moment)
int refCount = indelPileup.getNumberOfElements();
int altCount = vc.isSimpleInsertion() ? indelPileup.getNumberOfInsertions() : indelPileup.getNumberOfDeletions();
if ( refCount + altCount == 0 ) {
continue;
}
ratio += /* todo -- make not uniform */ 1 * ((double) refCount) / (double) (refCount + altCount);
totalWeights += 1;
// Allele Balance for indels was not being computed correctly (since there was no allele matching). Instead of
// prolonging the life of imperfect code, I've decided to delete it. If someone else wants to try again from
// scratch, be my guest - but make sure it's done correctly! [EB]
}
}
// make sure we had a het genotype
if ( MathUtils.compareDoubles(totalWeights, 0.0) == 0 )
return null;
Map<String, Object> map = new HashMap<String, Object>();
map.put(getKeyNames().get(0), String.format("%.3f", (ratio / totalWeights)));
if ( weightHet > 0.0 ) {
map.put("ABHet",ratioHet/weightHet);
}
if ( weightHom > 0.0 ) {
map.put("ABHom",ratioHom/weightHom);
}
if ( overallNonDiploid > 0.0 ) {
map.put("OND",overallNonDiploid);
}
return map;
}
public List<String> getKeyNames() { return Arrays.asList("AB"); }
public List<VCFInfoHeaderLine> getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("AB", 1, VCFHeaderLineType.Float, "Allele Balance for hets (ref/(ref+alt))")); }
public List<String> getKeyNames() { return Arrays.asList("ABHet","ABHom","OND"); }
public List<VCFInfoHeaderLine> getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("ABHet", 1, VCFHeaderLineType.Float, "Allele Balance for hets (ref/(ref+alt))"),
new VCFInfoHeaderLine("ABHom", 1, VCFHeaderLineType.Float, "Allele Balance for homs (A/(A+O))"),
new VCFInfoHeaderLine("OND", 1, VCFHeaderLineType.Float, "Overall non-diploid ratio (alleles/(alleles+non-alleles))")); }
}

View File

@ -5,16 +5,14 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import java.util.Arrays;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.*;
/**
* The phred-scaled p-value (u-based z-approximation) from the Mann-Whitney Rank Sum Test for base qualities (ref bases vs. bases of the alternate allele).
* The u-based z-approximation from the Mann-Whitney Rank Sum Test for base qualities (ref bases vs. bases of the alternate allele).
* Note that the base quality rank sum test can not be calculated for homozygous sites.
*/
public class BaseQualityRankSumTest extends RankSumTest {
@ -31,8 +29,31 @@ public class BaseQualityRankSumTest extends RankSumTest {
altQuals.add((double)p.getQual());
}
}
}
/**
 * Allele-stratified variant of the quality-filling step: NOT YET IMPLEMENTED.
 *
 * The method currently returns immediately, so neither {@code refQuals} nor
 * {@code altQuals} is modified by it.
 *
 * @param ref               the reference allele
 * @param alts              the alternate alleles
 * @param refLoc            NOTE(review): presumably the reference position of the site -- unused here, confirm with caller
 * @param stratifiedContext reads grouped by the allele they are binned under
 * @param refQuals          output list intended to receive qualities of ref-supporting bases (left untouched)
 * @param altQuals          output list intended to receive qualities of alt-supporting bases (left untouched)
 */
protected void fillQualsFromPileup(final Allele ref, final List<Allele> alts, final int refLoc, final Map<Allele, List<GATKSAMRecord>> stratifiedContext, final List<Double> refQuals, final List<Double> altQuals) {
// TODO -- implement me; how do we pull out the correct offset from the read?
return;
// Disabled sketch of the intended implementation. It is not compilable as written:
// the inner loop refers to a pileup element 'p' that is never defined, which is
// exactly the open question in the TODO above (how to find the read offset).
/*
for ( final Map.Entry<Allele, List<GATKSAMRecord>> alleleBin : stratifiedContext.entrySet() ) {
final boolean matchesRef = ref.equals(alleleBin.getKey());
final boolean matchesAlt = alts.contains(alleleBin.getKey());
if ( !matchesRef && !matchesAlt )
continue;
for ( final GATKSAMRecord read : alleleBin.getValue() ) {
if ( isUsableBase(p) ) {
if ( matchesRef )
refQuals.add((double)p.getQual());
else
altQuals.add((double)p.getQual());
}
}
}
*/
}
protected void fillIndelQualsFromPileup(ReadBackedPileup pileup, List<Double> refQuals, List<Double> altQuals) {
// equivalent is whether indel likelihoods for reads corresponding to ref allele are more likely than reads corresponding to alt allele ?
HashMap<PileupElement,LinkedHashMap<Allele,Double>> indelLikelihoodMap = IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap();

View File

@ -25,9 +25,11 @@
package org.broadinstitute.sting.gatk.walkers.annotator;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBasedAnnotation;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation;
@ -35,13 +37,14 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineCount;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.utils.codecs.vcf.*;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.*;
/**
@ -49,17 +52,31 @@ import java.util.Map;
* allele Frequency, for each ALT allele, in the same order as listed; total number
* of alleles in called genotypes.
*/
public class ChromosomeCounts extends InfoFieldAnnotation implements StandardAnnotation {
public class ChromosomeCounts extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation {
private String[] keyNames = { VCFConstants.ALLELE_NUMBER_KEY, VCFConstants.ALLELE_COUNT_KEY, VCFConstants.ALLELE_FREQUENCY_KEY };
private VCFInfoHeaderLine[] descriptions = { new VCFInfoHeaderLine(VCFConstants.ALLELE_FREQUENCY_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Float, "Allele Frequency, for each ALT allele, in the same order as listed"),
new VCFInfoHeaderLine(VCFConstants.ALLELE_COUNT_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Integer, "Allele count in genotypes, for each ALT allele, in the same order as listed"),
new VCFInfoHeaderLine(VCFConstants.ALLELE_NUMBER_KEY, 1, VCFHeaderLineType.Integer, "Total number of alleles in called genotypes") };
private Set<String> founderIds = new HashSet<String>();
/**
 * Locus-based entry point: computes the chromosome-count annotations for a variant.
 *
 * @param tracker            unused by this annotation
 * @param walker             unused by this annotation
 * @param ref                unused by this annotation
 * @param stratifiedContexts unused by this annotation
 * @param vc                 the variant whose genotypes are counted
 * @return null when {@code vc} has no genotypes; otherwise the map filled in by
 *         {@code VariantContextUtils.calculateChromosomeCounts}, called with the founder ids
 */
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {
    return vc.hasGenotypes()
            ? VariantContextUtils.calculateChromosomeCounts(vc, new HashMap<String, Object>(), true, founderIds)
            : null;
}
/**
 * Caches the founder ids from the walker's sample database so that later
 * annotate calls can pass them to the chromosome-count computation.
 *
 * @param walker      the annotating walker; it is cast to {@code Walker} to reach its sample database
 * @param toolkit     unused here
 * @param headerLines unused here
 */
public void initialize ( AnnotatorCompatibleWalker walker, GenomeAnalysisEngine toolkit, Set<VCFHeaderLine> headerLines ){
    // If families were given, get the founders ids
    final Walker sampleAwareWalker = (Walker)walker;
    founderIds = sampleAwareWalker.getSampleDB().getFounderIds();
}
public Map<String, Object> annotate(Map<String, Map<Allele, List<GATKSAMRecord>>> stratifiedContexts, VariantContext vc) {
if ( ! vc.hasGenotypes() )
return null;
return VariantContextUtils.calculateChromosomeCounts(vc, new HashMap<String, Object>(), true);
}

View File

@ -3,12 +3,15 @@ package org.broadinstitute.sting.gatk.walkers.annotator;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBasedAnnotation;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.Arrays;
@ -33,7 +36,7 @@ import java.util.Map;
* Note that the DP is affected by downsampling (-dcov) though, so the max value one can obtain for N samples with
* -dcov D is N * D
*/
public class DepthOfCoverage extends InfoFieldAnnotation implements StandardAnnotation {
public class DepthOfCoverage extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation {
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {
if ( stratifiedContexts.size() == 0 )
@ -41,7 +44,23 @@ public class DepthOfCoverage extends InfoFieldAnnotation implements StandardAnno
int depth = 0;
for ( Map.Entry<String, AlignmentContext> sample : stratifiedContexts.entrySet() )
depth += sample.getValue().hasBasePileup() ? sample.getValue().getBasePileup().depthOfCoverage() : sample.getValue().getExtendedEventPileup().depthOfCoverage();
depth += sample.getValue().hasBasePileup() ? sample.getValue().getBasePileup().depthOfCoverage() : 0;
Map<String, Object> map = new HashMap<String, Object>();
map.put(getKeyNames().get(0), String.format("%d", depth));
return map;
}
public Map<String, Object> annotate(Map<String, Map<Allele, List<GATKSAMRecord>>> stratifiedContexts, VariantContext vc) {
if ( stratifiedContexts.size() == 0 )
return null;
int depth = 0;
for ( final Map<Allele, List<GATKSAMRecord>> alleleBins : stratifiedContexts.values() ) {
for ( final List<GATKSAMRecord> alleleBin : alleleBins.values() ) {
depth += alleleBin.size();
}
}
Map<String, Object> map = new HashMap<String, Object>();
map.put(getKeyNames().get(0), String.format("%d", depth));
return map;

View File

@ -9,9 +9,7 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnota
import org.broadinstitute.sting.utils.codecs.vcf.VCFFormatHeaderLine;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineCount;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
import org.broadinstitute.sting.utils.pileup.ExtendedEventPileupElement;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
@ -44,9 +42,9 @@ import java.util.Map;
*/
public class DepthPerAlleleBySample extends GenotypeAnnotation implements StandardAnnotation {
private static String REF_ALLELE = "REF";
private static final String REF_ALLELE = "REF";
private static String DEL = "DEL"; // constant, for speed: no need to create a key string for deletion allele every time
private static final String DEL = "DEL"; // constant, for speed: no need to create a key string for deletion allele every time
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, AlignmentContext stratifiedContext, VariantContext vc, Genotype g) {
if ( g == null || !g.isCalled() )
@ -62,7 +60,8 @@ public class DepthPerAlleleBySample extends GenotypeAnnotation implements Standa
private Map<String,Object> annotateSNP(AlignmentContext stratifiedContext, VariantContext vc) {
if ( ! stratifiedContext.hasBasePileup() ) return null;
if ( ! stratifiedContext.hasBasePileup() )
return null;
HashMap<Byte, Integer> alleleCounts = new HashMap<Byte, Integer>();
for ( Allele allele : vc.getAlleles() )
@ -87,17 +86,16 @@ public class DepthPerAlleleBySample extends GenotypeAnnotation implements Standa
private Map<String,Object> annotateIndel(AlignmentContext stratifiedContext, VariantContext vc) {
if ( ! stratifiedContext.hasExtendedEventPileup() ) {
if ( ! stratifiedContext.hasBasePileup() )
return null;
}
ReadBackedExtendedEventPileup pileup = stratifiedContext.getExtendedEventPileup();
ReadBackedPileup pileup = stratifiedContext.getBasePileup();
if ( pileup == null )
return null;
HashMap<String, Integer> alleleCounts = new HashMap<String, Integer>();
alleleCounts.put(REF_ALLELE,0);
Allele refAllele = vc.getReference();
final HashMap<String, Integer> alleleCounts = new HashMap<String, Integer>();
alleleCounts.put(REF_ALLELE, 0);
final Allele refAllele = vc.getReference();
for ( Allele allele : vc.getAlternateAlleles() ) {
@ -108,33 +106,24 @@ public class DepthPerAlleleBySample extends GenotypeAnnotation implements Standa
alleleCounts.put(getAlleleRepresentation(allele), 0);
}
for ( ExtendedEventPileupElement e : pileup.toExtendedIterable() ) {
if ( e.isInsertion() ) {
for ( PileupElement p : pileup ) {
if ( p.isBeforeInsertion() ) {
final String b = e.getEventBases();
final String b = p.getEventBases();
if ( alleleCounts.containsKey(b) ) {
alleleCounts.put(b, alleleCounts.get(b)+1);
}
} else {
if ( e.isDeletion() ) {
if ( e.getEventLength() == refAllele.length() ) {
} else if ( p.isBeforeDeletionStart() ) {
if ( p.getEventLength() == refAllele.length() ) {
// this is indeed the deletion allele recorded in VC
final String b = DEL;
if ( alleleCounts.containsKey(b) ) {
alleleCounts.put(b, alleleCounts.get(b)+1);
}
}
// else {
// System.out.print(" deletion of WRONG length found");
// }
}
else {
if ( e.getRead().getAlignmentEnd() <= vc.getStart() ) {
continue;
}
alleleCounts.put(REF_ALLELE,alleleCounts.get(REF_ALLELE)+1);
}
} else if ( p.getRead().getAlignmentEnd() > vc.getStart() ) {
alleleCounts.put(REF_ALLELE, alleleCounts.get(REF_ALLELE)+1);
}
}

View File

@ -28,6 +28,7 @@ import cern.jet.math.Arithmetic;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBasedAnnotation;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation;
@ -37,6 +38,7 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
@ -49,7 +51,7 @@ import java.util.*;
* indicative of false positive calls. Note that the fisher strand test may not be
* calculated for certain complex indel cases or for multi-allelic sites.
*/
public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotation {
public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation {
private static final String FS = "FS";
private static final double MIN_PVALUE = 1E-320;
@ -78,6 +80,22 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat
return map;
}
/**
 * Computes the FS annotation from per-sample, per-allele read bins.
 *
 * Builds a 2x2 strand contingency table for the reference allele versus the alternate
 * allele with the highest allele count, converts it to a p-value, clamps that p-value
 * to MIN_PVALUE and reports it phred-scaled with three decimals under the FS key.
 *
 * @param stratifiedContexts sample name -> (allele -> reads assigned to that allele)
 * @param vc                 the variant being annotated
 * @return a single-entry map keyed by FS, or null if the site is not variant or no
 *         p-value could be computed
 */
public Map<String, Object> annotate(Map<String, Map<Allele, List<GATKSAMRecord>>> stratifiedContexts, VariantContext vc) {
    if ( !vc.isVariant() )
        return null;

    final int[][] table = getContingencyTable(stratifiedContexts, vc.getReference(), vc.getAltAlleleWithHighestAlleleCount());

    // Check for "no result" BEFORE clamping: the previous code null-checked the value
    // returned by Math.max(), which can never be null, so the guard was dead and a null
    // from the helper would have surfaced as a NullPointerException during unboxing.
    // NOTE(review): pValueForContingencyTable is defined outside this view; this assumes
    // it signals failure with a null Double -- confirm.
    final Double rawPValue = pValueForContingencyTable(table);
    if ( rawPValue == null )
        return null;

    // clamp so the phred-scaled value stays finite for vanishingly small p-values
    final double pvalue = Math.max(rawPValue, MIN_PVALUE);

    final Map<String, Object> map = new HashMap<String, Object>();
    map.put(FS, String.format("%.3f", QualityUtils.phredScaleErrorRate(pvalue)));
    return map;
}
public List<String> getKeyNames() {
return Arrays.asList(FS);
}
@ -193,6 +211,38 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat
return sum;
}
/**
 * Allocate and fill a 2x2 strand contingency table from per-sample allele bins.
 * In the end, it'll look something like this:
 *             fw      rc
 *   allele1   #       #
 *   allele2   #       #
 *
 * Row 0 counts reads binned under the reference allele, row 1 reads binned under the
 * alternate allele; bins keyed by any other allele are ignored. Column 0 counts
 * forward-strand reads, column 1 reverse-strand reads.
 *
 * @param stratifiedContexts sample name -> (allele -> reads assigned to that allele)
 * @param ref                the reference allele (row 0)
 * @param alt                the alternate allele to test (row 1)
 * @return a 2x2 contingency table
 */
private static int[][] getContingencyTable(Map<String, Map<Allele, List<GATKSAMRecord>>> stratifiedContexts, Allele ref, Allele alt) {
    int[][] table = new int[2][2];

    for ( final Map<Allele, List<GATKSAMRecord>> alleleBins : stratifiedContexts.values() ) {
        for ( final Map.Entry<Allele, List<GATKSAMRecord>> alleleBin : alleleBins.entrySet() ) {

            final boolean matchesRef = ref.equals(alleleBin.getKey());
            final boolean matchesAlt = alt.equals(alleleBin.getKey());
            if ( !matchesRef && !matchesAlt )
                continue;

            // the row depends only on the bin's allele, so compute it once per bin
            final int row = matchesRef ? 0 : 1;

            for ( final GATKSAMRecord read : alleleBin.getValue() ) {
                // A read is on the forward strand when its negative-strand flag is NOT set.
                // The flag was previously used un-negated, which put forward reads in the
                // "rc" column and disagreed with the pileup-based overload of this method.
                final boolean isFW = !read.getReadNegativeStrandFlag();
                final int column = isFW ? 0 : 1;

                table[row][column]++;
            }
        }
    }

    return table;
}
/**
Allocate and fill a 2x2 strand contingency table. In the end, it'll look something like this:
* fw rc
@ -214,8 +264,8 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat
Allele base = Allele.create(p.getBase(), false);
boolean isFW = !p.getRead().getReadNegativeStrandFlag();
boolean matchesRef = ref.equals(base, true);
boolean matchesAlt = alt.equals(base, true);
final boolean matchesRef = ref.equals(base, true);
final boolean matchesAlt = alt.equals(base, true);
if ( matchesRef || matchesAlt ) {
int row = matchesRef ? 0 : 1;
int column = isFW ? 0 : 1;
@ -227,6 +277,7 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat
return table;
}
/**
Allocate and fill a 2x2 strand contingency table. In the end, it'll look something like this:
* fw rc
@ -245,24 +296,16 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat
for ( String sample : stratifiedContexts.keySet() ) {
final AlignmentContext context = stratifiedContexts.get(sample);
if ( context == null )
if ( context == null || !context.hasBasePileup() )
continue;
ReadBackedPileup pileup = null;
if (context.hasExtendedEventPileup())
pileup = context.getExtendedEventPileup();
else if (context.hasBasePileup())
pileup = context.getBasePileup();
if (pileup == null)
continue;
for (final PileupElement p: pileup) {
final ReadBackedPileup pileup = context.getBasePileup();
for ( final PileupElement p : pileup ) {
if ( p.getRead().isReducedRead() ) // ignore reduced reads
continue;
if ( p.getRead().getMappingQuality() < 20)
if ( p.getRead().getMappingQuality() < 20 )
continue;
if (indelLikelihoodMap.containsKey(p)) {
if ( indelLikelihoodMap.containsKey(p) ) {
// to classify a pileup element as ref or alt, we look at the likelihood associated with the allele associated to this element.
// A pileup element then has a list of pairs of form (Allele, likelihood of this allele).
// To classify a pileup element as Ref or Alt, we look at the likelihood of corresponding alleles.

View File

@ -64,6 +64,9 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot
if (stratifiedContexts.size() == 0) // size 0 means that call was made by someone else and we have no data here
return null;
if (!vc.isSNP() && !vc.isIndel() && !vc.isMixed())
return null;
final AlignmentContext context = AlignmentContextUtils.joinContexts(stratifiedContexts.values());
final int contextWingSize = Math.min((ref.getWindow().size() - 1) / 2, MIN_CONTEXT_WING_SIZE);
@ -71,41 +74,27 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot
final int locus = ref.getLocus().getStart() + (ref.getLocus().getStop() - ref.getLocus().getStart()) / 2;
// Compute all haplotypes consistent with the current read pileup
ReadBackedPileup pileup = null;
if (context.hasExtendedEventPileup())
pileup = context.getExtendedEventPileup();
else if (context.hasBasePileup())
pileup = context.getBasePileup();
if (pileup == null)
if ( !context.hasBasePileup() )
return null;
final ReadBackedPileup pileup = context.getBasePileup();
// Compute all haplotypes consistent with the current read pileup
final List<Haplotype> haplotypes = computeHaplotypes(pileup, contextSize, locus, vc);
final MathUtils.RunningAverage scoreRA = new MathUtils.RunningAverage();
if (haplotypes != null) {
for (final Genotype genotype : vc.getGenotypes()) {
final AlignmentContext thisContext = stratifiedContexts.get(genotype.getSampleName());
if (thisContext != null) {
final ReadBackedPileup thisPileup;
if (thisContext.hasExtendedEventPileup())
thisPileup = thisContext.getExtendedEventPileup();
else if (thisContext.hasBasePileup())
thisPileup = thisContext.getBasePileup();
else
thisPileup = null;
if (thisPileup != null) {
if (vc.isSNP())
scoreRA.add(scoreReadsAgainstHaplotypes(haplotypes, thisPileup, contextSize, locus)); // Taking the simple average of all sample's score since the score can be negative and the RMS doesn't make sense
else if (vc.isIndel() || vc.isMixed()) {
Double d = scoreIndelsAgainstHaplotypes(thisPileup);
if (d == null)
return null;
scoreRA.add(d); // Taking the simple average of all sample's score since the score can be negative and the RMS doesn't make sense
} else
if (thisContext != null && thisContext.hasBasePileup()) {
final ReadBackedPileup thisPileup = thisContext.getBasePileup();
if (vc.isSNP())
scoreRA.add(scoreReadsAgainstHaplotypes(haplotypes, thisPileup, contextSize, locus)); // Taking the simple average of all sample's score since the score can be negative and the RMS doesn't make sense
else if (vc.isIndel() || vc.isMixed()) {
Double d = scoreIndelsAgainstHaplotypes(thisPileup);
if (d == null)
return null;
scoreRA.add(d); // Taking the simple average of all sample's score since the score can be negative and the RMS doesn't make sense
}
}
}

View File

@ -3,12 +3,15 @@ package org.broadinstitute.sting.gatk.walkers.annotator;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBasedAnnotation;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.GenotypesContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
@ -27,12 +30,19 @@ import java.util.Map;
* more information. Note that the Inbreeding Coefficient will not be calculated for files
* with fewer than a minimum (generally 10) number of samples.
*/
public class InbreedingCoeff extends InfoFieldAnnotation implements StandardAnnotation {
public class InbreedingCoeff extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation {
private static final int MIN_SAMPLES = 10;
/**
 * Pileup-based entry point for the inbreeding coefficient annotation.
 *
 * Only the variant context is consulted: tracker, walker, ref and stratifiedContexts
 * are not used, and the call is forwarded unchanged to calculateIC(vc).
 *
 * @return whatever calculateIC(vc) returns
 */
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {
return calculateIC(vc);
}
/**
 * Allele-bin based entry point for the inbreeding coefficient annotation.
 *
 * The per-sample read bins in stratifiedContexts are not used; the call is
 * forwarded unchanged to calculateIC(vc).
 *
 * @return whatever calculateIC(vc) returns
 */
public Map<String, Object> annotate(Map<String, Map<Allele, List<GATKSAMRecord>>> stratifiedContexts, VariantContext vc) {
return calculateIC(vc);
}
private Map<String, Object> calculateIC(final VariantContext vc) {
final GenotypesContext genotypes = vc.getGenotypes();
if ( genotypes == null || genotypes.size() < MIN_SAMPLES )
return null;

View File

@ -6,16 +6,14 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import java.util.Arrays;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.*;
/**
* The phred-scaled p-value (u-based z-approximation) from the Mann-Whitney Rank Sum Test for mapping qualities (reads with ref bases vs. those with the alternate allele)
* The u-based z-approximation from the Mann-Whitney Rank Sum Test for mapping qualities (reads with ref bases vs. those with the alternate allele)
* Note that the mapping quality rank sum test can not be calculated for homozygous sites.
*/
public class MappingQualityRankSumTest extends RankSumTest {
@ -35,6 +33,23 @@ public class MappingQualityRankSumTest extends RankSumTest {
}
}
}
/**
 * Collects mapping qualities from one sample's allele bins.
 *
 * Every read in the bin keyed by the reference allele contributes its mapping quality
 * to refQuals; every read in a bin keyed by one of the alternate alleles contributes
 * to altQuals. Bins keyed by any other allele are skipped. refLoc is not used here.
 *
 * @param ref               the reference allele
 * @param alts              the alternate alleles
 * @param refLoc            unused by this implementation
 * @param stratifiedContext allele -> reads assigned to that allele, for one sample
 * @param refQuals          output list, appended to in place
 * @param altQuals          output list, appended to in place
 */
protected void fillQualsFromPileup(final Allele ref, final List<Allele> alts, final int refLoc, final Map<Allele, List<GATKSAMRecord>> stratifiedContext, final List<Double> refQuals, final List<Double> altQuals) {
    for ( final Map.Entry<Allele, List<GATKSAMRecord>> alleleBin : stratifiedContext.entrySet() ) {
        final Allele binAllele = alleleBin.getKey();

        // pick the destination once per bin; a reference match takes precedence
        final List<Double> destination;
        if ( ref.equals(binAllele) )
            destination = refQuals;
        else if ( alts.contains(binAllele) )
            destination = altQuals;
        else
            continue;

        for ( final GATKSAMRecord read : alleleBin.getValue() )
            destination.add((double)read.getMappingQuality());
    }
}
protected void fillIndelQualsFromPileup(ReadBackedPileup pileup, List<Double> refQuals, List<Double> altQuals) {
// equivalent is whether indel likelihoods for reads corresponding to ref allele are more likely than reads corresponding to alt allele ?
HashMap<PileupElement,LinkedHashMap<Allele,Double>> indelLikelihoodMap = IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap();

View File

@ -30,14 +30,9 @@ public class MappingQualityZero extends InfoFieldAnnotation implements StandardA
int mq0 = 0;
for ( Map.Entry<String, AlignmentContext> sample : stratifiedContexts.entrySet() ) {
AlignmentContext context = sample.getValue();
ReadBackedPileup pileup = null;
if (context.hasExtendedEventPileup())
pileup = context.getExtendedEventPileup();
else if (context.hasBasePileup())
pileup = context.getBasePileup();
if (pileup != null) {
final AlignmentContext context = sample.getValue();
if ( context.hasBasePileup() ) {
final ReadBackedPileup pileup = context.getBasePileup();
for (PileupElement p : pileup ) {
if ( p.getMappingQual() == 0 )
mq0++;

View File

@ -53,14 +53,8 @@ public class MappingQualityZeroBySample extends GenotypeAnnotation {
return null;
int mq0 = 0;
ReadBackedPileup pileup = null;
if (vc.isIndel() && context.hasExtendedEventPileup())
pileup = context.getExtendedEventPileup();
else if (context.hasBasePileup())
pileup = context.getBasePileup();
else return null;
if (pileup != null) {
if ( context.hasBasePileup() ) {
final ReadBackedPileup pileup = context.getBasePileup();
for (PileupElement p : pileup ) {
if ( p.getMappingQual() == 0 )
mq0++;

View File

@ -31,13 +31,8 @@ public class MappingQualityZeroFraction extends InfoFieldAnnotation implements E
for ( Map.Entry<String, AlignmentContext> sample : stratifiedContexts.entrySet() ) {
AlignmentContext context = sample.getValue();
depth += context.size();
ReadBackedPileup pileup = null;
if (context.hasExtendedEventPileup())
pileup = context.getExtendedEventPileup();
else if (context.hasBasePileup())
pileup = context.getBasePileup();
if (pileup != null) {
if ( context.hasBasePileup() ) {
final ReadBackedPileup pileup = context.getBasePileup();
for (PileupElement p : pileup ) {
if ( p.getMappingQual() == 0 )
mq0++;

View File

@ -3,11 +3,14 @@ package org.broadinstitute.sting.gatk.walkers.annotator;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBasedAnnotation;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.GenotypesContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
@ -23,7 +26,7 @@ import java.util.Map;
* Low scores are indicative of false positive calls and artifacts. Note that QualByDepth requires sequencing
* reads associated with the samples with polymorphic genotypes.
*/
public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotation {
public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation {
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {
if ( stratifiedContexts.size() == 0 )
@ -45,7 +48,7 @@ public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotati
if ( context == null )
continue;
depth += context.hasBasePileup() ? context.getBasePileup().depthOfCoverage() : context.getExtendedEventPileup().depthOfCoverage();
depth += context.hasBasePileup() ? context.getBasePileup().depthOfCoverage() : 0;
}
if ( depth == 0 )
@ -62,4 +65,40 @@ public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotati
public List<VCFInfoHeaderLine> getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine(getKeyNames().get(0), 1, VCFHeaderLineType.Float, "Variant Confidence/Quality by Depth")); }
/**
 * Computes QualByDepth from per-sample allele bins.
 *
 * Depth is the number of reads, over all het or hom-var genotypes, that sit in any
 * allele bin other than the NO_CALL bin. The annotation value is
 * -10 * log10PError / depth, formatted with two decimals.
 *
 * @param stratifiedContexts sample name -> (allele -> reads assigned to that allele)
 * @param vc                 the variant being annotated
 * @return a single-entry map keyed by the first key name, or null when there are no
 *         contexts, no genotypes, or no reads to count
 */
public Map<String, Object> annotate(Map<String, Map<Allele, List<GATKSAMRecord>>> stratifiedContexts, VariantContext vc) {
    if ( stratifiedContexts.size() == 0 )
        return null;

    final GenotypesContext calls = vc.getGenotypes();
    if ( calls == null || calls.size() == 0 )
        return null;

    int variantDepth = 0;
    for ( final Genotype call : calls ) {
        // only het and hom-var genotypes contribute to the depth
        final boolean carriesVariant = call.isHet() || call.isHomVar();
        if ( !carriesVariant )
            continue;

        final Map<Allele, List<GATKSAMRecord>> readsByAllele = stratifiedContexts.get(call.getSampleName());
        if ( readsByAllele == null )
            continue;

        for ( final Map.Entry<Allele, List<GATKSAMRecord>> bin : readsByAllele.entrySet() ) {
            // reads in the NO_CALL bin are left out of the depth
            if ( bin.getKey().equals(Allele.NO_CALL) )
                continue;
            variantDepth += bin.getValue().size();
        }
    }

    if ( variantDepth == 0 )
        return null;

    final double qualByDepth = -10.0 * vc.getLog10PError() / (double)variantDepth;

    final Map<String, Object> result = new HashMap<String, Object>();
    result.put(getKeyNames().get(0), String.format("%.2f", qualByDepth));
    return result;
}
}

View File

@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.walkers.annotator;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBasedAnnotation;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation;
@ -13,6 +14,8 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.Arrays;
@ -24,7 +27,7 @@ import java.util.Map;
/**
* Root Mean Square of the mapping quality of the reads across all samples.
*/
public class RMSMappingQuality extends InfoFieldAnnotation implements StandardAnnotation {
public class RMSMappingQuality extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation {
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {
if ( stratifiedContexts.size() == 0 )
@ -34,18 +37,13 @@ public class RMSMappingQuality extends InfoFieldAnnotation implements StandardAn
for ( AlignmentContext context : stratifiedContexts.values() )
totalSize += context.size();
int[] qualities = new int[totalSize];
final int[] qualities = new int[totalSize];
int index = 0;
for ( Map.Entry<String, AlignmentContext> sample : stratifiedContexts.entrySet() ) {
AlignmentContext context = sample.getValue();
ReadBackedPileup pileup = null;
if (context.hasExtendedEventPileup())
pileup = context.getExtendedEventPileup();
else if (context.hasBasePileup())
pileup = context.getBasePileup();
if (pileup != null) {
if ( context.hasBasePileup() ) {
final ReadBackedPileup pileup = context.getBasePileup();
for (PileupElement p : pileup ) {
if ( p.getMappingQual() != QualityUtils.MAPPING_QUALITY_UNAVAILABLE )
qualities[index++] = p.getMappingQual();
@ -59,6 +57,34 @@ public class RMSMappingQuality extends InfoFieldAnnotation implements StandardAn
return map;
}
/**
 * Computes the RMS mapping quality from per-sample allele bins.
 *
 * Every read in every allele bin of every sample is considered; reads whose mapping
 * quality equals QualityUtils.MAPPING_QUALITY_UNAVAILABLE are left out of the RMS.
 *
 * @param stratifiedContexts sample name -> (allele -> reads assigned to that allele)
 * @param vc                 the variant being annotated (not consulted here)
 * @return a single-entry map keyed by the first key name with the RMS formatted to two
 *         decimals, or null when stratifiedContexts is empty
 */
public Map<String, Object> annotate(Map<String, Map<Allele, List<GATKSAMRecord>>> stratifiedContexts, VariantContext vc) {
    if ( stratifiedContexts.size() == 0 )
        return null;

    // upper bound on the number of usable qualities: one slot per read
    int depth = 0;
    for ( final Map<Allele, List<GATKSAMRecord>> alleleBins : stratifiedContexts.values() ) {
        for ( final List<GATKSAMRecord> reads : alleleBins.values() ) {
            depth += reads.size();
        }
    }

    final int[] qualities = new int[depth];
    int index = 0;

    for ( final Map<Allele, List<GATKSAMRecord>> alleleBins : stratifiedContexts.values() ) {
        for ( final List<GATKSAMRecord> reads : alleleBins.values() ) {
            for ( final GATKSAMRecord read : reads ) {
                if ( read.getMappingQuality() != QualityUtils.MAPPING_QUALITY_UNAVAILABLE )
                    qualities[index++] = read.getMappingQuality();
            }
        }
    }

    // Only the first `index` slots hold real values. Passing the whole array would count
    // every skipped read as a mapping quality of 0 and pull the RMS down, so trim first.
    final int[] usableQualities = Arrays.copyOf(qualities, index);

    final Map<String, Object> map = new HashMap<String, Object>();
    map.put(getKeyNames().get(0), String.format("%.2f", MathUtils.rms(usableQualities)));
    return map;
}
public List<String> getKeyNames() { return Arrays.asList(VCFConstants.RMS_MAPPING_QUALITY_KEY); }
public List<VCFInfoHeaderLine> getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine(getKeyNames().get(0), 1, VCFHeaderLineType.Float, "RMS Mapping Quality")); }

View File

@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.walkers.annotator;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBasedAnnotation;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation;
@ -12,6 +13,7 @@ import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.GenotypesContext;
@ -26,7 +28,7 @@ import java.util.Map;
/**
* Abstract root for all RankSum based annotations
*/
public abstract class RankSumTest extends InfoFieldAnnotation implements StandardAnnotation {
public abstract class RankSumTest extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation {
static final double INDEL_LIKELIHOOD_THRESH = 0.1;
static final boolean DEBUG = false;
@ -38,7 +40,6 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements Standar
if (genotypes == null || genotypes.size() == 0)
return null;
final ArrayList<Double> refQuals = new ArrayList<Double>();
final ArrayList<Double> altQuals = new ArrayList<Double>();
@ -62,12 +63,10 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements Standar
continue;
}
ReadBackedPileup pileup = null;
if (context.hasExtendedEventPileup())
pileup = context.getExtendedEventPileup();
else if (context.hasBasePileup())
pileup = context.getBasePileup();
if (!context.hasBasePileup())
continue;
final ReadBackedPileup pileup = context.getBasePileup();
if (pileup == null)
continue;
@ -106,12 +105,52 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements Standar
if (!Double.isNaN(testResults.first))
map.put(getKeyNames().get(0), String.format("%.3f", testResults.first));
return map;
}
protected abstract void fillQualsFromPileup(byte ref, List<Byte> alts, ReadBackedPileup pileup, List<Double> refQuals, List<Double> altQuals);
public Map<String, Object> annotate(Map<String, Map<Allele, List<GATKSAMRecord>>> stratifiedContexts, VariantContext vc) {
if (stratifiedContexts.size() == 0)
return null;
protected abstract void fillIndelQualsFromPileup(ReadBackedPileup pileup, List<Double> refQuals, List<Double> altQuals);
final GenotypesContext genotypes = vc.getGenotypes();
if (genotypes == null || genotypes.size() == 0)
return null;
final ArrayList<Double> refQuals = new ArrayList<Double>();
final ArrayList<Double> altQuals = new ArrayList<Double>();
for ( final Genotype genotype : genotypes.iterateInSampleNameOrder() ) {
final Map<Allele, List<GATKSAMRecord>> context = stratifiedContexts.get(genotype.getSampleName());
if ( context == null )
continue;
fillQualsFromPileup(vc.getReference(), vc.getAlternateAlleles(), vc.getStart(), context, refQuals, altQuals);
}
if ( refQuals.size() == 0 || altQuals.size() == 0 )
return null;
final MannWhitneyU mannWhitneyU = new MannWhitneyU();
for (final Double qual : altQuals) {
mannWhitneyU.add(qual, MannWhitneyU.USet.SET1);
}
for (final Double qual : refQuals) {
mannWhitneyU.add(qual, MannWhitneyU.USet.SET2);
}
// we are testing that set1 (the alt bases) have lower quality scores than set2 (the ref bases)
final Pair<Double, Double> testResults = mannWhitneyU.runOneSidedTest(MannWhitneyU.USet.SET1);
final Map<String, Object> map = new HashMap<String, Object>();
if (!Double.isNaN(testResults.first))
map.put(getKeyNames().get(0), String.format("%.3f", testResults.first));
return map;
}
protected abstract void fillQualsFromPileup(final Allele ref, final List<Allele> alts, final int refLoc, final Map<Allele, List<GATKSAMRecord>> stratifiedContext, final List<Double> refQuals, List<Double> altQuals);
protected abstract void fillQualsFromPileup(final byte ref, final List<Byte> alts, final ReadBackedPileup pileup, final List<Double> refQuals, final List<Double> altQuals);
protected abstract void fillIndelQualsFromPileup(final ReadBackedPileup pileup, final List<Double> refQuals, final List<Double> altQuals);
protected static boolean isUsableBase(final PileupElement p) {
return !(p.isInsertionAtBeginningOfRead() ||

View File

@ -1,207 +0,0 @@
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.annotator;
import org.broadinstitute.sting.commandline.Hidden;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation;
import org.broadinstitute.sting.utils.codecs.vcf.VCFFormatHeaderLine;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineCount;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
import org.broadinstitute.sting.utils.pileup.ExtendedEventPileupElement;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
* Unsupported
*/
@Hidden
public class ReadDepthAndAllelicFractionBySample extends GenotypeAnnotation {
private static String REF_ALLELE = "REF";
private static String DEL = "DEL"; // constant, for speed: no need to create a key string for deletion allele every time
/**
 * Dispatches the per-sample annotation by variant type.
 *
 * Uncalled or missing genotypes are not annotated. SNPs go to annotateSNP, indels to
 * annotateIndel, and every other variant type yields no annotation. tracker, walker
 * and ref are not used.
 *
 * @return the map produced by the type-specific annotator, or null
 */
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref,
                                    AlignmentContext stratifiedContext, VariantContext vc, Genotype g) {
    final boolean genotypeIsCalled = g != null && g.isCalled();
    if ( !genotypeIsCalled )
        return null;

    if ( vc.isSNP() )
        return annotateSNP(stratifiedContext, vc);

    return vc.isIndel() ? annotateIndel(stratifiedContext, vc) : null;
}
/**
 * Annotates a SNP with total read depth and per-alternate-allele fractions.
 *
 * The total pileup depth is always stored under the first key name. When at least one
 * read has non-zero mapping quality, the second key name receives one "%.3f" string per
 * alternate allele (in the variant's allele order): the number of non-MQ0 reads showing
 * that allele's first base, divided by the number of non-MQ0 reads.
 *
 * @return the annotation map, or null when the context has no base pileup
 */
private Map<String,Object> annotateSNP(AlignmentContext stratifiedContext, VariantContext vc) {
    if ( ! stratifiedContext.hasBasePileup() )
        return null;

    // one counter per alternate allele, keyed by the allele's first base
    final HashMap<Byte, Integer> altCounts = new HashMap<Byte, Integer>();
    for ( Allele altAllele : vc.getAlternateAlleles() )
        altCounts.put(altAllele.getBases()[0], 0);

    final ReadBackedPileup pileup = stratifiedContext.getBasePileup();
    final int totalDepth = pileup.getNumberOfElements();

    final Map<String, Object> result = new HashMap<String, Object>();
    result.put(getKeyNames().get(0), totalDepth); // total depth is reported unconditionally

    // no fraction can be computed at zero coverage
    if ( totalDepth == 0 )
        return result;

    int mq0 = 0; // number of reads that are actually mq0
    for ( PileupElement p : pileup ) {
        if ( p.getMappingQual() == 0 ) {
            mq0++;
        } else {
            // non-mq0 read: count it if its base is one of the alternate alleles
            final byte base = p.getBase();
            final Integer seenSoFar = altCounts.get(base);
            if ( seenSoFar != null )
                altCounts.put(base, seenSoFar + 1);
        }
    }

    // if every read is mq0 there is nothing left to divide by
    if ( mq0 == totalDepth )
        return result;

    // fractions are emitted in the order of the variant's alternate alleles
    final int informativeDepth = totalDepth - mq0;
    final String[] fracs = new String[altCounts.size()];
    final int numAlts = vc.getAlternateAlleles().size();
    for ( int i = 0; i < numAlts; i++ ) {
        final int altCount = altCounts.get(vc.getAlternateAllele(i).getBases()[0]);
        fracs[i] = String.format("%.3f", ((float)altCount)/informativeDepth);
    }

    result.put(getKeyNames().get(1), fracs);
    return result;
}
private Map<String,Object> annotateIndel(AlignmentContext
stratifiedContext, VariantContext
vc) {
if ( ! stratifiedContext.hasExtendedEventPileup() ) {
return null;
}
ReadBackedExtendedEventPileup pileup = stratifiedContext.getExtendedEventPileup();
if ( pileup == null )
return null;
int totalDepth = pileup.getNumberOfElements();
Map<String, Object> map = new HashMap<String, Object>();
map.put(getKeyNames().get(0), totalDepth); // put total depth in right away
if ( totalDepth == 0 ) return map;
int mq0 = 0; // number of "ref" reads that are acually mq0
HashMap<String, Integer> alleleCounts = new HashMap<String, Integer>();
Allele refAllele = vc.getReference();
for ( Allele allele : vc.getAlternateAlleles() ) {
if ( allele.isNoCall() ) {
continue; // this does not look so good, should we die???
}
alleleCounts.put(getAlleleRepresentation(allele), 0);
}
for ( ExtendedEventPileupElement e : pileup.toExtendedIterable() ) {
if ( e.getMappingQual() == 0 ) {
mq0++;
continue;
}
if ( e.isInsertion() ) {
final String b = e.getEventBases();
if ( alleleCounts.containsKey(b) ) {
alleleCounts.put(b, alleleCounts.get(b)+1);
}
} else {
if ( e.isDeletion() ) {
if ( e.getEventLength() == refAllele.length() ) {
// this is indeed the deletion allele recorded in VC
final String b = DEL;
if ( alleleCounts.containsKey(b) ) {
alleleCounts.put(b, alleleCounts.get(b)+1);
}
}
// else {
// System.out.print(" deletion of WRONG length found");
// }
}
}
}
if ( mq0 == totalDepth ) return map;
String[] fracs = new String[alleleCounts.size()];
for (int i = 0; i < vc.getAlternateAlleles().size(); i++)
fracs[i] = String.format("%.3f",
((float)alleleCounts.get(getAlleleRepresentation(vc.getAlternateAllele(i))))/(totalDepth-mq0));
map.put(getKeyNames().get(1), fracs);
//map.put(getKeyNames().get(0), counts);
return map;
}
private String getAlleleRepresentation(Allele allele) {
if ( allele.isNull() ) { // deletion wrt the ref
return DEL;
} else { // insertion, pass actual bases
return allele.getBaseString();
}
}
// public String getIndelBases()
public List<String> getKeyNames() { return Arrays.asList("DP","FA"); }
public List<VCFFormatHeaderLine> getDescriptions() {
return Arrays.asList(new VCFFormatHeaderLine(getKeyNames().get(0),
1,
VCFHeaderLineType.Integer,
"Total read depth per sample, including MQ0"),
new VCFFormatHeaderLine(getKeyNames().get(1),
VCFHeaderLineCount.UNBOUNDED,
VCFHeaderLineType.Float,
"Fractions of reads (excluding MQ0 from both ref and alt) supporting each reported alternative allele, per sample"));
}
}

View File

@ -11,15 +11,14 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import java.util.Arrays;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.*;
/**
* The phred-scaled p-value (u-based z-approximation) from the Mann-Whitney Rank Sum Test for the distance from the end of the read for reads with the alternate allele; if the alternate allele is only seen near the ends of reads this is indicative of error).
* The u-based z-approximation from the Mann-Whitney Rank Sum Test for the distance from the end of the read for reads with the alternate allele; if the alternate allele is only seen near the ends of reads this is indicative of error).
* Note that the read position rank sum test can not be calculated for homozygous sites.
*/
public class ReadPosRankSumTest extends RankSumTest {
@ -49,6 +48,31 @@ public class ReadPosRankSumTest extends RankSumTest {
}
}
/**
 * Collects, for every read assigned to the reference allele or to one of the alternate alleles,
 * a read-position value at refLoc and appends it to refQuals or altQuals respectively.
 *
 * The value is the offset returned by AlignmentUtils.calcAlignmentByteArrayOffset; when it is past
 * half of the read's aligned bases it is replaced by numAlignedBases - (value + 1).
 * Reads for which ReadUtils reports CLIPPING_GOAL_NOT_REACHED are skipped, as are allele bins that
 * match neither the reference nor any alternate allele.
 *
 * @param ref               the reference allele
 * @param alts              the alternate alleles
 * @param refLoc            the reference coordinate of interest
 * @param stratifiedContext reads grouped by the allele they were assigned to
 * @param refQuals          output list receiving values for reference-allele reads
 * @param altQuals          output list receiving values for alternate-allele reads
 */
protected void fillQualsFromPileup(final Allele ref, final List<Allele> alts, final int refLoc, final Map<Allele, List<GATKSAMRecord>> stratifiedContext, final List<Double> refQuals, final List<Double> altQuals) {
    for ( final Map.Entry<Allele, List<GATKSAMRecord>> entry : stratifiedContext.entrySet() ) {
        final Allele binAllele = entry.getKey();
        final boolean isRefBin = ref.equals(binAllele);
        final boolean isAltBin = alts.contains(binAllele);

        if ( isRefBin || isAltBin ) {
            // reference match takes precedence, exactly as in an if/else on the ref flag
            final List<Double> destination = isRefBin ? refQuals : altQuals;

            for ( final GATKSAMRecord record : entry.getValue() ) {
                final int readCoordinate = ReadUtils.getReadCoordinateForReferenceCoordinate( record.getUnclippedStart(), record.getCigar(), refLoc, ReadUtils.ClippingTail.RIGHT_TAIL, true );
                if ( readCoordinate != ReadUtils.CLIPPING_GOAL_NOT_REACHED ) {
                    final int alignedOffset = AlignmentUtils.calcAlignmentByteArrayOffset( record.getCigar(), readCoordinate, false, false, 0, 0 );
                    final int alignedBases = AlignmentUtils.getNumAlignedBases( record );

                    final int position = ( alignedOffset > alignedBases / 2 )
                            ? alignedBases - (alignedOffset + 1)
                            : alignedOffset;

                    destination.add((double) position);
                }
            }
        }
    }
}
protected void fillIndelQualsFromPileup(ReadBackedPileup pileup, List<Double> refQuals, List<Double> altQuals) {
// equivalent is whether indel likelihoods for reads corresponding to ref allele are more likely than reads corresponding to alt allele
// to classify a pileup element as ref or alt, we look at the likelihood associated with the allele associated to this element.

View File

@ -35,13 +35,8 @@ public class SpanningDeletions extends InfoFieldAnnotation implements StandardAn
int depth = 0;
for ( Map.Entry<String, AlignmentContext> sample : stratifiedContexts.entrySet() ) {
AlignmentContext context = sample.getValue();
ReadBackedPileup pileup = null;
if (context.hasExtendedEventPileup())
pileup = context.getExtendedEventPileup();
else if (context.hasBasePileup())
pileup = context.getBasePileup();
if (pileup != null) {
if ( context.hasBasePileup() ) {
final ReadBackedPileup pileup = context.getBasePileup();
deletions += pileup.getNumberOfDeletions();
depth += pileup.getNumberOfElements();
}

View File

@ -39,15 +39,9 @@ public class TechnologyComposition extends InfoFieldAnnotation implements Experi
for ( Map.Entry<String, AlignmentContext> sample : stratifiedContexts.entrySet() ) {
AlignmentContext context = sample.getValue();
ReadBackedPileup pileup = null;
if (context.hasExtendedEventPileup())
pileup = context.getExtendedEventPileup();
else if (context.hasBasePileup())
pileup = context.getBasePileup();
if (pileup != null) {
for (PileupElement p : pileup ) {
if ( context.hasBasePileup() ) {
final ReadBackedPileup pileup = context.getBasePileup();
for ( PileupElement p : pileup ) {
if(ReadUtils.is454Read(p.getRead()))
reads454++;
else if (ReadUtils.isSOLiDRead(p.getRead()))

View File

@ -39,7 +39,6 @@ import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.classloader.PluginManager;
import org.broadinstitute.sting.utils.codecs.vcf.*;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
import java.util.*;
@ -168,20 +167,14 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> implements Ann
protected Boolean ALWAYS_APPEND_DBSNP_ID = false;
public boolean alwaysAppendDbsnpId() { return ALWAYS_APPEND_DBSNP_ID; }
@Hidden
@Argument(fullName="vcfContainsOnlyIndels", shortName="dels",doc="Use if you are annotating an indel vcf, currently VERY experimental", required = false)
protected boolean indelsOnly = false;
@Argument(fullName="MendelViolationGenotypeQualityThreshold",shortName="mvq",required=false,doc="The genotype quality treshold in order to annotate mendelian violation ratio")
public double minGenotypeQualityP = 0.0;
@Argument(fullName="requireStrictAlleleMatch", shortName="strict", doc="If provided only comp tracks that exactly match both reference and alternate alleles will be counted as concordant", required=false)
private boolean requireStrictAlleleMatch = false;
protected boolean requireStrictAlleleMatch = false;
private VariantAnnotatorEngine engine;
private Collection<VariantContext> indelBufferContext;
private void listAnnotationsAndExit() {
System.out.println("\nStandard annotations in the list below are marked with a '*'.");
@ -240,7 +233,7 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> implements Ann
for ( VCFHeaderLine line : VCFUtils.getHeaderFields(getToolkit(), Arrays.asList(expression.binding.getName())) ) {
if ( line instanceof VCFInfoHeaderLine ) {
VCFInfoHeaderLine infoline = (VCFInfoHeaderLine)line;
if ( infoline.getName().equals(expression.fieldName) ) {
if ( infoline.getID().equals(expression.fieldName) ) {
targetHeaderLine = infoline;
break;
}
@ -261,10 +254,6 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> implements Ann
VCFHeader vcfHeader = new VCFHeader(hInfo, samples);
vcfWriter.writeHeader(vcfHeader);
if ( indelsOnly ) {
indelBufferContext = null;
}
}
public static boolean isUniqueHeaderLine(VCFHeaderLine line, Set<VCFHeaderLine> currentSet) {
@ -294,13 +283,6 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> implements Ann
*/
public boolean includeReadsWithDeletionAtLoci() { return true; }
/**
* We want to see extended events if annotating indels
*
* @return true
*/
public boolean generateExtendedEvents() { return indelsOnly; }
/**
* For each site of interest, annotate based on the requested annotation types
*
@ -322,31 +304,16 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> implements Ann
// if the reference base is not ambiguous, we can annotate
Map<String, AlignmentContext> stratifiedContexts;
if ( BaseUtils.simpleBaseToBaseIndex(ref.getBase()) != -1 ) {
if ( ! context.hasExtendedEventPileup() ) {
if ( context.hasBasePileup() ) {
stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(context.getBasePileup());
} else {
stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(context.getExtendedEventPileup());
}
if ( stratifiedContexts != null ) {
annotatedVCs = new ArrayList<VariantContext>(VCs.size());
for ( VariantContext vc : VCs )
annotatedVCs.add(engine.annotateContext(tracker, ref, stratifiedContexts, vc));
}
}
if ( ! indelsOnly ) {
for ( VariantContext annotatedVC : annotatedVCs )
vcfWriter.add(annotatedVC);
} else {
// check to see if the buffered context is different (in location) this context
if ( indelBufferContext != null && ! VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(),indelBufferContext.iterator().next()).equals(VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(),annotatedVCs.iterator().next())) ) {
for ( VariantContext annotatedVC : indelBufferContext )
vcfWriter.add(annotatedVC);
indelBufferContext = annotatedVCs;
} else {
indelBufferContext = annotatedVCs;
}
}
for ( VariantContext annotatedVC : annotatedVCs )
vcfWriter.add(annotatedVC);
return 1;
}

View File

@ -33,10 +33,8 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.*;
import org.broadinstitute.sting.utils.codecs.vcf.*;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.GenotypesContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.variantcontext.*;
import java.util.*;
@ -94,6 +92,13 @@ public class VariantAnnotatorEngine {
initializeDBs();
}
// experimental constructor for active region traversal
public VariantAnnotatorEngine(GenomeAnalysisEngine toolkit) {
this.walker = null;
this.toolkit = toolkit;
requestedInfoAnnotations = AnnotationInterfaceManager.createInfoFieldAnnotations(Arrays.asList("ActiveRegionBasedAnnotation"), Collections.<String>emptyList());
}
// select specific expressions to use
public void initializeExpressions(List<String> expressionsToUse) {
// set up the expressions
@ -169,7 +174,7 @@ public class VariantAnnotatorEngine {
this.requireStrictAlleleMatch = requireStrictAlleleMatch;
}
public VariantContext annotateContext(RefMetaDataTracker tracker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {
public VariantContext annotateContext(final RefMetaDataTracker tracker, final ReferenceContext ref, final Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {
Map<String, Object> infoAnnotations = new LinkedHashMap<String, Object>(vc.getAttributes());
// annotate db occurrences
@ -192,6 +197,20 @@ public class VariantAnnotatorEngine {
return builder.genotypes(annotateGenotypes(tracker, ref, stratifiedContexts, vc)).make();
}
/**
 * Annotates a variant using reads stratified by sample and then by allele (active region traversal).
 *
 * Starts from the variant's existing attributes and merges in the result of every requested INFO
 * annotation that implements ActiveRegionBasedAnnotation. Requested annotations that do not
 * implement that interface cannot consume this kind of context and are skipped instead of failing
 * with a ClassCastException.
 *
 * @param stratifiedContexts reads per sample, split by allele
 * @param vc                 the variant to annotate
 * @return a new VariantContext built from vc with the merged INFO attributes
 */
public VariantContext annotateContext(final Map<String, Map<Allele, List<GATKSAMRecord>>> stratifiedContexts, VariantContext vc) {
    Map<String, Object> infoAnnotations = new LinkedHashMap<String, Object>(vc.getAttributes());

    // go through all the requested info annotationTypes
    for ( InfoFieldAnnotation annotationType : requestedInfoAnnotations ) {
        if ( !(annotationType instanceof ActiveRegionBasedAnnotation) )
            continue;

        Map<String, Object> annotationsFromCurrentType = ((ActiveRegionBasedAnnotation)annotationType).annotate(stratifiedContexts, vc);
        if ( annotationsFromCurrentType != null )
            infoAnnotations.putAll(annotationsFromCurrentType);
    }

    // generate a new annotated VC
    return new VariantContextBuilder(vc).attributes(infoAnnotations).make();
}
private VariantContext annotateDBs(RefMetaDataTracker tracker, ReferenceContext ref, VariantContext vc, Map<String, Object> infoAnnotations) {
for ( Map.Entry<RodBinding<VariantContext>, String> dbSet : dbAnnotations.entrySet() ) {
if ( dbSet.getValue().equals(VCFConstants.DBSNP_KEY) ) {

View File

@ -0,0 +1,18 @@
package org.broadinstitute.sting.gatk.walkers.annotator.interfaces;
import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.List;
import java.util.Map;
/**
 * An annotation computed from reads that have been split by sample and then by allele.
 *
 * TODO -- make this an abstract class when we move away from InfoFieldAnnotation
 */
public interface ActiveRegionBasedAnnotation extends AnnotationType {

    /**
     * @param stratifiedContexts the contexts, split by sample and then by allele
     * @param vc                 the variant being annotated
     * @return annotations for the given contexts
     */
    Map<String, Object> annotate(Map<String, Map<Allele, List<GATKSAMRecord>>> stratifiedContexts, VariantContext vc);

    /**
     * @return the descriptions used for the VCF INFO meta field
     */
    List<VCFInfoHeaderLine> getDescriptions();
}

View File

@ -0,0 +1,83 @@
/*
* Copyright (c) 2011 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.bqsr;
import org.broadinstitute.sting.commandline.Gatherer;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.PrintStream;
import java.util.List;
/**
 * Gatherer that combines several recalibration report files into a single report, recomputes the
 * empirical and quantized qualities on the combined data, optionally generates recalibration plots,
 * and writes the combined report to the output file.
 *
 * User: carneiro
 * Date: 3/29/11
 */
public class BQSRGatherer extends Gatherer {

    private static final String EMPTY_INPUT_LIST = "list of inputs files is empty";
    private static final String MISSING_OUTPUT_FILE = "missing output file name";

    /**
     * Combines the input reports and writes the result to output.
     *
     * @param inputs the recalibration report files to combine
     * @param output the file to write the combined report to
     * @throws UserException.MissingArgument if the output file cannot be opened for writing
     * @throws ReviewedStingException        if inputs is empty
     */
    @Override
    public void gather(List<File> inputs, File output) {
        final PrintStream outputFile;
        try {
            outputFile = new PrintStream(output);
        } catch(FileNotFoundException e) {
            throw new UserException.MissingArgument("output", MISSING_OUTPUT_FILE);
        }

        // the stream is closed on every path, including the failures below, so the file handle
        // is released and buffered report data is flushed
        try {
            RecalibrationReport generalReport = null;
            for (File input : inputs) {
                RecalibrationReport inputReport = new RecalibrationReport(input);
                if (generalReport == null)
                    generalReport = inputReport;
                else
                    generalReport.combine(inputReport);
            }
            if (generalReport == null)
                throw new ReviewedStingException(EMPTY_INPUT_LIST);

            generalReport.calculateEmpiricalAndQuantizedQualities();

            RecalibrationArgumentCollection RAC = generalReport.getRAC();
            if (RAC.recalibrationReport != null && !RAC.NO_PLOTS) {
                // an original report was supplied: plot it against the combined report
                File recal_out = new File(output.getName() + ".original");
                RecalibrationReport originalReport = new RecalibrationReport(RAC.recalibrationReport);
                RecalDataManager.generateRecalibrationPlot(recal_out, originalReport.getKeysAndTablesMap(), generalReport.getKeysAndTablesMap(), RAC.KEEP_INTERMEDIATE_FILES);
            }
            else if (!RAC.NO_PLOTS) {
                File recal_out = new File(output.getName() + ".recal");
                RecalDataManager.generateRecalibrationPlot(recal_out, generalReport.getKeysAndTablesMap(), RAC.KEEP_INTERMEDIATE_FILES);
            }

            generalReport.output(outputFile);
        } finally {
            outputFile.close();
        }
    }
}

View File

@ -0,0 +1,341 @@
package org.broadinstitute.sting.gatk.walkers.bqsr;
import org.broadinstitute.sting.utils.BitSetUtils;
import java.util.*;
/**
* This class provides all the functionality for the BitSet representation of the keys to the hash table of BQSR
*
* It also handles the event type "covariate" which is not exactly a covariate, but is added as a key to the hashmap. The Key Manager will
* add the event type as a bitset to the end of the covariate bitset key. This way, it won't get in the way of masking the information
* out of the key for the actual covariates, and having the covariates handle it. The key manager handles the event type.
*
* The keys represented by this key manager will always have the same order:
*
* RequiredCovariate1, RequiredCovariate2, ..., RequiredCovariateN, OptionalCovariate1, OptionalCovariateID, EventType
* RequiredCovariate1, RequiredCovariate2, ..., RequiredCovariateN, OptionalCovariate2, OptionalCovariateID, EventType
* ...
* RequiredCovariate1, RequiredCovariate2, ..., RequiredCovariateN, OptionalCovariateN, OptionalCovariateID, EventType
*
*
* Note that Optional Covariates are optional, and the Key Manager should operate without them if necessary.
*
* @author Mauricio Carneiro
* @since 3/6/12
*/
public class BQSRKeyManager {
private final List<RequiredCovariateInfo> requiredCovariates;
private final List<OptionalCovariateInfo> optionalCovariates;
private final Map<String, Short> covariateNameToIDMap;
private int nRequiredBits; // Number of bits used to represent the required covariates
private int nOptionalBits; // Number of bits used to represent the standard covaraites
private final int nOptionalIDBits; // Number of bits used to represent the optional covariates IDs
private final int totalNumberOfBits; // Sum of all of the above plus the event bits
private final BitSet optionalCovariateMask; // Standard mask for optional covariates bitset
private final BitSet optionalCovariateIDMask; // Standard mask for optional covariates order bitset
/**
 * Builds the key manager from the covariates in use and lays out the bits of the combined key:
 * the required covariates first (each with its own offset and mask), then one shared slot wide
 * enough for the largest optional covariate, then the optional covariate ID, then the event type.
 *
 * @param requiredCovariates the ordered list of required covariates
 * @param optionalCovariates the ordered list of optional covariates (may be empty)
 */
public BQSRKeyManager(List<Covariate> requiredCovariates, List<Covariate> optionalCovariates) {
    final int optionalCount = optionalCovariates.size();

    this.requiredCovariates = new ArrayList<RequiredCovariateInfo>(requiredCovariates.size());
    this.optionalCovariates = new ArrayList<OptionalCovariateInfo>(optionalCount);                  // size may be 0, it's okay
    this.covariateNameToIDMap = new HashMap<String, Short>(optionalCount * 2);                      // when reading GATK Reports, we get the IDs as names of covariates

    // required covariates are packed one after another; each records the bits before it and its mask
    int requiredBitCount = 0;
    for (final Covariate requiredCovariate : requiredCovariates) {
        final int width = requiredCovariate.numberOfBits();
        final BitSet covariateMask = genericMask(requiredBitCount, width);
        this.requiredCovariates.add(new RequiredCovariateInfo(requiredBitCount, covariateMask, requiredCovariate));
        requiredBitCount += width;
    }
    nRequiredBits = requiredBitCount;

    // optional covariates all share one slot sized for the widest of them, and get sequential IDs
    int widestOptional = 0;
    short nextID = 0;
    for (final Covariate optionalCovariate : optionalCovariates) {
        widestOptional = Math.max(widestOptional, optionalCovariate.numberOfBits());
        final BitSet encodedID = BitSetUtils.bitSetFrom(nextID);
        this.optionalCovariates.add(new OptionalCovariateInfo(encodedID, optionalCovariate));

        // the class name up to "Covariate" is the name used to match entries of the GATKReport
        final String shortName = optionalCovariate.getClass().getSimpleName().split("Covariate")[0];
        this.covariateNameToIDMap.put(shortName, nextID);
        nextID++;
    }
    nOptionalBits = widestOptional;

    nOptionalIDBits = BitSetUtils.numberOfBitsToRepresent(optionalCount);
    optionalCovariateMask = genericMask(nRequiredBits, nOptionalBits);
    optionalCovariateIDMask = genericMask(nRequiredBits + nOptionalBits, nOptionalIDBits);
    totalNumberOfBits = nRequiredBits + nOptionalBits + nOptionalIDBits + bitsInEventType();
}
/**
 * Generates one combined key per optional covariate.
 *
 * Every returned key holds all required covariates, one optional covariate together with its ID,
 * and the event type.
 *
 * Example allKeys:
 * RG, QUAL, CYCLE, CONTEXT
 *
 * List of BitSets returned by this example (given eventType):
 * RG, QUAL, CYCLE, EVENT
 * RG, QUAL, CONTEXT, EVENT
 *
 * Optional covariates whose entry in allKeys is null produce no key. When the manager has no
 * optional covariates at all, a single key with the required covariates and the event type is returned.
 *
 * @param allKeys   the keys in bitset representation for each covariate, required ones first
 * @param eventType the type of event described by this keyset (e.g. mismatches, insertions, deletions)
 * @return the list of combined keys in bitset representation
 */
public List<BitSet> bitSetsFromAllKeys(BitSet[] allKeys, EventType eventType) {
    final BitSet eventBits = BitSetUtils.bitSetFrom(eventType.index);
    final int idOffset = nRequiredBits + nOptionalBits;            // where the optional covariate ID starts
    final int eventOffset = idOffset + nOptionalIDBits;            // where the event type starts
    final List<BitSet> combinedKeys = new LinkedList<BitSet>();

    // assemble the part of the key shared by every returned key
    final int requiredCount = requiredCovariates.size();
    final BitSet requiredBits = new BitSet(nRequiredBits);
    for (int i = 0; i < requiredCount; i++)
        addBitSetToKeyAtLocation(requiredBits, allKeys[i], requiredCovariates.get(i).bitsBefore);

    // no optional covariates: the required key plus the event type is the only key
    if (optionalCovariates.size() == 0) {
        addBitSetToKeyAtLocation(requiredBits, eventBits, eventOffset);
        combinedKeys.add(requiredBits);
        return combinedKeys;
    }

    final int optionalCount = optionalCovariates.size();
    for (int j = 0; j < optionalCount; j++) {
        final BitSet optionalBits = allKeys[requiredCount + j];
        if (optionalBits != null) {                                // nulls never make it into the result
            final BitSet fullKey = new BitSet(totalNumberOfBits);
            fullKey.or(requiredBits);
            addBitSetToKeyAtLocation(fullKey, optionalBits, nRequiredBits);
            addBitSetToKeyAtLocation(fullKey, optionalCovariates.get(j).covariateID, idOffset);
            addBitSetToKeyAtLocation(fullKey, eventBits, eventOffset);
            combinedKeys.add(fullKey);
        }
    }
    return combinedKeys;
}
/**
 * Generates the single combined bitset key for the covariate values held in key.
 *
 * The array holds the objects produced by the covariates (probably read from the recalibration
 * data file): one value per required covariate, then - only if this manager has optional
 * covariates - one optional covariate value followed by its ID, and finally the event type as the
 * last element.
 *
 * Example key:
 * RG, QUAL, CYCLE, CYCLE_ID, EventType
 *
 * @param key the objects produced by the required covariates, followed by one or zero optional covariates, followed by the event type
 * @return a bitset key representing these objects; each value is encoded using its covariate's interface
 */
public BitSet bitSetFromKey(Object[] key) {
    final BitSet combined = new BitSet(totalNumberOfBits);

    final int requiredCount = requiredCovariates.size();
    for (int i = 0; i < requiredCount; i++) {
        final RequiredCovariateInfo required = requiredCovariates.get(i);
        final BitSet encoded = required.covariate.bitSetFromKey(key[i]);
        addBitSetToKeyAtLocation(combined, encoded, required.bitsBefore);
    }

    if (optionalCovariates.size() > 0) {
        final int valueIndex = requiredCount;                      // the optional value sits right after the required ones
        final int idIndex = valueIndex + 1;                        // and its ID right after the value
        final int id = parseCovariateID(key[idIndex]);             // the ID may come in as a name when read from a GATK Report
        final OptionalCovariateInfo optional = optionalCovariates.get(id);
        final BitSet encoded = optional.covariate.bitSetFromKey(key[valueIndex]);
        addBitSetToKeyAtLocation(combined, encoded, nRequiredBits);
        addBitSetToKeyAtLocation(combined, optional.covariateID, nRequiredBits + nOptionalBits);
    }

    // the event type is always the last element of the array and the last field of the key
    final Object eventObject = key[key.length - 1];
    final int eventOffset = nRequiredBits + nOptionalBits + nOptionalIDBits;
    final BitSet eventBits = bitSetFromEvent((EventType) eventObject);
    addBitSetToKeyAtLocation(combined, eventBits, eventOffset);

    return combined;
}
/**
 * Covariate id can be either the covariate name (String) or the actual id (Short). This method
 * finds its type and converts accordingly to the short notation.
 *
 * @param id the string or short representation of the optional covariate id
 * @return the short representation of the optional covariate id
 * @throws IllegalArgumentException if id is a name that matches no optional covariate known to this key manager
 */
private short parseCovariateID(Object id) {
    if (id instanceof String) {
        final Short mappedID = covariateNameToIDMap.get(id.toString());
        // an unknown name would otherwise surface as a message-less NullPointerException on unboxing
        if (mappedID == null)
            throw new IllegalArgumentException("Unknown optional covariate name: " + id);
        return mappedID;
    }
    return (Short) id;
}
/**
 * Decodes a combined bitset key back into the list of objects it was built from.
 *
 * Each field is masked out of the key independently and decoded through its covariate. The result
 * holds one value per required covariate, then - only if this manager has optional covariates - the
 * optional covariate value followed by the covariate's name (class simple name up to "Covariate"),
 * and finally the event type.
 *
 * @param key the bitset representation of the keys
 * @return the decoded values, in the order described above
 */
public List<Object> keySetFrom(BitSet key) {
    final List<Object> decoded = new ArrayList<Object>();

    for (final RequiredCovariateInfo required : requiredCovariates) {
        final BitSet requiredBits = extractBitSetFromKey(key, required.mask, required.bitsBefore);
        final Object value = required.covariate.keyFromBitSet(requiredBits);
        decoded.add(value);
    }

    if (optionalCovariates.size() > 0) {
        final BitSet valueBits = extractBitSetFromKey(key, optionalCovariateMask, nRequiredBits);
        final BitSet idBits = extractBitSetFromKey(key, optionalCovariateIDMask, nRequiredBits + nOptionalBits);
        final short optionalID = BitSetUtils.shortFrom(idBits);                // identifies which optional covariate this key carries
        final Covariate optional = optionalCovariates.get(optionalID).covariate;
        decoded.add(optional.keyFromBitSet(valueBits));
        final String optionalName = optional.getClass().getSimpleName().split("Covariate")[0];
        decoded.add(optionalName);
    }

    decoded.add(eventFromBitSet(key));
    return decoded;
}
/**
 * @return a new list holding the required covariates, in the order they were given to this key manager
 */
public List<Covariate> getRequiredCovariates() {
    final int count = requiredCovariates.size();
    final ArrayList<Covariate> covariates = new ArrayList<Covariate>(count);
    for (int i = 0; i < count; i++)
        covariates.add(requiredCovariates.get(i).covariate);
    return covariates;
}
/**
 * @return a new list holding the optional covariates, in the order they were given to this key manager
 */
public List<Covariate> getOptionalCovariates() {
    final int count = optionalCovariates.size();
    final ArrayList<Covariate> covariates = new ArrayList<Covariate>(count);
    for (int i = 0; i < count; i++)
        covariates.add(optionalCovariates.get(i).covariate);
    return covariates;
}
/**
 * Translates a masked bitset into a bitset starting at 0 by moving every set bit n positions down.
 *
 * @param key the masked out bitset
 * @param n   the number of bits to chop
 * @return a new, translated bitset for the covariate machinery to decode
 */
private BitSet chopNBitsFrom(BitSet key, int n) {
    final BitSet shifted = new BitSet();
    int bit = key.nextSetBit(0);
    while (bit >= 0) {
        shifted.set(bit - n);
        bit = key.nextSetBit(bit + 1);
    }
    return shifted;
}
/**
 * Creates a mask selecting one field of a combined bitset key.
 *
 * @param leadingBits the number of bits that precede the field in the combined key
 * @param nBits       the number of bits occupied by the field
 * @return a bitset with exactly the bits leadingBits (inclusive) to leadingBits + nBits (exclusive) set
 */
private BitSet genericMask(int leadingBits, int nBits) {
    final int endExclusive = leadingBits + nBits;
    final BitSet fieldMask = new BitSet(endExclusive);
    fieldMask.set(leadingBits, endExclusive);
    return fieldMask;
}
/**
 * Decodes the event type (enum) from the full bitset key.
 *
 * The event type occupies the bits after the required covariates, the optional covariate and the
 * optional covariate ID; those bits are moved down to position 0 before decoding.
 *
 * @param fullKey the full key of all covariates + event type
 * @return the decoded event type
 */
private EventType eventFromBitSet(BitSet fullKey) {
    final int eventOffset = nRequiredBits + nOptionalBits + nOptionalIDBits;
    final BitSet eventBits = new BitSet();
    int bit = fullKey.nextSetBit(eventOffset);
    while (bit >= 0) {
        eventBits.set(bit - eventOffset);
        bit = fullKey.nextSetBit(bit + 1);
    }
    final short eventIndex = BitSetUtils.shortFrom(eventBits);
    return EventType.eventFrom(eventIndex);
}
/**
 * Encodes an event type as a bitset built from its numeric index.
 *
 * @param eventType the event type to encode
 * @return the result of BitSetUtils.bitSetFrom applied to the event's index
 */
private BitSet bitSetFromEvent(EventType eventType) {
    final int eventIndex = eventType.index;
    return BitSetUtils.bitSetFrom(eventIndex);
}
/**
 * @return the value BitSetUtils.numberOfBitsToRepresent yields for the number of EventType constants
 */
private int bitsInEventType() {
    final int nEventTypes = EventType.values().length;
    return BitSetUtils.numberOfBitsToRepresent(nEventTypes);
}
/**
 * Copies the set bits of bitSet into key, offset by location. Bits already set in key are left untouched.
 *
 * @param key      the full key being assembled (modified in place)
 * @param bitSet   the bits to add
 * @param location the index in key that bit 0 of bitSet maps to
 */
private void addBitSetToKeyAtLocation(BitSet key, BitSet bitSet, int location) {
    int bit = bitSet.nextSetBit(0);
    while (bit >= 0) {
        key.set(location + bit);            // translate the bit to its position in the full key
        bit = bitSet.nextSetBit(bit + 1);
    }
}
/**
 * Pulls one covariate's bits out of a combined key: the key is copied, masked, and the
 * surviving bits are shifted down by leadingBits.
 *
 * @param key         the combined key (left unmodified)
 * @param mask        the mask selecting the bits of interest
 * @param leadingBits the number of positions to shift the masked bits toward index 0
 * @return a new bitset holding the masked bits, translated by chopNBitsFrom
 */
private BitSet extractBitSetFromKey (BitSet key, BitSet mask, int leadingBits) {
    final BitSet masked = new BitSet();
    masked.or(key);                         // private copy so the caller's key is not altered
    masked.and(mask);
    return chopNBitsFrom(masked, leadingBits);
}
/**
 * Two key managers are equal when they hold the same number of required and optional covariates and,
 * position by position, the covariates' classes have the same simple name.
 *
 * NOTE(review): no hashCode() override is visible in this part of the class; confirm one exists so that
 * equal managers also hash alike.
 *
 * @param o the object to compare against
 * @return true if o is a BQSRKeyManager with matching covariate class names in the same order
 */
@Override
public boolean equals(Object o) {
    if (!(o instanceof BQSRKeyManager))
        return false;

    final BQSRKeyManager that = (BQSRKeyManager) o;
    if (that == this)
        return true;

    final boolean sameSizes = requiredCovariates.size() == that.requiredCovariates.size()
            && optionalCovariates.size() == that.optionalCovariates.size();
    if (!sameSizes)
        return false;

    // walk both required lists in lockstep
    final Iterator<RequiredCovariateInfo> myRequired = requiredCovariates.iterator();
    final Iterator<RequiredCovariateInfo> theirRequired = that.requiredCovariates.iterator();
    while (myRequired.hasNext()) {
        final String mine = myRequired.next().covariate.getClass().getSimpleName();
        final String theirs = theirRequired.next().covariate.getClass().getSimpleName();
        if (!mine.equals(theirs))
            return false;
    }

    // then both optional lists
    final Iterator<OptionalCovariateInfo> myOptional = optionalCovariates.iterator();
    final Iterator<OptionalCovariateInfo> theirOptional = that.optionalCovariates.iterator();
    while (myOptional.hasNext()) {
        final String mine = myOptional.next().covariate.getClass().getSimpleName();
        final String theirs = theirOptional.next().covariate.getClass().getSimpleName();
        if (!mine.equals(theirs))
            return false;
    }

    return true;
}
/**
 * Aggregate information for each required Covariate: its position in the combined key, the mask that
 * selects its bits, and the covariate object itself.
 */
class RequiredCovariateInfo {
    /** number of bits before this covariate in the combined bitset key */
    public final int bitsBefore;

    /** the mask to pull out this covariate from the combined bitset key (a mask made from bitsBefore and nBits) */
    public final BitSet mask;

    /** this allows reverse lookup of the Covariates in order */
    public final Covariate covariate;

    RequiredCovariateInfo(int bitsBefore, BitSet mask, Covariate covariate) {
        this.covariate = covariate;
        this.mask = mask;
        this.bitsBefore = bitsBefore;
    }
}
/**
 * Pairs an optional Covariate with its cached ID bitset.
 */
class OptionalCovariateInfo {
    /** cache the covariate ID */
    public final BitSet covariateID;

    /** the optional covariate this entry describes */
    public final Covariate covariate;

    OptionalCovariateInfo(BitSet covariateID, Covariate covariate) {
        this.covariate = covariate;
        this.covariateID = covariateID;
    }
}
}

View File

@ -26,7 +26,9 @@
package org.broadinstitute.sting.gatk.walkers.bqsr;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.BitSetUtils;
import org.broadinstitute.sting.utils.clipping.ClippingRepresentation;
import org.broadinstitute.sting.utils.clipping.ReadClipper;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
@ -43,7 +45,9 @@ public class ContextCovariate implements StandardCovariate {
private int mismatchesContextSize;
private int insertionsContextSize;
private int deletionsContextSize;
private int deletionsContextSize;
private byte LOW_QUAL_TAIL;
// Initialize any member variables using the command-line arguments passed to the walkers
@Override
@ -52,18 +56,22 @@ public class ContextCovariate implements StandardCovariate {
insertionsContextSize = RAC.INSERTIONS_CONTEXT_SIZE;
deletionsContextSize = RAC.DELETIONS_CONTEXT_SIZE;
LOW_QUAL_TAIL = RAC.LOW_QUAL_TAIL;
if (mismatchesContextSize <= 0 || insertionsContextSize <= 0 || deletionsContextSize <= 0)
throw new UserException(String.format("Context Size must be positive, if you don't want to use the context covariate, just turn it off instead. Mismatches: %d Insertions: %d Deletions:%d", mismatchesContextSize, insertionsContextSize, deletionsContextSize));
}
@Override
public CovariateValues getValues(final GATKSAMRecord read) {
public CovariateValues getValues(GATKSAMRecord read) {
int l = read.getReadLength();
BitSet[] mismatches = new BitSet[l];
BitSet[] insertions = new BitSet[l];
BitSet[] deletions = new BitSet[l];
BitSet[] deletions = new BitSet[l];
read = ReadClipper.clipLowQualEnds(read, LOW_QUAL_TAIL, ClippingRepresentation.WRITE_NS); // Write N's over the low quality tail of the reads to avoid adding them into the context
final boolean negativeStrand = read.getReadNegativeStrandFlag();
byte[] bases = read.getReadBases();
if (negativeStrand)
@ -72,7 +80,7 @@ public class ContextCovariate implements StandardCovariate {
for (int i = 0; i < read.getReadLength(); i++) {
mismatches[i] = contextWith(bases, i, mismatchesContextSize);
insertions[i] = contextWith(bases, i, insertionsContextSize);
deletions[i] = contextWith(bases, i, deletionsContextSize);
deletions[i] = contextWith(bases, i, deletionsContextSize);
}
if (negativeStrand) {
@ -89,24 +97,41 @@ public class ContextCovariate implements StandardCovariate {
return str;
}
/**
 * Converts a context key to its DNA string via BitSetUtils.dnaFrom.
 *
 * @param key the bitset representation of the context, may be null
 * @return the string form of the key, or null when key is null
 */
@Override
public String keyFromBitSet(BitSet key) {
    // a null key can only happen in test routines because null keys are not propagated to the csv file
    return (key == null) ? null : BitSetUtils.dnaFrom(key);
}
/**
 * Converts a context key given as a String into its bitset form.
 *
 * @param key the context; must be a String (any other type fails the cast)
 * @return the result of BitSetUtils.bitSetFrom applied to the string
 */
@Override
public BitSet bitSetFromKey(Object key) {
    final String context = (String) key;
    return BitSetUtils.bitSetFrom(context);
}
/**
 * @return the number of bits in a long (64)
 */
@Override
public int numberOfBits() {
    // Long.SIZE is the named constant for what Long.bitCount(-1L) computed: the width of a long in bits
    return Long.SIZE;
}
/**
* calculates the context of a base independent of the covariate mode
* calculates the context of a base independent of the covariate mode (mismatch, insertion or deletion)
*
* @param bases the bases in the read to build the context from
* @param offset the position in the read to calculate the context for
* @param contextSize context size to use building the context
* @return
* @param bases the bases in the read to build the context from
* @param offset the position in the read to calculate the context for
* @param contextSize context size to use building the context
* @return the bitSet representing the Context
*/
private BitSet contextWith(byte [] bases, int offset, int contextSize) {
// no context is produced when fewer than contextSize bases precede the offset
if (offset < contextSize)
return null;
// the context is the contextSize bases immediately before offset; the base at offset itself is excluded
String context = new String(Arrays.copyOfRange(bases, offset - contextSize, offset));
// a context that contains an N is rejected
if (context.contains("N"))
return null;
return MathUtils.bitSetFrom(context);
}
private BitSet contextWith(byte[] bases, int offset, int contextSize) {
    // the context is the contextSize bases ending at (and including) the base at offset
    final int start = offset - contextSize + 1;
    if (start < 0)
        return null;                        // not enough bases before the offset to fill the context

    final String context = new String(Arrays.copyOfRange(bases, start, offset + 1));
    if (context.contains("N"))
        return null;                        // contexts containing an N are not encoded

    return BitSetUtils.bitSetFrom(context);
}
/**
* Reverses the given array in place.

View File

@ -2,6 +2,8 @@ package org.broadinstitute.sting.gatk.walkers.bqsr;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.util.BitSet;
/*
* Copyright (c) 2009 The Broad Institute
*
@ -53,7 +55,40 @@ public interface Covariate {
*/
public CovariateValues getValues(GATKSAMRecord read);
public Object getValue(String str); // Used to get the covariate's value from input csv file during on-the-fly recalibration
/**
* Used to get the covariate's value from input csv file during on-the-fly recalibration
*
* @param str the key in string type (read from the csv)
* @return the key in its correct type.
*/
public Object getValue(String str);
/**
* Converts the bitset representation of the key (used internally for table indexing) to String format for file output.
*
* @param key the bitset representation of the key
* @return a string representation of the key
*/
public String keyFromBitSet(BitSet key);
/**
* Converts a key into a bitset
*
* Only necessary for on-the-fly recalibration when you have the object, but need to store it in memory in bitset format. For counting covariates
* the getValues method already returns all values in BitSet format.
*
* @param key the object corresponding to the covariate
* @return a bitset representation of the object
*/
public BitSet bitSetFromKey(Object key);
/**
* Each covariate should determine how many bits are necessary to encode its data
*
* @return The number of bits used to represent the values of this covariate.
*/
public int numberOfBits();
}
// Marker sub-interface: declares no members beyond those inherited from Covariate.
interface RequiredCovariate extends Covariate {}

View File

@ -1,88 +0,0 @@
package org.broadinstitute.sting.gatk.walkers.bqsr;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
/**
 * The object temporarily held by a read that describes all of its covariates.
 *
 * Holds one table per error model (mismatches, insertions, deletions), indexed as
 * [read position][covariate]. Every row has one slot per covariate plus a final slot that holds the
 * name of the error model ("M", "I" or "D").
 *
 * @author Mauricio Carneiro
 * @since 2/8/12
 */
public class CovariateKeySet {
    private Object[][] mismatchesKeySet;
    private Object[][] insertionsKeySet;
    private Object[][] deletionsKeySet;

    private int nextCovariateIndex;         // column filled by the next call to addCovariate

    private static final String mismatchesCovariateName = "M";
    private static final String insertionsCovariateName = "I";
    private static final String deletionsCovariateName = "D";

    /**
     * Allocates the three key tables and writes the error model name into the last column of every row.
     *
     * @param readLength         number of rows (read positions); zero is allowed and yields empty tables
     * @param numberOfCovariates number of covariates that will be added with addCovariate
     */
    public CovariateKeySet(int readLength, int numberOfCovariates) {
        numberOfCovariates++;               // +1 because we are adding the mismatch covariate (to comply with the molten table format)
        this.mismatchesKeySet = new Object[readLength][numberOfCovariates];
        this.insertionsKeySet = new Object[readLength][numberOfCovariates];
        this.deletionsKeySet = new Object[readLength][numberOfCovariates];

        initializeCovariateKeySet(this.mismatchesKeySet, mismatchesCovariateName);
        initializeCovariateKeySet(this.insertionsKeySet, insertionsCovariateName);
        initializeCovariateKeySet(this.deletionsKeySet, deletionsCovariateName);

        this.nextCovariateIndex = 0;
    }

    /**
     * Stores one covariate's per-position values in the next free column of each table.
     *
     * @param covariate the mismatch, insertion and deletion values of a single covariate
     */
    public void addCovariate(CovariateValues covariate) {
        transposeCovariateValues(mismatchesKeySet, covariate.getMismatches());
        transposeCovariateValues(insertionsKeySet, covariate.getInsertions());
        transposeCovariateValues(deletionsKeySet, covariate.getDeletions());
        nextCovariateIndex++;
    }

    /**
     * Maps an error model name ("M", "I" or "D") to its recalibration type.
     *
     * @param modelString the error model name
     * @return the matching recalibration type
     * @throws ReviewedStingException if the name is not one of the three known names
     */
    public static RecalDataManager.BaseRecalibrationType getErrorModelFromString(final String modelString) {
        if (modelString.equals(mismatchesCovariateName))
            return RecalDataManager.BaseRecalibrationType.BASE_SUBSTITUTION;
        else if (modelString.equals(insertionsCovariateName))
            return RecalDataManager.BaseRecalibrationType.BASE_INSERTION;
        else if (modelString.equals(deletionsCovariateName))
            return RecalDataManager.BaseRecalibrationType.BASE_DELETION;
        throw new ReviewedStingException("Unrecognized Base Recalibration model string: " + modelString);
    }

    /**
     * Returns the row of keys for one read position under the given error model.
     *
     * @param readPosition the position in the read
     * @param errorModel   which of the three tables to read from
     * @return the key row (the backing array, not a copy)
     * @throws ReviewedStingException if the error model is not one of the three handled values
     */
    public Object[] getKeySet(final int readPosition, final RecalDataManager.BaseRecalibrationType errorModel) {
        switch (errorModel) {
            case BASE_SUBSTITUTION:
                return getMismatchesKeySet(readPosition);
            case BASE_INSERTION:
                return getInsertionsKeySet(readPosition);
            case BASE_DELETION:
                return getDeletionsKeySet(readPosition);
            default:
                throw new ReviewedStingException("Unrecognized Base Recalibration type: " + errorModel );
        }
    }

    public Object[] getMismatchesKeySet(int readPosition) {
        return mismatchesKeySet[readPosition];
    }

    public Object[] getInsertionsKeySet(int readPosition) {
        return insertionsKeySet[readPosition];
    }

    public Object[] getDeletionsKeySet(int readPosition) {
        return deletionsKeySet[readPosition];
    }

    // Writes covariateValues[i] into column nextCovariateIndex of row i.
    private void transposeCovariateValues (Object [][] keySet, Object [] covariateValues) {
        for (int i=0; i<covariateValues.length; i++)
            keySet[i][nextCovariateIndex] = covariateValues[i];
    }

    // Writes covariateName into the last column of every row. Iterating over the rows (instead of
    // reading keySet[0] up front) keeps a zero-length read from throwing ArrayIndexOutOfBoundsException.
    private void initializeCovariateKeySet (Object[][] keySet, String covariateName) {
        for (Object[] row : keySet)
            row[row.length - 1] = covariateName;
    }
}

View File

@ -1,5 +1,7 @@
package org.broadinstitute.sting.gatk.walkers.bqsr;
import java.util.BitSet;
/**
* An object to hold the different covariate values for all bases in the read.
*
@ -12,25 +14,25 @@ package org.broadinstitute.sting.gatk.walkers.bqsr;
* @since 2/8/12
*/
public class CovariateValues {
private Object[] mismatches;
private Object[] insertions;
private Object[] deletions;
private final BitSet[] mismatches;
private final BitSet[] insertions;
private final BitSet[] deletions;
public CovariateValues(Object[] mismatch, Object[] insertion, Object[] deletion) {
public CovariateValues(BitSet[] mismatch, BitSet[] insertion, BitSet[] deletion) {
this.mismatches = mismatch;
this.insertions = insertion;
this.deletions = deletion;
}
public Object[] getMismatches() {
public BitSet[] getMismatches() {
return mismatches;
}
public Object[] getInsertions() {
public BitSet[] getInsertions() {
return insertions;
}
public Object[] getDeletions() {
public BitSet[] getDeletions() {
return deletions;
}

View File

@ -1,10 +1,12 @@
package org.broadinstitute.sting.gatk.walkers.bqsr;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.BitSetUtils;
import org.broadinstitute.sting.utils.NGSPlatform;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.util.BitSet;
import java.util.EnumSet;
/*
@ -59,48 +61,27 @@ public class CycleCovariate implements StandardCovariate {
// Used to pick out the covariate's value from attributes of the read
@Override
public CovariateValues getValues(final GATKSAMRecord read) {
Integer [] cycles = new Integer[read.getReadLength()];
BitSet[] cycles = new BitSet[read.getReadLength()];
final NGSPlatform ngsPlatform = read.getNGSPlatform();
// Discrete cycle platforms
if (DISCRETE_CYCLE_PLATFORMS.contains(ngsPlatform)) {
final int init;
final int increment;
if (!read.getReadNegativeStrandFlag()) {
// Differentiate between first and second of pair.
// The sequencing machine cycle keeps incrementing for the second read in a pair. So it is possible for a read group
// to have an error affecting quality at a particular cycle on the first of pair which carries over to the second of pair.
// Therefore the cycle covariate must differentiate between first and second of pair reads.
// This effect can not be corrected by pulling out the first of pair and second of pair flags into a separate covariate because
// the current sequential model would consider the effects independently instead of jointly.
if (read.getReadPairedFlag() && read.getSecondOfPairFlag()) {
//second of pair, positive strand
init = -1;
increment = -1;
}
else {
//first of pair, positive strand
init = 1;
increment = 1;
}
final short readOrderFactor = read.getReadPairedFlag() && read.getSecondOfPairFlag() ? (short) -1 : 1;
final short increment;
short cycle;
if (read.getReadNegativeStrandFlag()) {
cycle = (short) (read.getReadLength() * readOrderFactor);
increment = (short) (-1 * readOrderFactor);
}
else {
if (read.getReadPairedFlag() && read.getSecondOfPairFlag()) {
//second of pair, negative strand
init = -read.getReadLength();
increment = 1;
}
else {
//first of pair, negative strand
init = read.getReadLength();
increment = -1;
}
cycle = readOrderFactor;
increment = readOrderFactor;
}
int cycle = init;
for (int i = 0; i < read.getReadLength(); i++) {
cycles[i] = cycle;
final int CUSHION = 4;
final int MAX_CYCLE = read.getReadLength() - CUSHION - 1;
for (int i = 0; i < MAX_CYCLE; i++) {
cycles[i] = (i<CUSHION || i>MAX_CYCLE) ? null : BitSetUtils.bitSetFrom(cycle);
cycle += increment;
}
}
@ -119,7 +100,7 @@ public class CycleCovariate implements StandardCovariate {
// the current sequential model would consider the effects independently instead of jointly.
final boolean multiplyByNegative1 = read.getReadPairedFlag() && read.getSecondOfPairFlag();
int cycle = multiplyByNegative1 ? -1 : 1;
short cycle = multiplyByNegative1 ? (short) -1 : 1; // todo -- check if this is the right behavior for mate paired reads in flow cycle platforms.
// BUGBUG: Consider looking at degradation of base quality scores in homopolymer runs to detect when the cycle incremented even though the nucleotide didn't change
// For example, AAAAAAA was probably read in two flow cycles but here we count it as one
@ -127,19 +108,19 @@ public class CycleCovariate implements StandardCovariate {
int iii = 0;
while (iii < readLength) {
while (iii < readLength && bases[iii] == (byte) 'T') {
cycles[iii] = cycle;
cycles[iii] = BitSetUtils.bitSetFrom(cycle);
iii++;
}
while (iii < readLength && bases[iii] == (byte) 'A') {
cycles[iii] = cycle;
cycles[iii] = BitSetUtils.bitSetFrom(cycle);
iii++;
}
while (iii < readLength && bases[iii] == (byte) 'C') {
cycles[iii] = cycle;
cycles[iii] = BitSetUtils.bitSetFrom(cycle);
iii++;
}
while (iii < readLength && bases[iii] == (byte) 'G') {
cycles[iii] = cycle;
cycles[iii] = BitSetUtils.bitSetFrom(cycle);
iii++;
}
if (iii < readLength) {
@ -149,7 +130,7 @@ public class CycleCovariate implements StandardCovariate {
cycle++;
}
if (iii < readLength && !BaseUtils.isRegularBase(bases[iii])) {
cycles[iii] = cycle;
cycles[iii] = BitSetUtils.bitSetFrom(cycle);
iii++;
}
@ -159,19 +140,19 @@ public class CycleCovariate implements StandardCovariate {
int iii = readLength - 1;
while (iii >= 0) {
while (iii >= 0 && bases[iii] == (byte) 'T') {
cycles[iii] = cycle;
cycles[iii] = BitSetUtils.bitSetFrom(cycle);
iii--;
}
while (iii >= 0 && bases[iii] == (byte) 'A') {
cycles[iii] = cycle;
cycles[iii] = BitSetUtils.bitSetFrom(cycle);
iii--;
}
while (iii >= 0 && bases[iii] == (byte) 'C') {
cycles[iii] = cycle;
cycles[iii] = BitSetUtils.bitSetFrom(cycle);
iii--;
}
while (iii >= 0 && bases[iii] == (byte) 'G') {
cycles[iii] = cycle;
cycles[iii] = BitSetUtils.bitSetFrom(cycle);
iii--;
}
if (iii >= 0) {
@ -181,7 +162,7 @@ public class CycleCovariate implements StandardCovariate {
cycle++;
}
if (iii >= 0 && !BaseUtils.isRegularBase(bases[iii])) {
cycles[iii] = cycle;
cycles[iii] = BitSetUtils.bitSetFrom(cycle);
iii--;
}
}
@ -192,13 +173,28 @@ public class CycleCovariate implements StandardCovariate {
else {
throw new UserException("The platform (" + read.getReadGroup().getPlatform() + ") associated with read group " + read.getReadGroup() + " is not a recognized platform. Implemented options are e.g. illumina, 454, and solid");
}
return new CovariateValues(cycles, cycles, cycles);
}
// Used to get the covariate's value from input csv file during on-the-fly recalibration
@Override
public final Object getValue(final String str) {
return Integer.parseInt(str);
return Short.parseShort(str);
}
// Renders a cycle key as text: the bitset is decoded with BitSetUtils.shortFrom and formatted with %d.
@Override
public String keyFromBitSet(BitSet key) {
return String.format("%d", BitSetUtils.shortFrom(key));
}
/**
 * Converts a cycle key into its bitset form.
 *
 * @param key either a String parseable as a short, or a Short (any other type fails the cast)
 * @return the result of BitSetUtils.bitSetFrom applied to the cycle value
 */
@Override
public BitSet bitSetFromKey(Object key) {
    if (key instanceof String)
        return BitSetUtils.bitSetFrom(Short.parseShort((String) key));
    return BitSetUtils.bitSetFrom((Short) key);
}
/**
 * @return the value BitSetUtils.numberOfBitsToRepresent yields for 2 * Short.MAX_VALUE
 */
@Override
public int numberOfBits() {
    final int cycleRange = 2 * Short.MAX_VALUE;     // positive and negative
    return BitSetUtils.numberOfBitsToRepresent(cycleRange);
}
}

View File

@ -2,8 +2,6 @@ package org.broadinstitute.sting.gatk.walkers.bqsr;
import org.broadinstitute.sting.utils.QualityUtils;
import java.util.List;
/*
* Copyright (c) 2010 The Broad Institute
*
@ -38,10 +36,13 @@ import java.util.List;
* Each bin counts up the number of observations and the number of reference mismatches seen for that combination of covariates.
*/
public class RecalDatumOptimized {
public class Datum {
long numObservations; // number of bases seen in total
long numMismatches; // number of bases seen that didn't match the reference
private static final int SMOOTHING_CONSTANT = 1; // used when calculating empirical qualities to avoid division by zero
protected long numObservations; // number of bases seen in total
protected long numMismatches; // number of bases seen that didn't match the reference
//---------------------------------------------------------------------------------------------------------------
//
@ -49,19 +50,14 @@ public class RecalDatumOptimized {
//
//---------------------------------------------------------------------------------------------------------------
public RecalDatumOptimized() {
public Datum() {
numObservations = 0L;
numMismatches = 0L;
}
public RecalDatumOptimized(final long _numObservations, final long _numMismatches) {
numObservations = _numObservations;
numMismatches = _numMismatches;
}
public RecalDatumOptimized(final RecalDatumOptimized copy) {
this.numObservations = copy.numObservations;
this.numMismatches = copy.numMismatches;
public Datum(long numObservations, long numMismatches) {
this.numObservations = numObservations;
this.numMismatches = numMismatches;
}
//---------------------------------------------------------------------------------------------------------------
@ -70,46 +66,40 @@ public class RecalDatumOptimized {
//
//---------------------------------------------------------------------------------------------------------------
public synchronized final void increment(final long incObservations, final long incMismatches) {
synchronized void increment(final long incObservations, final long incMismatches) {
numObservations += incObservations;
numMismatches += incMismatches;
}
public synchronized final void increment(final RecalDatumOptimized other) {
increment(other.numObservations, other.numMismatches);
}
public synchronized final void increment(final List<RecalDatumOptimized> data) {
for (RecalDatumOptimized other : data) {
this.increment(other);
}
}
//---------------------------------------------------------------------------------------------------------------
//
// methods to derive empirical quality score
//
//---------------------------------------------------------------------------------------------------------------
public final double empiricalQualDouble(final int smoothing, final double maxQual) {
final double doubleMismatches = (double) (numMismatches + smoothing);
final double doubleObservations = (double) (numObservations + smoothing);
double empiricalQual = -10 * Math.log10(doubleMismatches / doubleObservations);
return Math.min(empiricalQual, maxQual);
double empiricalQualDouble() {
final double doubleMismatches = (double) (numMismatches + SMOOTHING_CONSTANT);
final double doubleObservations = (double) (numObservations + SMOOTHING_CONSTANT);
double empiricalQual = -10 * Math.log10(doubleMismatches / doubleObservations);
return Math.min(empiricalQual, (double) QualityUtils.MAX_RECALIBRATED_Q_SCORE);
}
public final byte empiricalQualByte(final int smoothing) {
final double doubleMismatches = (double) (numMismatches + smoothing);
final double doubleObservations = (double) (numObservations + smoothing);
return QualityUtils.probToQual(1.0 - doubleMismatches / doubleObservations); // This is capped at Q40
byte empiricalQualByte() {
final double doubleMismatches = (double) (numMismatches);
final double doubleObservations = (double) (numObservations);
return QualityUtils.probToQual(1.0 - doubleMismatches / doubleObservations); // This is capped at Q40
}
public final byte empiricalQualByte() {
return empiricalQualByte(0); // 'default' behavior is to use smoothing value of zero
}
public final String outputToCSV() {
@Override
public String toString() {
return String.format("%d,%d,%d", numObservations, numMismatches, (int) empiricalQualByte());
}
/**
 * Two datums are equal when both their observation and mismatch counts match.
 *
 * @param o the object to compare against
 * @return true if o is a Datum with the same numObservations and numMismatches
 */
@Override
public boolean equals(Object o) {
    if (o instanceof Datum) {
        final Datum that = (Datum) o;
        return numObservations == that.numObservations && numMismatches == that.numMismatches;
    }
    return false;
}
}

View File

@ -0,0 +1,43 @@
package org.broadinstitute.sting.gatk.walkers.bqsr;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
/**
 * The three event types, each carrying a numeric index and a one-letter representation.
 */
public enum EventType {
    BASE_SUBSTITUTION(0, "M"),
    BASE_INSERTION(1, "I"),
    BASE_DELETION(2, "D");

    /** numeric index of the event (0, 1 or 2) */
    public final int index;

    /** one-letter name returned by toString() */
    private final String representation;

    private EventType(int index, String representation) {
        this.index = index;
        this.representation = representation;
    }

    /**
     * Looks up an event type by its numeric index.
     *
     * @param index the index to look up
     * @return the event type whose index field equals the argument
     * @throws ReviewedStingException if no event type has that index
     */
    public static EventType eventFrom(int index) {
        for (final EventType candidate : values()) {
            if (candidate.index == index)
                return candidate;
        }
        throw new ReviewedStingException(String.format("Event %d does not exist.", index));
    }

    /**
     * Looks up an event type by its one-letter representation.
     *
     * @param event the representation to look up ("M", "I" or "D")
     * @return the event type whose representation equals the argument
     * @throws ReviewedStingException if no event type has that representation
     */
    public static EventType eventFrom(String event) {
        final EventType[] all = EventType.values();
        for (int i = 0; i < all.length; i++) {
            if (all[i].representation.equals(event))
                return all[i];
        }
        throw new ReviewedStingException(String.format("Event %s does not exist.", event));
    }

    @Override
    public String toString() {
        return representation;
    }
}

View File

@ -1,7 +1,11 @@
package org.broadinstitute.sting.gatk.walkers.bqsr;
import org.broadinstitute.sting.utils.BitSetUtils;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.util.BitSet;
/*
* Copyright (c) 2009 The Broad Institute
*
@ -46,18 +50,18 @@ public class QualityScoreCovariate implements RequiredCovariate {
public CovariateValues getValues(final GATKSAMRecord read) {
int readLength = read.getReadLength();
Integer [] mismatches = new Integer[readLength];
Integer [] insertions = new Integer[readLength];
Integer [] deletions = new Integer[readLength];
BitSet[] mismatches = new BitSet[readLength];
BitSet[] insertions = new BitSet[readLength];
BitSet[] deletions = new BitSet[readLength];
byte [] baseQualities = read.getBaseQualities();
byte [] baseInsertionQualities = read.getBaseInsertionQualities();
byte [] baseDeletionQualities = read.getBaseDeletionQualities();
byte[] baseQualities = read.getBaseQualities();
byte[] baseInsertionQualities = read.getBaseInsertionQualities();
byte[] baseDeletionQualities = read.getBaseDeletionQualities();
for (int i=0; i<baseQualities.length; i++) {
mismatches[i] = (int) baseQualities[i];
insertions[i] = (int) baseInsertionQualities[i];
deletions[i] = (int) baseDeletionQualities[i];
for (int i = 0; i < baseQualities.length; i++) {
mismatches[i] = BitSetUtils.bitSetFrom(baseQualities[i]);
insertions[i] = BitSetUtils.bitSetFrom(baseInsertionQualities[i]);
deletions[i] = BitSetUtils.bitSetFrom(baseDeletionQualities[i]);
}
return new CovariateValues(mismatches, insertions, deletions);
@ -66,6 +70,21 @@ public class QualityScoreCovariate implements RequiredCovariate {
// Used to get the covariate's value from input csv file during on-the-fly recalibration
@Override
public final Object getValue(final String str) {
return Integer.parseInt(str);
return Byte.parseByte(str);
}
// Renders a quality-score key as text: the bitset is decoded with BitSetUtils.longFrom and formatted with %d.
@Override
public String keyFromBitSet(BitSet key) {
return String.format("%d", BitSetUtils.longFrom(key));
}
/**
 * Converts a quality-score key into its bitset form.
 *
 * @param key either a String parseable as a byte, or a Byte (any other type fails the cast)
 * @return the result of BitSetUtils.bitSetFrom applied to the quality value
 */
@Override
public BitSet bitSetFromKey(Object key) {
    if (key instanceof String)
        return BitSetUtils.bitSetFrom(Byte.parseByte((String) key));
    return BitSetUtils.bitSetFrom((Byte) key);
}
// Returns what BitSetUtils.numberOfBitsToRepresent yields for QualityUtils.MAX_QUAL_SCORE.
@Override
public int numberOfBits() {
return BitSetUtils.numberOfBitsToRepresent(QualityUtils.MAX_QUAL_SCORE);
}
}

View File

@ -0,0 +1,104 @@
package org.broadinstitute.sting.gatk.walkers.bqsr;
import org.broadinstitute.sting.gatk.report.GATKReportTable;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.recalibration.QualQuantizer;
import java.util.Arrays;
import java.util.BitSet;
import java.util.List;
import java.util.Map;
/**
 * Class that encapsulates the information necessary for quality score quantization for BQSR
 *
 * It holds the original-to-quantized quality map, the histogram of empirical quality counts the map was
 * derived from, and the number of quantization levels.
 *
 * @author carneiro
 * @since 3/26/12
 */
public class QuantizationInfo {
    private List<Byte> quantizedQuals;          // quantizedQuals.get(q) is the quantized value of original quality q
    private List<Long> empiricalQualCounts;     // number of observations per empirical quality
    private int quantizationLevels;

    private QuantizationInfo(List<Byte> quantizedQuals, List<Long> empiricalQualCounts, int quantizationLevels) {
        this.quantizedQuals = quantizedQuals;
        this.empiricalQualCounts = empiricalQualCounts;
        this.quantizationLevels = quantizationLevels;
    }

    /**
     * Wraps an existing quantization map; the number of levels is derived from the map itself.
     *
     * @param quantizedQuals      the original-to-quantized quality map
     * @param empiricalQualCounts the histogram of empirical quality counts
     */
    public QuantizationInfo(List<Byte> quantizedQuals, List<Long> empiricalQualCounts) {
        this(quantizedQuals, empiricalQualCounts, calculateQuantizationLevels(quantizedQuals));
    }

    /**
     * Builds the empirical quality histogram from the recalibration tables and quantizes it.
     *
     * The table used is the one whose key manager has exactly two required covariates.
     *
     * @param keysAndTablesMap   the recalibration tables, one per key manager
     * @param quantizationLevels the number of levels to quantize to
     * @throws ReviewedStingException if no table with two required covariates is found
     */
    public QuantizationInfo(Map<BQSRKeyManager, Map<BitSet, RecalDatum>> keysAndTablesMap, int quantizationLevels) {
        final Long [] qualHistogram = new Long[QualityUtils.MAX_QUAL_SCORE+1];                                          // create a histogram with the empirical quality distribution
        for (int i = 0; i < qualHistogram.length; i++)
            qualHistogram[i] = 0L;

        Map<BitSet, RecalDatum> qualTable = null;                                                                       // look for the quality score table
        for (Map.Entry<BQSRKeyManager, Map<BitSet, RecalDatum>> entry : keysAndTablesMap.entrySet()) {
            BQSRKeyManager keyManager = entry.getKey();
            if (keyManager.getRequiredCovariates().size() == 2)                                                         // it should be the only one with 2 required covariates
                qualTable = entry.getValue();
        }

        if (qualTable == null)
            throw new ReviewedStingException("Could not find QualityScore table.");

        for (RecalDatum datum : qualTable.values()) {
            int empiricalQual = (int) Math.round(datum.getEmpiricalQuality());                                          // convert the empirical quality to an integer ( it is already capped by MAX_QUAL )
            long nObservations = datum.numObservations;
            qualHistogram[empiricalQual] += nObservations;                                                              // add the number of observations for every key
        }
        empiricalQualCounts = Arrays.asList(qualHistogram);                                                             // histogram with the number of observations of the empirical qualities
        quantizeQualityScores(quantizationLevels);

        this.quantizationLevels = quantizationLevels;
    }

    /**
     * Recomputes the original-to-quantized map from the empirical quality histogram.
     *
     * @param nLevels the number of levels to quantize to
     */
    public void quantizeQualityScores(int nLevels) {
        QualQuantizer quantizer = new QualQuantizer(empiricalQualCounts, nLevels, QualityUtils.MIN_USABLE_Q_SCORE);     // quantize the qualities to the desired number of levels
        quantizedQuals = quantizer.getOriginalToQuantizedMap();                                                         // map with the original to quantized qual map (using the standard number of levels in the RAC)
    }

    /**
     * Turns quantization off by making the map the identity: every quality maps to itself.
     */
    public void noQuantization() {
        this.quantizationLevels = QualityUtils.MAX_QUAL_SCORE;
        // Cover every entry of the map. It is indexed 0..MAX_QUAL_SCORE inclusive (see generateReportTable),
        // so stopping at quantizationLevels would leave the last entry with its quantized value.
        for (int i = 0; i < quantizedQuals.size(); i++)
            quantizedQuals.set(i, (byte) i);
    }

    public List<Byte> getQuantizedQuals() {
        return quantizedQuals;
    }

    public int getQuantizationLevels() {
        return quantizationLevels;
    }

    /**
     * Writes the histogram and the quantization map into a report table with one row per quality
     * from 0 to MAX_QUAL_SCORE inclusive.
     *
     * @return the populated report table
     */
    public GATKReportTable generateReportTable() {
        GATKReportTable quantizedTable = new GATKReportTable(RecalDataManager.QUANTIZED_REPORT_TABLE_TITLE, "Quality quantization map");
        quantizedTable.addPrimaryKey(RecalDataManager.QUALITY_SCORE_COLUMN_NAME);
        quantizedTable.addColumn(RecalDataManager.QUANTIZED_COUNT_COLUMN_NAME, 0L);
        quantizedTable.addColumn(RecalDataManager.QUANTIZED_VALUE_COLUMN_NAME, (byte) 0);

        for (int qual = 0; qual <= QualityUtils.MAX_QUAL_SCORE; qual++) {
            quantizedTable.set(qual, RecalDataManager.QUANTIZED_COUNT_COLUMN_NAME, empiricalQualCounts.get(qual));
            quantizedTable.set(qual, RecalDataManager.QUANTIZED_VALUE_COLUMN_NAME, quantizedQuals.get(qual));
        }
        return quantizedTable;
    }

    // Counts the runs of equal consecutive values in the map (each change of value starts a new level).
    private static int calculateQuantizationLevels(List<Byte> quantizedQuals) {
        byte lastByte = -1;
        int quantizationLevels = 0;
        for (byte q : quantizedQuals) {
            if (q != lastByte) {
                quantizationLevels++;
                lastByte = q;
            }
        }
        return quantizationLevels;
    }
}

View File

@ -0,0 +1,80 @@
package org.broadinstitute.sting.gatk.walkers.bqsr;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.BitSet;
/**
 * The object temporarily held by a read that describes all of its covariates.
 *
 * Holds one table of bitset keys per event type (mismatches, insertions, deletions), indexed as
 * [read position][covariate]. Covariates are added one at a time, each filling the next free column.
 *
 * @author Mauricio Carneiro
 * @since 2/8/12
 */
public class ReadCovariates {
    private final BitSet[][] mismatchesKeySet;
    private final BitSet[][] insertionsKeySet;
    private final BitSet[][] deletionsKeySet;

    private int nextCovariateIndex;         // column filled by the next call to addCovariate

    /**
     * @param readLength         number of rows (read positions) in each table
     * @param numberOfCovariates number of columns in each table
     */
    public ReadCovariates(int readLength, int numberOfCovariates) {
        this.nextCovariateIndex = 0;
        this.mismatchesKeySet = new BitSet[readLength][numberOfCovariates];
        this.insertionsKeySet = new BitSet[readLength][numberOfCovariates];
        this.deletionsKeySet = new BitSet[readLength][numberOfCovariates];
    }

    /**
     * Stores one covariate's per-position values in the next free column of each table.
     *
     * @param covariate the mismatch, insertion and deletion values of a single covariate
     */
    public void addCovariate(CovariateValues covariate) {
        transposeCovariateValues(mismatchesKeySet, covariate.getMismatches());
        transposeCovariateValues(insertionsKeySet, covariate.getInsertions());
        transposeCovariateValues(deletionsKeySet, covariate.getDeletions());
        nextCovariateIndex++;
    }

    /**
     * Returns the row of keys for one read position under the given event type.
     *
     * @param readPosition the position in the read
     * @param errorModel   which of the three tables to read from
     * @return the key row (the backing array, not a copy)
     * @throws ReviewedStingException if the event type is not one of the three handled values
     */
    public BitSet[] getKeySet(final int readPosition, final EventType errorModel) {
        final BitSet[] keys;
        switch (errorModel) {
            case BASE_SUBSTITUTION:
                keys = getMismatchesKeySet(readPosition);
                break;
            case BASE_INSERTION:
                keys = getInsertionsKeySet(readPosition);
                break;
            case BASE_DELETION:
                keys = getDeletionsKeySet(readPosition);
                break;
            default:
                throw new ReviewedStingException("Unrecognized Base Recalibration type: " + errorModel);
        }
        return keys;
    }

    public BitSet[] getMismatchesKeySet(int readPosition) {
        return mismatchesKeySet[readPosition];
    }

    public BitSet[] getInsertionsKeySet(int readPosition) {
        return insertionsKeySet[readPosition];
    }

    public BitSet[] getDeletionsKeySet(int readPosition) {
        return deletionsKeySet[readPosition];
    }

    // Writes covariateValues[position] into column nextCovariateIndex of row 'position'.
    private void transposeCovariateValues(BitSet[][] keySet, BitSet[] covariateValues) {
        int position = 0;
        while (position < covariateValues.length) {
            keySet[position][nextCovariateIndex] = covariateValues[position];
            position++;
        }
    }

    /**
     * Testing routines
     */
    protected BitSet[][] getMismatchesKeySet() {
        return mismatchesKeySet;
    }

    protected BitSet[][] getInsertionsKeySet() {
        return insertionsKeySet;
    }

    protected BitSet[][] getDeletionsKeySet() {
        return deletionsKeySet;
    }
}

View File

@ -1,8 +1,11 @@
package org.broadinstitute.sting.gatk.walkers.bqsr;
import org.broadinstitute.sting.utils.BitSetUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.util.Arrays;
import java.util.BitSet;
import java.util.HashMap;
/*
@ -39,7 +42,7 @@ import java.util.HashMap;
*/
public class ReadGroupCovariate implements RequiredCovariate {
private final HashMap<String, Short> readGroupLookupTable = new HashMap<String, Short>();
private final HashMap<Short, String> readGroupReverseLookupTable = new HashMap<Short, String>();
private short nextId = 0;
@ -52,30 +55,61 @@ public class ReadGroupCovariate implements RequiredCovariate {
/**
 * Computes the read group covariate for every base of the read.
 *
 * The read group value is encoded once as a BitSet and that same BitSet instance fills an array
 * with one slot per base; the same array is then passed for all three CovariateValues arguments.
 *
 * NOTE(review): readGroupId is declared on two consecutive lines below. This looks like the removed
 * and added sides of a diff shown together; presumably only the second declaration is current - confirm.
 */
@Override
public CovariateValues getValues(final GATKSAMRecord read) {
final int l = read.getReadLength();
final String readGroupId = read.getReadGroup().getReadGroupId();
final String readGroupId = readGroupValueFromRG(read.getReadGroup());
BitSet rg = bitSetForReadGroup(readGroupId); // All objects must output a BitSet, so we convert the "compressed" representation of the Read Group into a bitset
BitSet[] readGroups = new BitSet[l];
Arrays.fill(readGroups, rg);
return new CovariateValues(readGroups, readGroups, readGroups);
}
/**
 * Returns the given string unchanged; for this covariate the textual form is the value itself.
 *
 * @param str the textual covariate value
 * @return the same string
 */
@Override
public final Object getValue(final String str) {
return str;
}
/**
 * Decodes a BitSet key back into the read group string it was created from.
 *
 * @param key the BitSet holding the short id of a read group
 * @return the read group registered under that id (null if the id is unknown to the reverse lookup table)
 */
@Override
public String keyFromBitSet(BitSet key) {
    final short id = (short) BitSetUtils.longFrom(key);
    return decodeReadGroup(id);
}
/**
 * Encodes a read group string as a BitSet key.
 *
 * @param key the read group value; must be a String
 * @return the BitSet produced by bitSetForReadGroup for that read group
 */
@Override
public BitSet bitSetFromKey(Object key) {
    final String readGroupId = (String) key;
    return bitSetForReadGroup(readGroupId);
}
/**
 * @return the value BitSetUtils.numberOfBitsToRepresent gives for Short.MAX_VALUE
 *         (read group ids are stored as shorts in this class)
 */
@Override
public int numberOfBits() {
return BitSetUtils.numberOfBitsToRepresent(Short.MAX_VALUE);
}
/**
 * Looks up the read group string registered under a short id.
 *
 * @param id the short id of the read group
 * @return the entry of readGroupReverseLookupTable for that id, or null if there is none
 */
private String decodeReadGroup(final short id) {
return readGroupReverseLookupTable.get(id);
}
/**
 * Returns the BitSet encoding of a read group's short id.
 *
 * A read group already present in readGroupLookupTable reuses its id. Otherwise the current
 * nextId is assigned, recorded in both the forward and the reverse lookup table, and nextId is
 * incremented.
 *
 * NOTE(review): this span appears to interleave removed and added diff lines: the containsKey
 * test is repeated, and the three statements building a Short[] plus the brace that follows them
 * look like the tail of the previous getValues implementation. Confirm against the real file.
 *
 * @param readGroupId the read group value to encode
 * @return the BitSet built from the read group's short id
 */
private BitSet bitSetForReadGroup(String readGroupId) {
short shortId;
if (readGroupLookupTable.containsKey(readGroupId))
if (readGroupLookupTable.containsKey(readGroupId))
shortId = readGroupLookupTable.get(readGroupId);
else {
// first time this read group is seen: hand out the next id and register it in both directions
shortId = nextId;
readGroupLookupTable.put(readGroupId, nextId);
readGroupReverseLookupTable.put(nextId, readGroupId);
nextId++;
}
Short [] readGroups = new Short[l];
Arrays.fill(readGroups, shortId);
return new CovariateValues(readGroups, readGroups, readGroups);
}
return BitSetUtils.bitSetFrom(shortId);
}
// Used to get the covariate's value from input csv file during on-the-fly recalibration
@Override
public final Object getValue(final String str) {
return str;
/**
 * Picks the value that identifies a read group: its platform unit (PU) when one is set,
 * otherwise the read group id.
 *
 * @param rg the read group record
 * @return the platform unit if it is non-null, else the read group id
 */
private String readGroupValueFromRG(GATKSAMReadGroupRecord rg) {
    final String platformUnit = rg.getPlatformUnit();
    if (platformUnit != null)
        return platformUnit;
    return rg.getId();
}
public final String decodeReadGroup(final short id) {
return readGroupReverseLookupTable.get(id);
}
}

View File

@ -25,22 +25,26 @@
package org.broadinstitute.sting.gatk.walkers.bqsr;
import net.sf.samtools.SAMUtils;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.report.GATKReport;
import org.broadinstitute.sting.gatk.report.GATKReportTable;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.R.RScriptExecutor;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.collections.NestedHashMap;
import org.broadinstitute.sting.utils.classloader.PluginManager;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.DynamicClassResolutionException;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
import org.broadinstitute.sting.utils.io.Resource;
import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.PrintStream;
import java.util.*;
/**
* Created by IntelliJ IDEA.
@ -53,24 +57,31 @@ import java.util.Map;
*/
public class RecalDataManager {
public final NestedHashMap nestedHashMap; // The full dataset
private final HashMap<BaseRecalibrationType, NestedHashMap> dataCollapsedReadGroup; // Table where everything except read group has been collapsed
private final HashMap<BaseRecalibrationType, NestedHashMap> dataCollapsedQualityScore; // Table where everything except read group and quality score has been collapsed
private final HashMap<BaseRecalibrationType, ArrayList<NestedHashMap>> dataCollapsedByCovariate; // Tables where everything except read group, quality score, and given covariate has been collapsed
public final static String ARGUMENT_REPORT_TABLE_TITLE = "Arguments";
public final static String QUANTIZED_REPORT_TABLE_TITLE = "Quantized";
public final static String READGROUP_REPORT_TABLE_TITLE = "RecalTable0";
public final static String QUALITY_SCORE_REPORT_TABLE_TITLE = "RecalTable1";
public final static String ALL_COVARIATES_REPORT_TABLE_TITLE = "RecalTable2";
public final static String ORIGINAL_QUAL_ATTRIBUTE_TAG = "OQ"; // The tag that holds the original quality scores
public final static String COLOR_SPACE_QUAL_ATTRIBUTE_TAG = "CQ"; // The tag that holds the color space quality scores for SOLID bams
public final static String COLOR_SPACE_ATTRIBUTE_TAG = "CS"; // The tag that holds the color space for SOLID bams
public final static String COLOR_SPACE_INCONSISTENCY_TAG = "ZC"; // A new tag made up for the recalibrator which will hold an array of ints which say if this base is inconsistent with its color
public final static String ARGUMENT_VALUE_COLUMN_NAME = "Value";
public final static String QUANTIZED_VALUE_COLUMN_NAME = "QuantizedScore";
public static final String QUANTIZED_COUNT_COLUMN_NAME = "Count";
public final static String READGROUP_COLUMN_NAME = "ReadGroup";
public final static String EVENT_TYPE_COLUMN_NAME = "EventType";
public final static String EMPIRICAL_QUALITY_COLUMN_NAME = "EmpiricalQuality";
public final static String ESTIMATED_Q_REPORTED_COLUMN_NAME = "EstimatedQReported";
public final static String QUALITY_SCORE_COLUMN_NAME = "QualityScore";
public final static String COVARIATE_VALUE_COLUMN_NAME = "CovariateValue";
public final static String COVARIATE_NAME_COLUMN_NAME = "CovariateName";
public final static String NUMBER_OBSERVATIONS_COLUMN_NAME = "Observations";
public final static String NUMBER_ERRORS_COLUMN_NAME = "Errors";
private final static String COLOR_SPACE_ATTRIBUTE_TAG = "CS"; // The tag that holds the color space for SOLID bams
private final static String COLOR_SPACE_INCONSISTENCY_TAG = "ZC"; // A new tag made up for the recalibrator which will hold an array of ints which say if this base is inconsistent with its color
private static boolean warnUserNullPlatform = false;
private static final String COVARS_ATTRIBUTE = "COVARS"; // used to store covariates array as a temporary attribute inside GATKSAMRecord.\
private static final String SCRIPT_FILE = "BQSR.R";
/**
 * The kinds of base-level event that recalibration data is kept for; used as the key of the
 * per-event-type collapsed tables in this class.
 */
public enum BaseRecalibrationType {
BASE_SUBSTITUTION,
BASE_INSERTION,
BASE_DELETION
}
public enum SOLID_RECAL_MODE {
/**
@ -88,7 +99,20 @@ public class RecalDataManager {
/**
* Look at the color quality scores and probabilistically decide to change the reference inserted base to be the base which is implied by the original color space instead of the reference.
*/
REMOVE_REF_BIAS
REMOVE_REF_BIAS;
/**
 * Parses the name of a SOLID_RECAL_MODE constant.
 *
 * The lookup goes through the enum's own constants instead of a hand-maintained list of string
 * comparisons, so it cannot drift from the declared constants. A null argument is rejected with
 * the same user-facing exception as any other invalid value instead of a NullPointerException.
 *
 * @param recalMode the exact (case-sensitive) name of the constant
 * @return the constant with that name
 * @throws UserException.BadArgumentValue if no constant has that name (including null input)
 */
public static SOLID_RECAL_MODE recalModeFromString(String recalMode) {
    for (final SOLID_RECAL_MODE mode : values()) {
        if (mode.name().equals(recalMode))      // equals(null) is simply false
            return mode;
    }
    throw new UserException.BadArgumentValue(recalMode, "is not a valid SOLID_RECAL_MODE value");
}
}
public enum SOLID_NOCALL_STRATEGY {
@ -103,175 +127,348 @@ public class RecalDataManager {
/**
* Mark these reads as failing vendor quality checks so they can be filtered out by downstream analyses.
*/
PURGE_READ
}
PURGE_READ;
/**
 * Creates a manager holding only an empty full-data NestedHashMap; the three collapsed-table
 * fields are set to null.
 */
public RecalDataManager() {
nestedHashMap = new NestedHashMap();
dataCollapsedReadGroup = null;
dataCollapsedQualityScore = null;
dataCollapsedByCovariate = null;
}
public static SOLID_NOCALL_STRATEGY nocallStrategyFromString(String nocallStrategy) {
if (nocallStrategy.equals("THROW_EXCEPTION"))
return SOLID_NOCALL_STRATEGY.THROW_EXCEPTION;
if (nocallStrategy.equals("LEAVE_READ_UNRECALIBRATED"))
return SOLID_NOCALL_STRATEGY.LEAVE_READ_UNRECALIBRATED;
if (nocallStrategy.equals("PURGE_READ"))
return SOLID_NOCALL_STRATEGY.PURGE_READ;
public RecalDataManager(final boolean createCollapsedTables, final int numCovariates) {
if (createCollapsedTables) { // Initialize all the collapsed tables, only used by on-the-fly recalibration
nestedHashMap = null;
dataCollapsedReadGroup = new HashMap<BaseRecalibrationType, NestedHashMap>();
dataCollapsedQualityScore = new HashMap<BaseRecalibrationType, NestedHashMap>();
dataCollapsedByCovariate = new HashMap<BaseRecalibrationType, ArrayList<NestedHashMap>>();
for ( final BaseRecalibrationType errorModel : BaseRecalibrationType.values() ) {
dataCollapsedReadGroup.put(errorModel, new NestedHashMap());
dataCollapsedQualityScore.put(errorModel, new NestedHashMap());
dataCollapsedByCovariate.put(errorModel, new ArrayList<NestedHashMap>());
for (int iii = 0; iii < numCovariates - 2; iii++) { // readGroup and QualityScore aren't counted here, their tables are separate
dataCollapsedByCovariate.get(errorModel).add(new NestedHashMap());
}
}
}
else {
nestedHashMap = new NestedHashMap();
dataCollapsedReadGroup = null;
dataCollapsedQualityScore = null;
dataCollapsedByCovariate = null;
throw new UserException.BadArgumentValue(nocallStrategy, "is not a valid SOLID_NOCALL_STRATEGY value");
}
}
/**
 * Fetches the covariate key set stored on the read as the temporary attribute COVARS_ATTRIBUTE.
 *
 * @param read the read to query
 * @return the attribute value cast to CovariateKeySet (null if the attribute is absent)
 */
public static CovariateKeySet getAllCovariateValuesFor(GATKSAMRecord read) {
    final Object storedCovariates = read.getTemporaryAttribute(COVARS_ATTRIBUTE);
    return (CovariateKeySet) storedCovariates;
}
/**
* Add the given mapping to all of the collapsed hash tables
* Initializes the recalibration table -> key manager map
*
* @param key The list of comparables that is the key for this mapping
* @param fullDatum The RecalDatum which is the data for this mapping
* @param PRESERVE_QSCORES_LESS_THAN The threshold in report quality for adding to the aggregate collapsed table
* @param requiredCovariates list of required covariates (in order)
* @param optionalCovariates list of optional covariates (in order)
* @return a map with each key manager and its corresponding recalibration table properly initialized
*/
public final void addToAllTables(final Object[] key, final RecalDatum fullDatum, final int PRESERVE_QSCORES_LESS_THAN, final BaseRecalibrationType errorModel ) {
// The full dataset isn't actually ever used for anything because of the sequential calculation so no need to keep the full data HashMap around
//data.put(key, thisDatum); // add the mapping to the main table
final int qualityScore = Integer.parseInt(key[1].toString());
final Object[] readGroupCollapsedKey = new Object[1];
final Object[] qualityScoreCollapsedKey = new Object[2];
final Object[] covariateCollapsedKey = new Object[3];
RecalDatum collapsedDatum;
// Create dataCollapsedReadGroup, the table where everything except read group has been collapsed
if (qualityScore >= PRESERVE_QSCORES_LESS_THAN) {
readGroupCollapsedKey[0] = key[0]; // Make a new key with just the read group
collapsedDatum = (RecalDatum) dataCollapsedReadGroup.get(errorModel).get(readGroupCollapsedKey);
if (collapsedDatum == null) {
dataCollapsedReadGroup.get(errorModel).put(new RecalDatum(fullDatum), readGroupCollapsedKey);
}
else {
collapsedDatum.combine(fullDatum); // using combine instead of increment in order to calculate overall aggregateQReported
}
/**
 * Creates the empty recalibration tables together with the key manager of each one.
 *
 * One table is created per required covariate, cumulatively: the first uses the first required
 * covariate, the second the first two, and so on, all without optional covariates. A final table
 * uses every required and every optional covariate. Insertion order is preserved by the returned
 * LinkedHashMap, so the tables come out in exactly that order.
 *
 * NOTE(review): the same growing list instance is handed to every cumulative key manager; this
 * presumably relies on BQSRKeyManager not keeping that list by reference - confirm.
 *
 * @param requiredCovariates list of required covariates (in order)
 * @param optionalCovariates list of optional covariates (in order)
 * @return the ordered map from key manager to its (empty) recalibration table
 */
public static LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>> initializeTables(ArrayList<Covariate> requiredCovariates, ArrayList<Covariate> optionalCovariates) {
    final LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>> tables = new LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>>();

    final ArrayList<Covariate> requiredSoFar = new ArrayList<Covariate>(requiredCovariates.size() + 1);
    final ArrayList<Covariate> noOptionalCovariates = new ArrayList<Covariate>();

    // tables with 1, 2, ... required covariates and no optional ones
    for (final Covariate required : requiredCovariates) {
        requiredSoFar.add(required);
        tables.put(new BQSRKeyManager(requiredSoFar, noOptionalCovariates), new HashMap<BitSet, RecalDatum>());
    }

    // the table holding all covariates; pre-sized because it is expected to be the largest
    tables.put(new BQSRKeyManager(requiredCovariates, optionalCovariates), new HashMap<BitSet, RecalDatum>(Short.MAX_VALUE));
    return tables;
}
// Create dataCollapsedQuality, the table where everything except read group and quality score has been collapsed
qualityScoreCollapsedKey[0] = key[0]; // Make a new key with the read group ...
qualityScoreCollapsedKey[1] = key[1]; // and quality score
collapsedDatum = (RecalDatum) dataCollapsedQualityScore.get(errorModel).get(qualityScoreCollapsedKey);
if (collapsedDatum == null) {
dataCollapsedQualityScore.get(errorModel).put(new RecalDatum(fullDatum), qualityScoreCollapsedKey);
}
else {
collapsedDatum.increment(fullDatum);
}
/**
* Generates two lists : required covariates and optional covariates based on the user's requests.
*
* Performs the following tasks in order:
* 1. Adds all required covariates in order
* 2. Check if the user asked to use the standard covariates and adds them all if that's the case
* 3. Adds all covariates requested by the user that were not already added by the two previous steps
*
* @param argumentCollection the argument collection object for the recalibration walker
* @return a pair of ordered lists : required covariates (first) and optional covariates (second)
*/
public static Pair<ArrayList<Covariate>, ArrayList<Covariate>> initializeCovariates(RecalibrationArgumentCollection argumentCollection) {
final List<Class<? extends Covariate>> covariateClasses = new PluginManager<Covariate>(Covariate.class).getPlugins();
final List<Class<? extends RequiredCovariate>> requiredClasses = new PluginManager<RequiredCovariate>(RequiredCovariate.class).getPlugins();
final List<Class<? extends StandardCovariate>> standardClasses = new PluginManager<StandardCovariate>(StandardCovariate.class).getPlugins();
// Create dataCollapsedByCovariate's, the tables where everything except read group, quality score, and given covariate has been collapsed
for (int iii = 0; iii < dataCollapsedByCovariate.get(errorModel).size(); iii++) {
covariateCollapsedKey[0] = key[0]; // Make a new key with the read group ...
covariateCollapsedKey[1] = key[1]; // and quality score ...
final Object theCovariateElement = key[iii + 2]; // and the given covariate
if (theCovariateElement != null) {
covariateCollapsedKey[2] = theCovariateElement;
collapsedDatum = (RecalDatum) dataCollapsedByCovariate.get(errorModel).get(iii).get(covariateCollapsedKey);
if (collapsedDatum == null) {
dataCollapsedByCovariate.get(errorModel).get(iii).put(new RecalDatum(fullDatum), covariateCollapsedKey);
ArrayList<Covariate> requiredCovariates = addRequiredCovariatesToList(requiredClasses); // add the required covariates
ArrayList<Covariate> optionalCovariates = new ArrayList<Covariate>();
if (argumentCollection.USE_STANDARD_COVARIATES)
optionalCovariates = addStandardCovariatesToList(standardClasses); // add the standard covariates if -standard was specified by the user
if (argumentCollection.COVARIATES != null) { // parse the -cov arguments that were provided, skipping over the ones already specified
for (String requestedCovariateString : argumentCollection.COVARIATES) {
boolean foundClass = false;
for (Class<? extends Covariate> covClass : covariateClasses) {
if (requestedCovariateString.equalsIgnoreCase(covClass.getSimpleName())) { // -cov argument matches the class name for an implementing class
foundClass = true;
if (!requiredClasses.contains(covClass) &&
(!argumentCollection.USE_STANDARD_COVARIATES || !standardClasses.contains(covClass))) {
try {
final Covariate covariate = covClass.newInstance(); // now that we've found a matching class, try to instantiate it
optionalCovariates.add(covariate);
} catch (Exception e) {
throw new DynamicClassResolutionException(covClass, e);
}
}
}
}
else {
collapsedDatum.increment(fullDatum);
if (!foundClass) {
throw new UserException.CommandLineException("The requested covariate type (" + requestedCovariateString + ") isn't a valid covariate option. Use --list to see possible covariates.");
}
}
}
return new Pair<ArrayList<Covariate>, ArrayList<Covariate>>(requiredCovariates, optionalCovariates);
}
/**
 * Logs, at info level, the simple class name of every Covariate implementation that the plugin
 * manager finds, preceded by a heading line and followed by an empty line.
 *
 * @param logger the logger to write to
 */
public static void listAvailableCovariates(Logger logger) {
    // discover the implementations before anything is logged
    final List<Class<? extends Covariate>> available = new PluginManager<Covariate>(Covariate.class).getPlugins();

    logger.info("Available covariates:");
    for (final Class<? extends Covariate> covariateClass : available) {
        logger.info(covariateClass.getSimpleName());
    }
    logger.info("");
}
/**
 * Converts every (key manager, recalibration table) pair into a GATKReportTable.
 *
 * Tables are named "RecalTable" followed by their position in the map's iteration order. The
 * columns of a table are, in this order: one column per required covariate (the covariate's
 * simple class name up to "Covariate"), CovariateValue and CovariateName when the key manager
 * has optional covariates, EventType, EmpiricalQuality, EstimatedQReported (first table only),
 * Observations and Errors. Every entry of a recalibration table becomes one row whose primary
 * key is a running counter; all cell values are stored through their toString() form.
 *
 * @param keysAndTablesMap the recalibration tables with their key managers, in output order
 * @return one report table per map entry, in the same order
 */
private static List<GATKReportTable> generateReportTables(Map<BQSRKeyManager, Map<BitSet, RecalDatum>> keysAndTablesMap) {
List<GATKReportTable> result = new LinkedList<GATKReportTable>();
int tableIndex = 0;
// (column name, format string) pairs shared by all tables
final Pair<String, String> covariateValue = new Pair<String, String>(RecalDataManager.COVARIATE_VALUE_COLUMN_NAME, "%s");
final Pair<String, String> covariateName = new Pair<String, String>(RecalDataManager.COVARIATE_NAME_COLUMN_NAME, "%s");
final Pair<String, String> eventType = new Pair<String, String>(RecalDataManager.EVENT_TYPE_COLUMN_NAME, "%s");
final Pair<String, String> empiricalQuality = new Pair<String, String>(RecalDataManager.EMPIRICAL_QUALITY_COLUMN_NAME, "%.4f");
final Pair<String, String> estimatedQReported = new Pair<String, String>(RecalDataManager.ESTIMATED_Q_REPORTED_COLUMN_NAME, "%.4f");
final Pair<String, String> nObservations = new Pair<String, String>(RecalDataManager.NUMBER_OBSERVATIONS_COLUMN_NAME, "%d");
final Pair<String, String> nErrors = new Pair<String, String>(RecalDataManager.NUMBER_ERRORS_COLUMN_NAME, "%d");
for (Map.Entry<BQSRKeyManager, Map<BitSet, RecalDatum>> entry : keysAndTablesMap.entrySet()) {
BQSRKeyManager keyManager = entry.getKey();
Map<BitSet, RecalDatum> recalTable = entry.getValue();
boolean isReadGroupTable = tableIndex == 0; // special case for the read group table so we can print the extra column it needs.
GATKReportTable reportTable = new GATKReportTable("RecalTable" + tableIndex++, "");
List<Covariate> requiredList = keyManager.getRequiredCovariates(); // ask the key manager what required covariates were used in this recal table
List<Covariate> optionalList = keyManager.getOptionalCovariates(); // ask the key manager what optional covariates were used in this recal table
ArrayList<Pair<String, String>> columnNames = new ArrayList<Pair<String, String>>(); // initialize the array to hold the column names
for (Covariate covariate : requiredList) {
String name = covariate.getClass().getSimpleName().split("Covariate")[0]; // get the covariate names and put them in order
columnNames.add(new Pair<String,String>(name, "%s")); // save the required covariate name so we can reference it in the future
}
if (optionalList.size() > 0) {
columnNames.add(covariateValue);
columnNames.add(covariateName);
}
columnNames.add(eventType); // the order of these column names is important here
columnNames.add(empiricalQuality);
if (isReadGroupTable)
columnNames.add(estimatedQReported); // only the read group table needs the estimated Q reported
columnNames.add(nObservations);
columnNames.add(nErrors);
reportTable.addPrimaryKey("PrimaryKey", false); // every table must have a primary key (hidden)
for (Pair<String, String> columnName : columnNames)
reportTable.addColumn(columnName.getFirst(), true, columnName.getSecond()); // register each column with its format string
long primaryKey = 0L;
for (Map.Entry<BitSet, RecalDatum> recalTableEntry : recalTable.entrySet()) { // create a map with column name => key value for all covariate keys
BitSet bitSetKey = recalTableEntry.getKey();
Map<String, Object> columnData = new HashMap<String, Object>(columnNames.size());
// The iterator walks columnNames in lock-step with the values stored below, so the order of
// the put() calls must match the column order built above.
Iterator<Pair<String, String>> iterator = columnNames.iterator();
// presumably keySetFrom returns the covariate values followed by the event type, matching the leading columns - confirm
for (Object key : keyManager.keySetFrom(bitSetKey)) {
String columnName = iterator.next().getFirst();
columnData.put(columnName, key);
}
RecalDatum datum = recalTableEntry.getValue();
columnData.put(iterator.next().getFirst(), datum.getEmpiricalQuality());
if (isReadGroupTable)
columnData.put(iterator.next().getFirst(), datum.getEstimatedQReported()); // we only add the estimated Q reported in the RG table
columnData.put(iterator.next().getFirst(), datum.numObservations);
columnData.put(iterator.next().getFirst(), datum.numMismatches);
// write the collected row into the report table, one cell per column
for (Map.Entry<String, Object> dataEntry : columnData.entrySet()) {
String columnName = dataEntry.getKey();
Object value = dataEntry.getValue();
reportTable.set(primaryKey, columnName, value.toString());
}
primaryKey++;
}
result.add(reportTable);
}
return result;
}
/**
 * Prints a recalibration report whose argument table is generated from the argument collection.
 *
 * @param RAC              the recalibration arguments; supplies the argument table
 * @param quantizationInfo supplies the quantization table
 * @param keysAndTablesMap the recalibration tables with their key managers
 * @param outputFile       the stream the report is printed to
 */
public static void outputRecalibrationReport(RecalibrationArgumentCollection RAC, QuantizationInfo quantizationInfo, Map<BQSRKeyManager, Map<BitSet, RecalDatum>> keysAndTablesMap, PrintStream outputFile) {
    final GATKReportTable argumentTable = RAC.generateReportTable();
    final GATKReportTable quantizationTable = quantizationInfo.generateReportTable();
    final List<GATKReportTable> recalTables = generateReportTables(keysAndTablesMap);
    outputRecalibrationReport(argumentTable, quantizationTable, recalTables, outputFile);
}

/**
 * Prints a recalibration report using an already built argument table.
 *
 * @param argumentTable    the argument table to print first
 * @param quantizationInfo supplies the quantization table
 * @param keysAndTablesMap the recalibration tables with their key managers
 * @param outputFile       the stream the report is printed to
 */
public static void outputRecalibrationReport(GATKReportTable argumentTable, QuantizationInfo quantizationInfo, LinkedHashMap<BQSRKeyManager,Map<BitSet, RecalDatum>> keysAndTablesMap, PrintStream outputFile) {
    final GATKReportTable quantizationTable = quantizationInfo.generateReportTable();
    final List<GATKReportTable> recalTables = generateReportTables(keysAndTablesMap);
    outputRecalibrationReport(argumentTable, quantizationTable, recalTables, outputFile);
}

/**
 * Assembles the report (argument table, then quantization table, then the recalibration tables)
 * and prints it to the given stream.
 */
private static void outputRecalibrationReport(GATKReportTable argumentTable, GATKReportTable quantizationTable, List<GATKReportTable> recalTables, PrintStream outputFile) {
    final GATKReport recalibrationReport = new GATKReport();
    recalibrationReport.addTable(argumentTable);
    recalibrationReport.addTable(quantizationTable);
    recalibrationReport.addTables(recalTables);
    recalibrationReport.print(outputFile);
}
/**
 * Opens the intermediate CSV file used for plotting; its path is the given file name with
 * ".csv" appended.
 *
 * @param filename base name of the plot output
 * @return the print stream writing to the CSV file (first) and the CSV file itself (second)
 * @throws UserException.CouldNotCreateOutputFile if the CSV file cannot be opened for writing
 */
private static Pair<PrintStream, File> initializeRecalibrationPlot(File filename) {
    final File csvFile = new File(filename + ".csv");
    try {
        return new Pair<PrintStream, File>(new PrintStream(csvFile), csvFile);
    } catch (FileNotFoundException e) {
        throw new UserException.CouldNotCreateOutputFile(csvFile, "File " + csvFile + " could not be created");
    }
}
/**
 * Closes the CSV stream, hands the CSV file and the target PDF path (CSV path plus ".pdf") to the
 * BQSR R script through an RScriptExecutor and, unless intermediates are kept, deletes the CSV file.
 *
 * @param files             the CSV print stream (first) and the CSV file it writes to (second)
 * @param keepIntermediates if false, the CSV file is deleted once the executor has returned
 * @throws ReviewedStingException if the CSV file has to be deleted but the deletion fails
 */
private static void outputRecalibrationPlot(Pair<PrintStream, File> files, boolean keepIntermediates) {
    final File csvFileName = files.getSecond();
    final File plotFileName = new File(csvFileName + ".pdf");
    files.getFirst().close();       // the CSV must be complete on disk before the script is given its path

    final RScriptExecutor executor = new RScriptExecutor();
    executor.addScript(new Resource(SCRIPT_FILE, RecalDataManager.class));
    executor.addArgs(csvFileName.getAbsolutePath());
    executor.addArgs(plotFileName.getAbsolutePath());
    executor.exec();

    // File.delete() returns false for any failed deletion, so report it as such rather than
    // as a missing file.
    if (!keepIntermediates && !csvFileName.delete())
        throw new ReviewedStingException("Could not delete intermediate file " + csvFileName.getAbsolutePath());
}
/**
 * Generates the recalibration plot from a single set of tables, labelled "ORIGINAL".
 *
 * @param filename          base name of the output; ".csv" and ".csv.pdf" are appended to it
 * @param original          the recalibration tables to plot
 * @param keepIntermediates whether to keep the intermediate CSV file
 */
public static void generateRecalibrationPlot(File filename, LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>> original, boolean keepIntermediates) {
    final Pair<PrintStream, File> csvOutput = initializeRecalibrationPlot(filename);
    final PrintStream csvStream = csvOutput.getFirst();
    writeCSV(csvStream, original, "ORIGINAL", true);
    outputRecalibrationPlot(csvOutput, keepIntermediates);
}

/**
 * Generates the recalibration plot comparing two sets of tables. The "RECALIBRATED" rows are
 * written first, together with the CSV header, followed by the "ORIGINAL" rows.
 *
 * @param filename          base name of the output; ".csv" and ".csv.pdf" are appended to it
 * @param original          the tables computed before recalibration
 * @param recalibrated      the tables computed after recalibration
 * @param keepIntermediates whether to keep the intermediate CSV file
 */
public static void generateRecalibrationPlot(File filename, LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>> original, LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>> recalibrated, boolean keepIntermediates) {
    final Pair<PrintStream, File> csvOutput = initializeRecalibrationPlot(filename);
    final PrintStream csvStream = csvOutput.getFirst();
    writeCSV(csvStream, recalibrated, "RECALIBRATED", true);
    writeCSV(csvStream, original, "ORIGINAL", false);
    outputRecalibrationPlot(csvOutput, keepIntermediates);
}
/**
 * Writes the "delta table" of one set of recalibration tables as CSV rows.
 *
 * The delta table is keyed by (read group, covariate value, covariate name, event type). It is
 * filled from two of the input tables:
 *   - the quality score table (exactly two required covariates, no optional ones), whose quality
 *     score is stored as the value of a covariate named "QualityScore";
 *   - the table that has optional covariates, with the quality score removed from each key so
 *     that rows differing only in quality score are combined.
 *
 * Changes with respect to the previous version:
 *   - the delta key manager is built from copies of the covariate lists; previously addAll() was
 *     called on a subList() view, which writes through to the list returned by the key manager;
 *   - the CSV is emitted once, after every table has been aggregated, instead of from inside the
 *     table loop, so the output no longer depends on the all-covariates table being the last one.
 *
 * @param deltaTableFile    the stream to write to
 * @param map               the recalibration tables with their key managers
 * @param recalibrationMode label written in the last column of every row
 * @param printHeader       whether to write the CSV header line before the data rows
 * @throws ReviewedStingException if no table with optional covariates is present
 */
private static void writeCSV(PrintStream deltaTableFile, LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>> map, String recalibrationMode, boolean printHeader) {
    final int QUALITY_SCORE_COVARIATE_INDEX = 1;
    final Map<BitSet, RecalDatum> deltaTable = new HashMap<BitSet, RecalDatum>();
    BQSRKeyManager deltaKeyManager = null;

    // Build the key manager of the delta table from the table that has optional covariates.
    for (Map.Entry<BQSRKeyManager, Map<BitSet, RecalDatum>> tableEntry : map.entrySet()) {
        final BQSRKeyManager keyManager = tableEntry.getKey();
        if (keyManager.getOptionalCovariates().size() > 0) {
            final List<Covariate> allRequired = keyManager.getRequiredCovariates();
            // Independent copies: subList() only returns a view of the underlying list.
            final List<Covariate> requiredCovariates = new ArrayList<Covariate>(allRequired.subList(0, 1)); // the read group is the only required covariate
            final List<Covariate> optionalCovariates = new ArrayList<Covariate>(allRequired.subList(1, 2)); // the quality score is treated as an optional covariate
            optionalCovariates.addAll(keyManager.getOptionalCovariates());                                  // followed by all the real optional covariates
            deltaKeyManager = new BQSRKeyManager(requiredCovariates, optionalCovariates);
        }
    }
    if (deltaKeyManager == null)
        throw new ReviewedStingException ("Couldn't find the covariates table");

    // Aggregate the quality score table and the optional covariates table into the delta table.
    for (Map.Entry<BQSRKeyManager, Map<BitSet, RecalDatum>> tableEntry : map.entrySet()) {
        final BQSRKeyManager keyManager = tableEntry.getKey();
        final Map<BitSet, RecalDatum> table = tableEntry.getValue();

        if (keyManager.getRequiredCovariates().size() == 2 && keyManager.getOptionalCovariates().isEmpty()) { // the QualityScore table
            for (Map.Entry<BitSet, RecalDatum> entry : table.entrySet()) {
                final List<Object> covs = keyManager.keySetFrom(entry.getKey());    // read group, quality score, event type
                final List<Object> newCovs = new ArrayList<Object>(4);
                newCovs.add(0, covs.get(0));                                        // read group
                newCovs.add(1, covs.get(1));                                        // the quality score becomes the covariate value
                newCovs.add(2, "QualityScore");                                     // under the covariate name QualityScore
                newCovs.add(3, covs.get(2));                                        // event type
                final BitSet deltaKey = deltaKeyManager.bitSetFromKey(newCovs.toArray());
                addToDeltaTable(deltaTable, deltaKey, entry.getValue());
            }
        }
        else if (keyManager.getOptionalCovariates().size() > 0) {                   // the optional covariates table
            for (Map.Entry<BitSet, RecalDatum> entry : table.entrySet()) {
                final List<Object> covs = keyManager.keySetFrom(entry.getKey());
                covs.remove(QUALITY_SCORE_COVARIATE_INDEX);                         // drop the quality score so rows are aggregated regardless of it
                final BitSet deltaKey = deltaKeyManager.bitSetFromKey(covs.toArray());
                addToDeltaTable(deltaTable, deltaKey, entry.getValue());
            }
        }
    }

    // Output the csv file. A table with optional covariates is guaranteed to have been processed
    // at this point, otherwise the missing-table check above would have thrown.
    if (printHeader) {
        final List<String> header = new LinkedList<String>();
        header.add("ReadGroup");
        header.add("CovariateValue");
        header.add("CovariateName");
        header.add("EventType");
        header.add("Observations");
        header.add("Errors");
        header.add("EmpiricalQuality");
        header.add("AverageReportedQuality");
        header.add("Accuracy");
        header.add("Recalibration");
        deltaTableFile.println(Utils.join(",", header));
    }

    // print each data line
    for (Map.Entry<BitSet, RecalDatum> deltaEntry : deltaTable.entrySet()) {
        final List<Object> deltaKeys = deltaKeyManager.keySetFrom(deltaEntry.getKey());
        final RecalDatum deltaDatum = deltaEntry.getValue();
        deltaTableFile.print(Utils.join(",", deltaKeys));
        deltaTableFile.print("," + deltaDatum.stringForCSV());
        deltaTableFile.println("," + recalibrationMode);
    }
}
/**
* Loop over all the collapsed tables and turn the recalDatums found there into an empirical quality score
* that will be used in the sequential calculation in TableRecalibrationWalker
* Updates the current RecalDatum element in the delta table.
*
* @param smoothing The smoothing parameter that goes into empirical quality score calculation
* @param maxQual At which value to cap the quality scores
*/
/**
 * For every event type, computes the combined empirical quality of each datum in the read group
 * table, the quality score table and every per-covariate table; each per-covariate table is then
 * also passed through checkForSingletons.
 *
 * @param smoothing the smoothing parameter that goes into the empirical quality score calculation
 * @param maxQual   the value at which quality scores are capped
 */
public final void generateEmpiricalQualities(final int smoothing, final int maxQual) {
    for (final BaseRecalibrationType eventType : BaseRecalibrationType.values()) {
        recursivelyGenerateEmpiricalQualities(dataCollapsedReadGroup.get(eventType).data, smoothing, maxQual);
        recursivelyGenerateEmpiricalQualities(dataCollapsedQualityScore.get(eventType).data, smoothing, maxQual);

        final ArrayList<NestedHashMap> perCovariateTables = dataCollapsedByCovariate.get(eventType);
        for (final NestedHashMap covariateTable : perCovariateTables) {
            recursivelyGenerateEmpiricalQualities(covariateTable.data, smoothing, maxQual);
            checkForSingletons(covariateTable.data);
        }
    }
}
/**
 * Walks a nested map depth-first and calls calcCombinedEmpiricalQuality on every RecalDatum
 * found at the leaves. Any value that is not a RecalDatum is treated as a nested map.
 *
 * @param data      the (possibly nested) map to walk
 * @param smoothing passed through to calcCombinedEmpiricalQuality
 * @param maxQual   passed through to calcCombinedEmpiricalQuality
 */
private void recursivelyGenerateEmpiricalQualities(final Map data, final int smoothing, final int maxQual) {
    for (final Object value : data.values()) {
        if (!(value instanceof RecalDatum)) {
            // another layer of the nested hash map
            recursivelyGenerateEmpiricalQualities((Map) value, smoothing, maxQual);
            continue;
        }
        // reached a leaf
        ((RecalDatum) value).calcCombinedEmpiricalQuality(smoothing, maxQual);
    }
}
/**
 * Walks a nested map depth-first and empties every leaf-level map that contains exactly one
 * RecalDatum. As the original comment explains, a non-required covariate with a single element
 * must not be recalibrated again because that correction was already applied in a previous step
 * of the sequential calculation.
 *
 * @param data the (possibly nested) map to clean up
 */
private void checkForSingletons(final Map data) {
    for (final Object value : data.values()) {
        if (value instanceof RecalDatum) {
            // we are at the end of the nested hash maps
            if (data.size() == 1) {
                data.clear();
                break;      // the single entry was the only one to visit
            }
        } else {
            // another layer of the nested hash map
            checkForSingletons((Map) value);
        }
    }
}
/**
* Get the appropriate collapsed table out of the set of all the tables held by this Object
* If it doesn't have an element yet, it creates an RecalDatum element and adds it to the delta table.
*
* @param covariate Which covariate indexes the desired collapsed HashMap
* @return The desired collapsed HashMap
* @param deltaTable the delta table
* @param deltaKey the key to the table
* @param recalDatum the recal datum to combine with the accuracyDatum element in the table
*/
public final NestedHashMap getCollapsedTable(final int covariate, final BaseRecalibrationType errorModel) {
if (covariate == 0) {
return dataCollapsedReadGroup.get(errorModel); // Table where everything except read group has been collapsed
}
else if (covariate == 1) {
return dataCollapsedQualityScore.get(errorModel); // Table where everything except read group and quality score has been collapsed
}
else {
return dataCollapsedByCovariate.get(errorModel).get(covariate - 2); // Table where everything except read group, quality score, and given covariate has been collapsed
}
private static void addToDeltaTable(Map<BitSet, RecalDatum> deltaTable, BitSet deltaKey, RecalDatum recalDatum) {
    final RecalDatum existing = deltaTable.get(deltaKey);
    if (existing != null) {
        // a datum is already stored under this key: fold the new one into it
        existing.combine(recalDatum);
        return;
    }
    // first time this key is seen: store a copy carrying the same values as the current datum
    deltaTable.put(deltaKey, new RecalDatum(recalDatum));
}
/**
* Section of code shared between the two recalibration walkers which uses the command line arguments to adjust attributes of the read such as quals or platform string
*
* @param read The read to adjust
* @param RAC The list of shared command line arguments
*/
public static void parseSAMRecord(final GATKSAMRecord read, final RecalibrationArgumentCollection RAC) {
public static void parsePlatformForRead(final GATKSAMRecord read, final RecalibrationArgumentCollection RAC) {
GATKSAMReadGroupRecord readGroup = read.getReadGroup();
if (RAC.FORCE_PLATFORM != null && (readGroup.getPlatform() == null || !readGroup.getPlatform().equals(RAC.FORCE_PLATFORM))) {
@ -295,262 +492,53 @@ public class RecalDataManager {
}
/**
* Parse through the color space of the read and add a new tag to the SAMRecord that says which bases are inconsistent with the color space
* Parse through the color space of the read and add a new tag to the SAMRecord that says which bases are
* inconsistent with the color space. If there is no call in the color space, this method returns true meaning
* this read should be skipped
*
* @param read The SAMRecord to parse
* @param strategy the strategy used for SOLID no calls
* @param read The SAMRecord to parse
* @return whether or not this read should be skipped
*/
public static void parseColorSpace(final GATKSAMRecord read) {
// If this is a SOLID read then we have to check if the color space is inconsistent. This is our only sign that SOLID has inserted the reference base
if (ReadUtils.isSOLiDRead(read)) {
if (read.getAttribute(RecalDataManager.COLOR_SPACE_INCONSISTENCY_TAG) == null) { // Haven't calculated the inconsistency array yet for this read
public static boolean isColorSpaceConsistent(final SOLID_NOCALL_STRATEGY strategy, final GATKSAMRecord read) {
if (ReadUtils.isSOLiDRead(read)) { // If this is a SOLID read then we have to check if the color space is inconsistent. This is our only sign that SOLID has inserted the reference base
if (read.getAttribute(RecalDataManager.COLOR_SPACE_INCONSISTENCY_TAG) == null) { // Haven't calculated the inconsistency array yet for this read
final Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG);
if (attr != null) {
byte[] colorSpace;
if (attr instanceof String) {
if (attr instanceof String)
colorSpace = ((String) attr).getBytes();
}
else {
else
throw new UserException.MalformedBAM(read, String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG, read.getReadName()));
}
// Loop over the read and calculate first the inferred bases from the color and then check if it is consistent with the read
byte[] readBases = read.getReadBases();
if (read.getReadNegativeStrandFlag()) {
byte[] readBases = read.getReadBases(); // Loop over the read and calculate first the inferred bases from the color and then check if it is consistent with the read
if (read.getReadNegativeStrandFlag())
readBases = BaseUtils.simpleReverseComplement(read.getReadBases());
}
final byte[] inconsistency = new byte[readBases.length];
int iii;
byte prevBase = colorSpace[0]; // The sentinel
for (iii = 0; iii < readBases.length; iii++) {
final byte thisBase = getNextBaseFromColor(read, prevBase, colorSpace[iii + 1]);
inconsistency[iii] = (byte) (thisBase == readBases[iii] ? 0 : 1);
prevBase = readBases[iii];
int i;
byte prevBase = colorSpace[0]; // The sentinel
for (i = 0; i < readBases.length; i++) {
final byte thisBase = getNextBaseFromColor(read, prevBase, colorSpace[i + 1]);
inconsistency[i] = (byte) (thisBase == readBases[i] ? 0 : 1);
prevBase = readBases[i];
}
read.setAttribute(RecalDataManager.COLOR_SPACE_INCONSISTENCY_TAG, inconsistency);
}
else if (strategy == SOLID_NOCALL_STRATEGY.THROW_EXCEPTION) // if the strategy calls for an exception, throw it
throw new UserException.MalformedBAM(read, "Unable to find color space information in SOLiD read. First observed at read with name = " + read.getReadName() + " Unfortunately this .bam file can not be recalibrated without color space information because of potential reference bias.");
}
else {
throw new UserException.MalformedBAM(read, "Unable to find color space information in SOLiD read. First observed at read with name = " + read.getReadName() +
" Unfortunately this .bam file can not be recalibrated without color space information because of potential reference bias.");
}
else
return true; // otherwise, just skip the read
}
}
}
/**
* Parse through the color space of the read and apply the desired --solid_recal_mode correction to the bases
* This method doesn't add the inconsistent tag to the read like parseColorSpace does
*
* The inconsistency between the read bases and the bases implied by the color space is recomputed locally
* and then handed to the correction routine selected by solidRecalMode. Modes other than SET_Q_ZERO,
* SET_Q_ZERO_BASE_N and REMOVE_REF_BIAS fall through without any correction.
*
* @param read The SAMRecord to parse
* @param originalQualScores The array of original quality scores to modify during the correction
* @param solidRecalMode Which mode of solid recalibration to apply
* @param refBases The reference for this read
* @return The originalQualScores array itself (not a copy); the SET_Q_ZERO modes zero entries of it in place
* @throws ReviewedStingException if the color space attribute is present but is not a String
* @throws UserException.MalformedBAM if the read carries no color space attribute
*/
public static byte[] calcColorSpace(final GATKSAMRecord read, byte[] originalQualScores, final SOLID_RECAL_MODE solidRecalMode, final byte[] refBases) {
final Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG);
if (attr != null) {
byte[] colorSpace;
if (attr instanceof String) {
colorSpace = ((String) attr).getBytes();
}
else {
throw new ReviewedStingException(String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG, read.getReadName()));
}
// Loop over the read and calculate first the inferred bases from the color and then check if it is consistent with the read
byte[] readBases = read.getReadBases();
// Scratch array with the same length as the read; every entry is overwritten in the loop below
final byte[] colorImpliedBases = readBases.clone();
byte[] refBasesDirRead = AlignmentUtils.alignmentToByteArray(read.getCigar(), read.getReadBases(), refBases); //BUGBUG: This needs to change when read walkers are changed to give the aligned refBases
// For negative strand reads both the read and the reference are reverse complemented before comparing against the colors
if (read.getReadNegativeStrandFlag()) {
readBases = BaseUtils.simpleReverseComplement(read.getReadBases());
refBasesDirRead = BaseUtils.simpleReverseComplement(refBasesDirRead.clone());
}
final int[] inconsistency = new int[readBases.length];
byte prevBase = colorSpace[0]; // The sentinel
// NOTE: colorSpace[iii + 1] is read for every base, so the color string must hold at least readBases.length + 1 bytes
for (int iii = 0; iii < readBases.length; iii++) {
final byte thisBase = getNextBaseFromColor(read, prevBase, colorSpace[iii + 1]);
colorImpliedBases[iii] = thisBase;
inconsistency[iii] = (thisBase == readBases[iii] ? 0 : 1);
prevBase = readBases[iii]; // the chain continues from the base stored in the read, not from the color-implied base
}
// Now that we have the inconsistency array apply the desired correction to the inconsistent bases
if (solidRecalMode == SOLID_RECAL_MODE.SET_Q_ZERO) { // Set inconsistent bases and the one before it to Q0
final boolean setBaseN = false;
originalQualScores = solidRecalSetToQZero(read, readBases, inconsistency, originalQualScores, refBasesDirRead, setBaseN);
}
else if (solidRecalMode == SOLID_RECAL_MODE.SET_Q_ZERO_BASE_N) {
// Same as SET_Q_ZERO, but the affected bases are additionally replaced by 'N'
final boolean setBaseN = true;
originalQualScores = solidRecalSetToQZero(read, readBases, inconsistency, originalQualScores, refBasesDirRead, setBaseN);
}
else if (solidRecalMode == SOLID_RECAL_MODE.REMOVE_REF_BIAS) { // Use the color space quality to probabilistically remove ref bases at inconsistent color space bases
// This mode only rewrites the read bases; the quality array is returned untouched
solidRecalRemoveRefBias(read, readBases, inconsistency, colorImpliedBases, refBasesDirRead);
}
}
else {
throw new UserException.MalformedBAM(read, "Unable to find color space information in SOLiD read. First observed at read with name = " + read.getReadName() +
" Unfortunately this .bam file can not be recalibrated without color space information because of potential reference bias.");
}
return originalQualScores;
}
/**
 * Tells whether a SOLiD read contains a color that is not one of '0', '1', '2' or '3'.
 *
 * Reads that are not SOLiD reads are never flagged. The first character of the color space string
 * (the sentinel) is ignored.
 *
 * @param read the read to inspect
 * @return true if a SOLiD read holds at least one color outside '0'..'3', false otherwise
 */
public static boolean checkNoCallColorSpace(final GATKSAMRecord read) {
    if (!ReadUtils.isSOLiDRead(read))
        return false; // only SOLiD reads carry color space to check

    final Object csTag = read.getAttribute(RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG);
    if (csTag == null)
        throw new UserException.MalformedBAM(read, "Unable to find color space information in SOLiD read. First observed at read with name = " + read.getReadName() +
                " Unfortunately this .bam file can not be recalibrated without color space information because of potential reference bias.");
    if (!(csTag instanceof String))
        throw new ReviewedStingException(String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG, read.getReadName()));

    final byte[] colors = ((String) csTag).substring(1).getBytes(); // trim off the Sentinel
    for (int i = 0; i < colors.length; i++) {
        final byte c = colors[i];
        final boolean isCalledColor = c == (byte) '0' || c == (byte) '1' || c == (byte) '2' || c == (byte) '3';
        if (!isCalledColor)
            return true; // There is a bad color in this SOLiD read and the user wants to skip over it
    }
    return false; // There aren't any color no calls in this SOLiD read
}
/**
* Perform the SET_Q_ZERO solid recalibration. Inconsistent color space bases and their previous base are set to quality zero
*
* A quality is only zeroed when the read base at that position equals the reference base. The bases array is
* written back to the read with setReadBases at the end, whether or not anything was changed.
*
* @param read The SAMRecord to recalibrate
* @param readBases The bases in the read which have been RC'd if necessary
* @param inconsistency The array of 1/0 that says if this base is inconsistent with its color
* @param originalQualScores The array of original quality scores to set to zero if needed
* @param refBases The reference which has been RC'd if necessary
* @param setBaseN Should we also set the base to N as well as quality zero in order to visualize in IGV or something similar
* @return The byte array of original quality scores some of which might have been set to zero (the same array instance that was passed in)
*/
private static byte[] solidRecalSetToQZero(final GATKSAMRecord read, byte[] readBases, final int[] inconsistency, final byte[] originalQualScores, final byte[] refBases, final boolean setBaseN) {
final boolean negStrand = read.getReadNegativeStrandFlag();
// Start at 1 because every inconsistent position also touches position iii - 1
for (int iii = 1; iii < originalQualScores.length; iii++) {
if (inconsistency[iii] == 1) {
if (readBases[iii] == refBases[iii]) {
// readBases/refBases are indexed in the RC'd direction, so on the negative strand the quality index is mirrored
if (negStrand) {
originalQualScores[originalQualScores.length - (iii + 1)] = (byte) 0;
}
else {
originalQualScores[iii] = (byte) 0;
}
if (setBaseN) {
readBases[iii] = (byte) 'N';
}
}
// Set the prev base to Q0 as well
if (readBases[iii - 1] == refBases[iii - 1]) {
// mirrored index of position iii - 1 is length - iii
if (negStrand) {
originalQualScores[originalQualScores.length - iii] = (byte) 0;
}
else {
originalQualScores[iii - 1] = (byte) 0;
}
if (setBaseN) {
readBases[iii - 1] = (byte) 'N';
}
}
}
}
if (negStrand) {
readBases = BaseUtils.simpleReverseComplement(readBases.clone()); // Put the bases back in reverse order to stuff them back in the read
}
read.setReadBases(readBases);
return originalQualScores;
}
/**
* Perform the REMOVE_REF_BIAS solid recalibration. Look at the color space qualities and probabilistically decide if the base should be changed to match the color or left as reference
*
* Only positions whose read base equals the reference base are candidates for replacement. The (possibly
* modified) bases are written back to the read with setReadBases.
*
* @param read The SAMRecord to recalibrate
* @param readBases The bases in the read which have been RC'd if necessary
* @param inconsistency The array of 1/0 that says if this base is inconsistent with its color
* @param colorImpliedBases The bases implied by the color space, RC'd if necessary
* @param refBases The reference which has been RC'd if necessary
* @throws ReviewedStingException if the color space quality attribute is present but is not a String
* @throws UserException.MalformedBAM if the read carries no color space quality attribute
*/
private static void solidRecalRemoveRefBias(final GATKSAMRecord read, byte[] readBases, final int[] inconsistency, final byte[] colorImpliedBases, final byte[] refBases) {
final Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG);
if (attr != null) {
byte[] colorSpaceQuals;
if (attr instanceof String) {
String x = (String) attr;
colorSpaceQuals = x.getBytes();
// return value is not used -- presumably the conversion happens in place on colorSpaceQuals; TODO confirm
SAMUtils.fastqToPhred(colorSpaceQuals);
}
else {
throw new ReviewedStingException(String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG, read.getReadName()));
}
// The first and the last position are never used as the starting point of a correction
for (int iii = 1; iii < inconsistency.length - 1; iii++) {
if (inconsistency[iii] == 1) {
for (int jjj = iii - 1; jjj <= iii; jjj++) { // Correct this base and the one before it along the direction of the read
if (jjj == iii || inconsistency[jjj] == 0) { // Don't want to correct the previous base a second time if it was already corrected in the previous step
if (readBases[jjj] == refBases[jjj]) {
// The two qualities compared are the color qualities at jjj and jjj + 1
if (colorSpaceQuals[jjj] == colorSpaceQuals[jjj + 1]) { // Equal evidence for the color implied base and the reference base, so flip a coin
final int rand = GenomeAnalysisEngine.getRandomGenerator().nextInt(2);
if (rand == 0) { // The color implied base won the coin flip
readBases[jjj] = colorImpliedBases[jjj];
}
}
else {
final int maxQuality = Math.max((int) colorSpaceQuals[jjj], (int) colorSpaceQuals[jjj + 1]);
final int minQuality = Math.min((int) colorSpaceQuals[jjj], (int) colorSpaceQuals[jjj + 1]);
int diffInQuality = maxQuality - minQuality;
int numLow = minQuality;
// A weight of zero would make the lower quality impossible to draw, so bump it (and the difference) by one
if (numLow == 0) {
numLow++;
diffInQuality++;
}
final int numHigh = Math.round(numLow * (float) Math.pow(10.0f, (float) diffInQuality / 10.0f)); // The color with higher quality is exponentially more likely
// Draw from [0, numLow + numHigh): values below numLow select the lower quality, the rest the higher one
final int rand = GenomeAnalysisEngine.getRandomGenerator().nextInt(numLow + numHigh);
if (rand >= numLow) { // higher q score won
if (maxQuality == (int) colorSpaceQuals[jjj]) {
readBases[jjj] = colorImpliedBases[jjj];
} // else ref color had higher q score, and won out, so nothing to do here
}
else { // lower q score won
if (minQuality == (int) colorSpaceQuals[jjj]) {
readBases[jjj] = colorImpliedBases[jjj];
} // else ref color had lower q score, and won out, so nothing to do here
}
}
}
}
}
}
}
if (read.getReadNegativeStrandFlag()) {
readBases = BaseUtils.simpleReverseComplement(readBases.clone()); // Put the bases back in reverse order to stuff them back in the read
}
read.setReadBases(readBases);
}
else { // No color space quality tag in file
throw new UserException.MalformedBAM(read, "REMOVE_REF_BIAS recal mode requires color space qualities but they can't be found for read: " + read.getReadName());
}
return false;
}
/**
* Given the base and the color calculate the next base in the sequence
*
* @param read the read
* @param prevBase The base
* @param color The color
* @return The next base in the sequence
@ -578,16 +566,16 @@ public class RecalDataManager {
* @param offset The offset in the read at which to check
* @return Returns true if the base was consistent with the color space
*/
public static boolean isInconsistentColorSpace(final GATKSAMRecord read, final int offset) {
public static boolean isColorSpaceConsistent(final GATKSAMRecord read, final int offset) {
final Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_INCONSISTENCY_TAG);
if (attr != null) {
final byte[] inconsistency = (byte[]) attr;
// NOTE: The inconsistency array is in the direction of the read, not aligned to the reference!
if (read.getReadNegativeStrandFlag()) { // Negative direction
return inconsistency[inconsistency.length - offset - 1] != (byte) 0;
return inconsistency[inconsistency.length - offset - 1] == (byte) 0;
}
else { // Forward direction
return inconsistency[offset] != (byte) 0;
return inconsistency[offset] == (byte) 0;
}
// This block of code is for if you want to check both the offset and the next base for color space inconsistency
@ -607,7 +595,7 @@ public class RecalDataManager {
}
else { // No inconsistency array, so nothing is inconsistent
return false;
return true;
}
}
@ -615,22 +603,24 @@ public class RecalDataManager {
* Computes all requested covariates for every offset in the given read
* by calling covariate.getValues(..).
*
* It populates an array of covariate values where result[i][j] is the covariate
* value for the ith position in the read and the jth covariate in
* requestedCovariates list.
*
* @param read The read for which to compute covariate values.
* @param requestedCovariates The list of requested covariates.
* @return An array of covariate values where result[i][j] is the covariate
* value for the ith position in the read and the jth covariate in
* requestedCovariates list.
* @return a matrix with all the covariates calculated for every base in the read
*/
public static void computeCovariates(final GATKSAMRecord read, final List<Covariate> requestedCovariates) {
public static ReadCovariates computeCovariates(final GATKSAMRecord read, final List<Covariate> requestedCovariates) {
final int numRequestedCovariates = requestedCovariates.size();
final int readLength = read.getReadLength();
final CovariateKeySet covariateKeySet = new CovariateKeySet(readLength, numRequestedCovariates);
final ReadCovariates readCovariates = new ReadCovariates(readLength, numRequestedCovariates);
// Loop through the list of requested covariates and compute the values of each covariate for all positions in this read
for (Covariate covariate : requestedCovariates)
covariateKeySet.addCovariate(covariate.getValues(read));
readCovariates.addCovariate(covariate.getValues(read));
read.setTemporaryAttribute(COVARS_ATTRIBUTE, covariateKeySet);
return readCovariates;
}
/**
@ -707,4 +697,42 @@ public class RecalDataManager {
return base;
}
}
/**
 * Builds the list of required covariates in their mandatory order.
 *
 * Note: the supplied classes are only counted as a sanity check; the covariate instances themselves are
 * created by hand so that read group always comes first and quality score second.
 *
 * @param classes list of required covariate classes (only its size is consulted)
 * @return a new list holding a ReadGroupCovariate followed by a QualityScoreCovariate
 */
private static ArrayList<Covariate> addRequiredCovariatesToList(List<Class<? extends RequiredCovariate>> classes) {
    final int expectedRequiredCovariates = 2;
    final int discovered = classes.size();
    final ArrayList<Covariate> required = new ArrayList<Covariate>(discovered);

    if (discovered != expectedRequiredCovariates)
        throw new ReviewedStingException("The number of required covariates has changed, this is a hard change in the code and needs to be inspected");

    // enforce the order with RG first and QS next.
    required.add(new ReadGroupCovariate());
    required.add(new QualityScoreCovariate());
    return required;
}
/**
 * Instantiates every standard covariate class and collects the instances in a list.
 *
 * Each class is created through its no-argument constructor; any failure to do so is reported as a
 * DynamicClassResolutionException naming the offending class.
 *
 * @param classes list of standard covariate classes to instantiate
 * @return a new list with one covariate instance per class, in the order of the input list
 */
private static ArrayList<Covariate> addStandardCovariatesToList(List<Class<? extends StandardCovariate>> classes) {
    final ArrayList<Covariate> standard = new ArrayList<Covariate>(classes.size());
    for (final Class<?> covariateType : classes) {
        try {
            standard.add((Covariate) covariateType.newInstance());
        } catch (Exception cause) {
            throw new DynamicClassResolutionException(covariateType, cause);
        }
    }
    return standard;
}
}

View File

@ -25,6 +25,10 @@ package org.broadinstitute.sting.gatk.walkers.bqsr;
* OTHER DEALINGS IN THE SOFTWARE.
*/
import org.broadinstitute.sting.utils.MathUtils;
import java.util.Random;
/**
* Created by IntelliJ IDEA.
* User: rpoplin
@ -33,10 +37,11 @@ package org.broadinstitute.sting.gatk.walkers.bqsr;
* An individual piece of recalibration data. Each bin counts up the number of observations and the number of reference mismatches seen for that combination of covariates.
*/
public class RecalDatum extends RecalDatumOptimized {
public class RecalDatum extends Datum {
private double estimatedQReported; // estimated reported quality score based on combined data's individual q-reporteds and number of observations
private double empiricalQuality; // the empirical quality for datums that have been collapsed together (by read group and reported quality, for example)
private double estimatedQReported; // estimated reported quality score based on combined data's individual q-reporteds and number of observations
private double empiricalQuality; // the empirical quality for datums that have been collapsed together (by read group and reported quality, for example)
//---------------------------------------------------------------------------------------------------------------
//
@ -48,7 +53,7 @@ public class RecalDatum extends RecalDatumOptimized {
numObservations = 0L;
numMismatches = 0L;
estimatedQReported = 0.0;
empiricalQuality = 0.0;
empiricalQuality = -1.0;
}
public RecalDatum(final long _numObservations, final long _numMismatches, final double _estimatedQReported, final double _empiricalQuality) {
@ -65,48 +70,81 @@ public class RecalDatum extends RecalDatumOptimized {
this.empiricalQuality = copy.empiricalQuality;
}
//---------------------------------------------------------------------------------------------------------------
//
// increment methods
//
//---------------------------------------------------------------------------------------------------------------
public final void combine(final RecalDatum other) {
public void combine(final RecalDatum other) {
final double sumErrors = this.calcExpectedErrors() + other.calcExpectedErrors();
this.increment(other.numObservations, other.numMismatches);
this.estimatedQReported = -10 * Math.log10(sumErrors / (double) this.numObservations);
//if( this.estimatedQReported > QualityUtils.MAX_REASONABLE_Q_SCORE ) { this.estimatedQReported = QualityUtils.MAX_REASONABLE_Q_SCORE; }
this.estimatedQReported = -10 * Math.log10(sumErrors / this.numObservations);
this.empiricalQuality = -1.0; // reset the empirical quality calculation so we never have a wrongly calculated empirical quality stored
}
//---------------------------------------------------------------------------------------------------------------
//
// methods to derive empirical quality score
//
//---------------------------------------------------------------------------------------------------------------
public final void calcCombinedEmpiricalQuality(final int smoothing, final int maxQual) {
this.empiricalQuality = empiricalQualDouble(smoothing, maxQual); // cache the value so we don't call log over and over again
public final void calcCombinedEmpiricalQuality() {
this.empiricalQuality = empiricalQualDouble(); // cache the value so we don't call log over and over again
}
/**
 * Recomputes the estimated reported quality as the phred-scaled ratio of expected errors to observations
 * and stores it in this datum.
 */
public final void calcEstimatedReportedQuality() {
    final double expectedErrorRate = calcExpectedErrors() / numObservations;
    this.estimatedQReported = -10 * Math.log10(expectedErrorRate);
}
//---------------------------------------------------------------------------------------------------------------
//
// misc. methods
//
//---------------------------------------------------------------------------------------------------------------
/**
* Returns the stored estimated reported quality; nothing is recomputed here.
*
* @return the current value of the estimatedQReported field
*/
public final double getEstimatedQReported() {
return estimatedQReported;
}
/**
 * Returns the empirical quality, computing and caching it first when the cached value is negative
 * (the marker used for "not calculated yet").
 *
 * @return the cached empirical quality
 */
public final double getEmpiricalQuality() {
    final boolean needsCalculation = empiricalQuality < 0;
    if (needsCalculation) {
        calcCombinedEmpiricalQuality();
    }
    return empiricalQuality;
}
private double calcExpectedErrors() {
/**
 * Creates a new datum carrying the same observation count, mismatch count, estimated reported quality
 * and (possibly not yet calculated) empirical quality as this one.
 *
 * @return a new recal datum object with the same contents of this datum.
 */
public RecalDatum copy() {
    final RecalDatum duplicate = new RecalDatum(numObservations, numMismatches, estimatedQReported, empiricalQuality);
    return duplicate;
}
/**
 * Formats this datum as "observations,mismatches,empiricalQuality", where the empirical quality is
 * floored and narrowed to a byte.
 */
@Override
public String toString() {
    final byte flooredEmpiricalQuality = (byte) Math.floor(getEmpiricalQuality());
    return String.format("%d,%d,%d", numObservations, numMismatches, flooredEmpiricalQuality);
}
/**
 * Formats this datum for the csv output: the toString() fields, followed by the floored estimated
 * reported quality and the difference (empirical - estimated reported) with two decimals.
 *
 * @return the comma separated representation of this datum
 */
public String stringForCSV() {
    final String countsAndEmpirical = toString();
    final byte flooredReportedQuality = (byte) Math.floor(getEstimatedQReported());
    final double empiricalMinusReported = getEmpiricalQuality() - getEstimatedQReported();
    return String.format("%s,%d,%.2f", countsAndEmpirical, flooredReportedQuality, empiricalMinusReported);
}
/**
 * Expected number of errors: the observation count scaled by the error probability that corresponds
 * to the estimated reported quality.
 */
private double calcExpectedErrors() {
    final double errorProbability = qualToErrorProb(estimatedQReported);
    return (double) this.numObservations * errorProbability;
}
/**
 * Converts a phred-scaled quality into an error probability, i.e. 10^(-qual/10).
 *
 * @param qual the phred-scaled quality
 * @return the corresponding error probability
 */
private double qualToErrorProb(final double qual) {
    final double exponent = qual / -10.0;
    return Math.pow(10.0, exponent);
}
/**
 * Creates a RecalDatum with randomly drawn observation and error counts.
 *
 * The error count is clamped to the observation count: a datum can never have seen more mismatches
 * than observations, and unclamped draws used to produce such impossible datums whenever the random
 * error count happened to exceed the random observation count.
 *
 * @param maxObservations exclusive upper bound for the number of observations (must be positive)
 * @param maxErrors       exclusive upper bound for the number of errors (must be positive)
 * @return a new datum whose empirical quality is derived from the drawn counts and whose estimated
 *         reported quality lies within +/- 5 of that empirical quality
 */
public static RecalDatum createRandomRecalDatum(int maxObservations, int maxErrors) {
    final Random random = new Random();
    final int nObservations = random.nextInt(maxObservations);
    // errors are a subset of the observations, so never report more errors than observations
    final int nErrors = Math.min(random.nextInt(maxErrors), nObservations);
    final Datum datum = new Datum(nObservations, nErrors);
    final double empiricalQuality = datum.empiricalQualDouble();
    final double estimatedQReported = empiricalQuality + ((10 * random.nextDouble()) - 5); // empirical quality +/- 5.
    return new RecalDatum(nObservations, nErrors, estimatedQReported, empiricalQuality);
}
/**
 * We don't compare the estimated quality reported because it may be different when read from
 * report tables.
 *
 * The empirical quality is compared with a tolerance of 0.001 rather than exactly.
 *
 * @param o the other recal datum
 * @return true if the two recal datums have the same number of observations, errors and empirical quality.
 */
@Override
public boolean equals(Object o) {
    if (!(o instanceof RecalDatum))
        return false;
    RecalDatum other = (RecalDatum) o;
    return super.equals(o) &&
            MathUtils.compareDoubles(this.empiricalQuality, other.empiricalQuality, 0.001) == 0;
}

/**
 * Hash code matching equals(): only the observation and mismatch counts take part.
 *
 * The empirical quality is deliberately left out because equals() compares it with a tolerance, so two
 * equal datums may hold slightly different values. The estimated reported quality is left out because
 * equals() ignores it.
 * NOTE(review): this assumes super.equals() only returns true for datums with identical counts -- confirm
 * against Datum. The counts are mutable, so a datum must not be modified while it is used as a hash key.
 *
 * @return a hash of numObservations and numMismatches
 */
@Override
public int hashCode() {
    int result = (int) (numObservations ^ (numObservations >>> 32));
    result = 31 * result + (int) (numMismatches ^ (numMismatches >>> 32));
    return result;
}
}

View File

@ -27,10 +27,10 @@ package org.broadinstitute.sting.gatk.walkers.bqsr;
import org.broad.tribble.Feature;
import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.gatk.walkers.recalibration.CountCovariatesGatherer;
import org.broadinstitute.sting.gatk.report.GATKReportTable;
import org.broadinstitute.sting.utils.Utils;
import java.io.PrintStream;
import java.util.ArrayList;
import java.io.File;
import java.util.Collections;
import java.util.List;
@ -52,7 +52,7 @@ public class RecalibrationArgumentCollection {
* Please note however that the statistics reported by the tool will not accurately reflect those sites skipped by the -XL argument.
*/
@Input(fullName = "knownSites", shortName = "knownSites", doc = "A database of known polymorphic sites to skip over in the recalibration algorithm", required = false)
protected List<RodBinding<Feature>> knownSites = Collections.emptyList();
public List<RodBinding<Feature>> knownSites = Collections.emptyList();
/**
* After the header, data records occur one per line until the end of the file. The first several items on a line are the
@ -60,27 +60,27 @@ public class RecalibrationArgumentCollection {
* three items are the data- that is, number of observations for this combination of covariates, number of reference mismatches,
* and the raw empirical quality score calculated by phred-scaling the mismatch rate.
*/
@Gather(CountCovariatesGatherer.class)
@Gather(BQSRGatherer.class)
@Output
protected PrintStream RECAL_FILE;
public File RECAL_FILE;
/**
* List all implemented covariates.
*/
@Argument(fullName = "list", shortName = "ls", doc = "List the available covariates and exit", required = false)
protected boolean LIST_ONLY = false;
public boolean LIST_ONLY = false;
/**
* Covariates to be used in the recalibration. Each covariate is given as a separate cov parameter. ReadGroup and ReportedQuality are required covariates and are already added for you. See the list of covariates with -list.
*/
@Argument(fullName = "covariate", shortName = "cov", doc = "Covariates to be used in the recalibration. Each covariate is given as a separate cov parameter. ReadGroup and ReportedQuality are required covariates and are already added for you.", required = false)
protected String[] COVARIATES = null;
public String[] COVARIATES = null;
/*
* Use the standard set of covariates in addition to the ones listed using the -cov argument
*/
@Argument(fullName = "standard_covs", shortName = "standard", doc = "Use the standard set of covariates in addition to the ones listed using the -cov argument", required = false)
protected boolean USE_STANDARD_COVARIATES = true;
public boolean USE_STANDARD_COVARIATES = true;
/////////////////////////////
// Debugging-only Arguments
@ -90,17 +90,7 @@ public class RecalibrationArgumentCollection {
*/
@Hidden
@Argument(fullName = "run_without_dbsnp_potentially_ruining_quality", shortName = "run_without_dbsnp_potentially_ruining_quality", required = false, doc = "If specified, allows the recalibrator to be used without a dbsnp rod. Very unsafe and for expert users only.")
protected boolean RUN_WITHOUT_DBSNP = false;
/////////////////////////////
// protected Member Variables
/////////////////////////////
protected final RecalDataManager dataManager = new RecalDataManager(); // Holds the data HashMap used to create collapsed data hashmaps (delta delta tables)
protected final ArrayList<Covariate> requestedCovariates = new ArrayList<Covariate>();// A list to hold the covariate objects that were requested
protected final String SKIP_RECORD_ATTRIBUTE = "SKIP"; // used to label reads that should be skipped.
protected final String SEEN_ATTRIBUTE = "SEEN"; // used to label reads as processed.
public boolean RUN_WITHOUT_DBSNP = false;
/**
* CountCovariates and TableRecalibration accept a --solid_recal_mode <MODE> flag which governs how the recalibrator handles the
@ -153,6 +143,19 @@ public class RecalibrationArgumentCollection {
@Argument(fullName = "deletions_default_quality", shortName = "ddq", doc = "default quality for the base deletions covariate", required = false)
public byte DELETIONS_DEFAULT_QUALITY = 45;
/**
* Reads with low quality bases on either tail (beginning or end) will not be considered in the context. This parameter defines the quality below which (inclusive) a tail is considered low quality
*/
@Argument(fullName = "low_quality_tail", shortName = "lqt", doc = "minimum quality for the bases in the tail of the reads to be considered", required = false)
public byte LOW_QUAL_TAIL = 2;
/**
* BQSR generates a quantization table for quick quantization later by subsequent tools. BQSR does not quantize the base qualities, this is done by the engine with the -qq or -BQSR options.
* This parameter tells BQSR the number of levels of quantization to use to build the quantization table.
*/
@Argument(fullName = "quantizing_levels", shortName = "ql", required = false, doc = "number of distinct quality scores in the quantized output")
public int QUANTIZING_LEVELS = 16;
@Hidden
@Argument(fullName = "default_platform", shortName = "dP", required = false, doc = "If a read has no platform then default to the provided String. Valid options are illumina, 454, and solid.")
@ -160,6 +163,37 @@ public class RecalibrationArgumentCollection {
@Hidden
@Argument(fullName = "force_platform", shortName = "fP", required = false, doc = "If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and solid.")
public String FORCE_PLATFORM = null;
@Hidden
@Argument(fullName = "keep_intermediate_files", shortName = "k", required = false, doc ="does not remove the temporary csv file created to generate the plots")
public boolean KEEP_INTERMEDIATE_FILES = false;
@Hidden
@Argument(fullName = "no_plots", shortName = "np", required = false, doc = "does not generate any plots -- useful for queue scatter/gathering")
public boolean NO_PLOTS = false;
public File recalibrationReport = null;
/**
 * Builds the "Arguments" report table holding the values of this argument collection.
 *
 * The table has one primary key column ("Argument") and one value column
 * (RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME) whose default cell content is the string "null".
 * Null COVARIATES and a null recalibrationReport are written explicitly as the string "null".
 *
 * The row keys written here are the ones RecalibrationReport.initializeArgumentCollectionTable matches
 * on when it rebuilds an argument collection from a report. "deletions_default_quality" is written
 * alongside the mismatches/insertions defaults so that a non-default value is not lost when the report
 * is read back.
 *
 * @return a new table with one row per argument
 */
public GATKReportTable generateReportTable() {
    final String valueColumn = RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME;
    GATKReportTable argumentsTable = new GATKReportTable("Arguments", "Recalibration argument collection values used in this run");
    argumentsTable.addPrimaryKey("Argument");
    argumentsTable.addColumn(valueColumn, "null");
    argumentsTable.set("covariate", valueColumn, (COVARIATES == null) ? "null" : Utils.join(",", COVARIATES));
    argumentsTable.set("standard_covs", valueColumn, USE_STANDARD_COVARIATES);
    argumentsTable.set("run_without_dbsnp", valueColumn, RUN_WITHOUT_DBSNP);
    argumentsTable.set("solid_recal_mode", valueColumn, SOLID_RECAL_MODE);
    argumentsTable.set("solid_nocall_strategy", valueColumn, SOLID_NOCALL_STRATEGY);
    argumentsTable.set("mismatches_context_size", valueColumn, MISMATCHES_CONTEXT_SIZE);
    argumentsTable.set("insertions_context_size", valueColumn, INSERTIONS_CONTEXT_SIZE);
    argumentsTable.set("deletions_context_size", valueColumn, DELETIONS_CONTEXT_SIZE);
    argumentsTable.set("mismatches_default_quality", valueColumn, MISMATCHES_DEFAULT_QUALITY);
    argumentsTable.set("insertions_default_quality", valueColumn, INSERTIONS_DEFAULT_QUALITY);
    // previously missing: without this row the deletions default quality never made it into the report
    argumentsTable.set("deletions_default_quality", valueColumn, DELETIONS_DEFAULT_QUALITY);
    argumentsTable.set("low_quality_tail", valueColumn, LOW_QUAL_TAIL);
    argumentsTable.set("default_platform", valueColumn, DEFAULT_PLATFORM);
    argumentsTable.set("force_platform", valueColumn, FORCE_PLATFORM);
    argumentsTable.set("quantizing_levels", valueColumn, QUANTIZING_LEVELS);
    argumentsTable.set("keep_intermediate_files", valueColumn, KEEP_INTERMEDIATE_FILES);
    argumentsTable.set("no_plots", valueColumn, NO_PLOTS);
    argumentsTable.set("recalibration_report", valueColumn, recalibrationReport == null ? "null" : recalibrationReport.getAbsolutePath());
    return argumentsTable;
}
}

View File

@ -0,0 +1,359 @@
package org.broadinstitute.sting.gatk.walkers.bqsr;
import org.broadinstitute.sting.gatk.report.GATKReport;
import org.broadinstitute.sting.gatk.report.GATKReportTable;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.io.File;
import java.io.PrintStream;
import java.util.*;
/**
* This class has all the static functionality for reading a recalibration report file into memory.
*
* @author carneiro
* @since 3/26/12
*/
public class RecalibrationReport {
private QuantizationInfo quantizationInfo; // histogram containing the counts for qual quantization (calculated after recalibration is done)
private final LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>> keysAndTablesMap; // quick access reference to the read group table and its key manager
private final ArrayList<Covariate> requestedCovariates = new ArrayList<Covariate>(); // list of all covariates to be used in this calculation
private final GATKReportTable argumentTable; // keep the argument table untouched just for output purposes
private final RecalibrationArgumentCollection RAC; // necessary for quantizing qualities with the same parameter
/**
 * Loads a recalibration report file and rebuilds the in-memory state from it.
 *
 * In order: the argument table is read and turned into an argument collection, the quantization table is
 * parsed, the covariates are created from the argument collection and initialized, and finally one recal
 * table is parsed per cumulative set of required covariates (read group, then quality score) followed by
 * the table that covers all covariates. Tables are stored in insertion order.
 *
 * @param RECAL_FILE the recalibration report file to read
 * @throws ReviewedStingException if the required covariates do not map onto the read group / quality score tables
 */
public RecalibrationReport(final File RECAL_FILE) {
    final GATKReport report = new GATKReport(RECAL_FILE);

    // the argument table is kept as-is for output; the argument collection is rebuilt from it
    argumentTable = report.getTable(RecalDataManager.ARGUMENT_REPORT_TABLE_TITLE);
    RAC = initializeArgumentCollectionTable(argumentTable);

    quantizationInfo = initializeQuantizationTable(report.getTable(RecalDataManager.QUANTIZED_REPORT_TABLE_TITLE));

    // required covariates come first, optional ones second
    final Pair<ArrayList<Covariate>, ArrayList<Covariate>> covariatePair = RecalDataManager.initializeCovariates(RAC);
    final ArrayList<Covariate> required = covariatePair.getFirst();
    final ArrayList<Covariate> optional = covariatePair.getSecond();
    requestedCovariates.addAll(required);
    requestedCovariates.addAll(optional);
    for (Covariate requested : requestedCovariates) {
        requested.initialize(RAC); // covariates configure themselves from the shared argument collection
    }

    keysAndTablesMap = new LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>>();

    // grow the list of required covariates one at a time: each step corresponds to one table in the report
    final ArrayList<Covariate> requiredSoFar = new ArrayList<Covariate>(required.size());
    final ArrayList<Covariate> noOptionalCovariates = new ArrayList<Covariate>();
    for (Covariate covariate : required) {
        requiredSoFar.add(covariate);
        final BQSRKeyManager cumulativeKeyManager = new BQSRKeyManager(requiredSoFar, noOptionalCovariates);
        final int nRequiredSoFar = requiredSoFar.size();

        final Map<BitSet, RecalDatum> cumulativeTable;
        if (nRequiredSoFar == 1) {
            // a single required covariate: the read group table
            cumulativeTable = parseReadGroupTable(cumulativeKeyManager, report.getTable(RecalDataManager.READGROUP_REPORT_TABLE_TITLE));
        } else if (nRequiredSoFar == 2 && noOptionalCovariates.isEmpty()) {
            // both required covariates and no optional ones: the quality score table
            cumulativeTable = parseQualityScoreTable(cumulativeKeyManager, report.getTable(RecalDataManager.QUALITY_SCORE_REPORT_TABLE_TITLE));
        } else {
            throw new ReviewedStingException("Unrecognized table. Did you add an extra required covariate? This is a hard check.");
        }
        keysAndTablesMap.put(cumulativeKeyManager, cumulativeTable);
    }

    // last table: every required and optional covariate together
    final BQSRKeyManager allCovariatesKeyManager = new BQSRKeyManager(required, optional);
    final GATKReportTable allCovariatesReportTable = report.getTable(RecalDataManager.ALL_COVARIATES_REPORT_TABLE_TITLE);
    keysAndTablesMap.put(allCovariatesKeyManager, parseAllCovariatesTable(allCovariatesKeyManager, allCovariatesReportTable));
}
/**
 * Creates a report directly from already-built components, without reading a file.
 * The list of requested covariates is left empty by this constructor.
 *
 * @param quantizationInfo the quantization information to hold
 * @param keysAndTablesMap the key managers and their recal tables, in table order
 * @param argumentTable    the argument table used when writing the report out
 * @param RAC              the argument collection associated with the tables
 */
protected RecalibrationReport(QuantizationInfo quantizationInfo, LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>> keysAndTablesMap, GATKReportTable argumentTable, RecalibrationArgumentCollection RAC) {
    this.RAC = RAC;
    this.argumentTable = argumentTable;
    this.keysAndTablesMap = keysAndTablesMap;
    this.quantizationInfo = quantizationInfo;
}
/**
 * Merges another recalibration report into this one, table by table.
 *
 * The tables of both reports are walked in parallel, in their stored order. Every key of the other report
 * is translated through the other report's key manager into its object form and re-encoded with this
 * report's key manager. If this report has no datum for the resulting key, the other report's datum object
 * is stored as-is (not copied); otherwise the two datums are merged with RecalDatum.combine.
 *
 * Note: this method does NOT recalculate the empirical qualities or the quantized qualities. It is intended
 * for combining a series of reports, where recalculating only makes sense once all of them have been merged;
 * call calculateEmpiricalAndQuantizedQualities() afterwards.
 *
 * Note 2: the original author states that the reported quality is nonetheless updated during the merge;
 * that would happen inside RecalDatum.combine, which is not visible from this class.
 *
 * @param other the recalibration report to combine with this one
 */
public void combine(RecalibrationReport other) {
    final Iterator<Map.Entry<BQSRKeyManager, Map<BitSet, RecalDatum>>> mine = keysAndTablesMap.entrySet().iterator();
    final Iterator<Map.Entry<BQSRKeyManager, Map<BitSet, RecalDatum>>> theirs = other.getKeysAndTablesMap().entrySet().iterator();

    while (theirs.hasNext()) {
        final Map.Entry<BQSRKeyManager, Map<BitSet, RecalDatum>> theirEntry = theirs.next();
        final Map.Entry<BQSRKeyManager, Map<BitSet, RecalDatum>> myEntry = mine.next(); // assumes this report has at least as many tables

        final Map<BitSet, RecalDatum> myTable = myEntry.getValue();
        final BQSRKeyManager myKeyManager = myEntry.getKey();
        final BQSRKeyManager theirKeyManager = theirEntry.getKey();

        for (Map.Entry<BitSet, RecalDatum> theirRow : theirEntry.getValue().entrySet()) {
            final RecalDatum theirDatum = theirRow.getValue();

            // bit keys are not shared between key managers: decode with theirs, re-encode with ours
            final List<Object> decodedKey = theirKeyManager.keySetFrom(theirRow.getKey());
            final BitSet myBitKey = myKeyManager.bitSetFromKey(decodedKey.toArray());

            final RecalDatum myDatum = myTable.get(myBitKey);
            if (myDatum != null) {
                myDatum.combine(theirDatum);
            } else {
                myTable.put(myBitKey, theirDatum);
            }
        }
    }
}
/**
 * @return the quantization information currently held by this report
 */
public QuantizationInfo getQuantizationInfo() {
    return this.quantizationInfo;
}
/**
 * @return the live (not copied) map from each key manager to its recal table, in table order
 */
public LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>> getKeysAndTablesMap() {
    return this.keysAndTablesMap;
}
/**
 * @return the live (not copied) list of covariates requested for this report: required first, then optional
 */
public ArrayList<Covariate> getRequestedCovariates() {
    return this.requestedCovariates;
}
/**
 * Parses the table that covers all covariates.
 *
 * The key of each row is built from the read group, quality score, covariate value, covariate name and
 * event type columns, in that order. This table is parsed without an estimated-Q-reported column.
 *
 * @param keyManager  the key manager for this table
 * @param reportTable the GATKReport table containing data for this table
 * @return a lookup table indexed by bitsets containing the recal datum parsed from every row
 */
private Map<BitSet, RecalDatum> parseAllCovariatesTable(BQSRKeyManager keyManager, GATKReportTable reportTable) {
    final ArrayList<String> keyColumns = new ArrayList<String>(Arrays.asList(
            RecalDataManager.READGROUP_COLUMN_NAME,
            RecalDataManager.QUALITY_SCORE_COLUMN_NAME,
            RecalDataManager.COVARIATE_VALUE_COLUMN_NAME,
            RecalDataManager.COVARIATE_NAME_COLUMN_NAME,
            RecalDataManager.EVENT_TYPE_COLUMN_NAME)); // the event type must stay last
    return genericRecalTableParsing(keyManager, reportTable, keyColumns, false);
}
/**
 * Parses the quality score table.
 *
 * The key of each row is built from the read group, quality score and event type columns, in that order.
 * This table is parsed without an estimated-Q-reported column.
 *
 * @param keyManager  the key manager for this table
 * @param reportTable the GATKReport table containing data for this table
 * @return a lookup table indexed by bitsets containing the recal datum parsed from every row
 */
private Map<BitSet, RecalDatum> parseQualityScoreTable(BQSRKeyManager keyManager, GATKReportTable reportTable) {
    final ArrayList<String> keyColumns = new ArrayList<String>(Arrays.asList(
            RecalDataManager.READGROUP_COLUMN_NAME,
            RecalDataManager.QUALITY_SCORE_COLUMN_NAME,
            RecalDataManager.EVENT_TYPE_COLUMN_NAME)); // the event type must stay last
    return genericRecalTableParsing(keyManager, reportTable, keyColumns, false);
}
/**
 * Parses the read group table.
 *
 * The key of each row is built from the read group and event type columns, in that order. This is the
 * only table parsed with the estimated-Q-reported column enabled.
 *
 * @param keyManager  the key manager for this table
 * @param reportTable the GATKReport table containing data for this table
 * @return a lookup table indexed by bitsets containing the recal datum parsed from every row
 */
private Map<BitSet, RecalDatum> parseReadGroupTable(BQSRKeyManager keyManager, GATKReportTable reportTable) {
    final ArrayList<String> keyColumns = new ArrayList<String>(Arrays.asList(
            RecalDataManager.READGROUP_COLUMN_NAME,
            RecalDataManager.EVENT_TYPE_COLUMN_NAME)); // the event type must stay last
    return genericRecalTableParsing(keyManager, reportTable, keyColumns, true);
}
/**
 * Shared row-parsing logic for the read group, quality score and all-covariates tables.
 *
 * For every row, the values of the key columns are collected in order, the last one is converted from its
 * string form to an EventType, and the key manager turns the resulting array into the bit key. The
 * observation count, error count and empirical quality are then read from their columns. The estimated
 * reported quality comes from its own column when present, otherwise from the row's quality score column.
 *
 * @param keyManager                  the key manager for this table
 * @param reportTable                 the GATKReport table containing data for this table
 * @param columnNamesOrderedList      the columns that make up the key, in key order; the last one must be the event type column
 * @param hasEstimatedQReportedColumn whether the table has an estimated-Q-reported column (only the read group table does)
 * @return a lookup table indexed by bitsets containing the recal datum parsed from every row
 */
private Map<BitSet, RecalDatum> genericRecalTableParsing(BQSRKeyManager keyManager, GATKReportTable reportTable, ArrayList<String> columnNamesOrderedList, boolean hasEstimatedQReportedColumn) {
    final Map<BitSet, RecalDatum> parsedTable = new HashMap<BitSet, RecalDatum>(reportTable.getNumRows()*2);

    for (Object rowKey : reportTable.getPrimaryKeys()) {
        final int keyCount = columnNamesOrderedList.size();
        final Object[] keyValues = new Object[keyCount];
        int column = 0;
        while (column < keyCount) {
            // string values are fine for the key manager, except for the event type handled below
            keyValues[column] = reportTable.get(rowKey, columnNamesOrderedList.get(column));
            column++;
        }

        // the last key is always the event type: convert its string form ("M", "I" or "D") to the enum
        final int eventTypeIndex = keyValues.length - 1;
        keyValues[eventTypeIndex] = EventType.eventFrom((String) keyValues[eventTypeIndex]);
        final BitSet bitKey = keyManager.bitSetFromKey(keyValues);

        final long observations = (Long) reportTable.get(rowKey, RecalDataManager.NUMBER_OBSERVATIONS_COLUMN_NAME);
        final long errors = (Long) reportTable.get(rowKey, RecalDataManager.NUMBER_ERRORS_COLUMN_NAME);
        final double empiricalQuality = (Double) reportTable.get(rowKey, RecalDataManager.EMPIRICAL_QUALITY_COLUMN_NAME);

        final double estimatedQReported;
        if (hasEstimatedQReportedColumn) {
            estimatedQReported = (Double) reportTable.get(rowKey, RecalDataManager.ESTIMATED_Q_REPORTED_COLUMN_NAME);
        } else {
            // no dedicated column: fall back to the reported quality score of the row
            estimatedQReported = Byte.parseByte((String) reportTable.get(rowKey, RecalDataManager.QUALITY_SCORE_COLUMN_NAME));
        }

        parsedTable.put(bitKey, new RecalDatum(observations, errors, estimatedQReported, empiricalQuality));
    }
    return parsedTable;
}
/**
 * Parses the quantization table of the report.
 *
 * Each row's primary key is the original quality score; it indexes two arrays of size
 * QualityUtils.MAX_QUAL_SCORE + 1 that receive the row's quantized quality and its count. Qualities that
 * have no row in the table keep a null entry.
 *
 * @param table the GATKReportTable containing the quantization mappings
 * @return a QuantizationInfo built from the quantized qualities and counts, both indexed by original quality
 */
private QuantizationInfo initializeQuantizationTable(GATKReportTable table) {
    final int nQuals = QualityUtils.MAX_QUAL_SCORE + 1;
    final Byte[] quantizedByOriginal = new Byte[nQuals];
    final Long[] countByOriginal = new Long[nQuals];

    for (Object rowKey : table.getPrimaryKeys()) {
        final Object quantizedCell = table.get(rowKey, RecalDataManager.QUANTIZED_VALUE_COLUMN_NAME);
        final Object countCell = table.get(rowKey, RecalDataManager.QUANTIZED_COUNT_COLUMN_NAME);

        // every cell is parsed from its string form, whatever type the report handed back
        final byte original = Byte.parseByte(rowKey.toString());
        final byte quantized = Byte.parseByte(quantizedCell.toString());
        final long count = Long.parseLong(countCell.toString());

        quantizedByOriginal[original] = quantized;
        countByOriginal[original] = count;
    }
    return new QuantizationInfo(Arrays.asList(quantizedByOriginal), Arrays.asList(countByOriginal));
}
/**
 * Rebuilds an argument collection from the arguments table of a report.
 *
 * Starts from a default-constructed collection and overwrites one field per recognized row; rows with an
 * unrecognized key are ignored. A cell holding the string "null" is treated as a null value. A null
 * "covariate" value leaves the default COVARIATES untouched.
 *
 * NOTE(review): the "run_without_dbsnp" row written by RecalibrationArgumentCollection.generateReportTable
 * is not read back here -- confirm whether that is intentional.
 *
 * @param table the GATKReportTable containing the arguments and their corresponding values
 * @return an argument collection initialized with the recognized values of the table
 */
private RecalibrationArgumentCollection initializeArgumentCollectionTable(GATKReportTable table) {
    final RecalibrationArgumentCollection parsedArgs = new RecalibrationArgumentCollection();

    for (Object primaryKey : table.getPrimaryKeys()) {
        final Object cell = table.get(primaryKey, RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME);
        // generic translation of null values that were printed out as strings | todo -- add this capability to the GATKReport
        final Object value = cell.equals("null") ? null : cell;

        if (primaryKey.equals("covariate") && value != null) {
            parsedArgs.COVARIATES = value.toString().split(",");
        } else if (primaryKey.equals("standard_covs")) {
            parsedArgs.USE_STANDARD_COVARIATES = Boolean.parseBoolean((String) value);
        } else if (primaryKey.equals("solid_recal_mode")) {
            parsedArgs.SOLID_RECAL_MODE = RecalDataManager.SOLID_RECAL_MODE.recalModeFromString((String) value);
        } else if (primaryKey.equals("solid_nocall_strategy")) {
            parsedArgs.SOLID_NOCALL_STRATEGY = RecalDataManager.SOLID_NOCALL_STRATEGY.nocallStrategyFromString((String) value);
        } else if (primaryKey.equals("mismatches_context_size")) {
            parsedArgs.MISMATCHES_CONTEXT_SIZE = Integer.parseInt((String) value);
        } else if (primaryKey.equals("insertions_context_size")) {
            parsedArgs.INSERTIONS_CONTEXT_SIZE = Integer.parseInt((String) value);
        } else if (primaryKey.equals("deletions_context_size")) {
            parsedArgs.DELETIONS_CONTEXT_SIZE = Integer.parseInt((String) value);
        } else if (primaryKey.equals("mismatches_default_quality")) {
            parsedArgs.MISMATCHES_DEFAULT_QUALITY = Byte.parseByte((String) value);
        } else if (primaryKey.equals("insertions_default_quality")) {
            parsedArgs.INSERTIONS_DEFAULT_QUALITY = Byte.parseByte((String) value);
        } else if (primaryKey.equals("deletions_default_quality")) {
            parsedArgs.DELETIONS_DEFAULT_QUALITY = Byte.parseByte((String) value);
        } else if (primaryKey.equals("low_quality_tail")) {
            parsedArgs.LOW_QUAL_TAIL = Byte.parseByte((String) value);
        } else if (primaryKey.equals("default_platform")) {
            parsedArgs.DEFAULT_PLATFORM = (String) value;
        } else if (primaryKey.equals("force_platform")) {
            parsedArgs.FORCE_PLATFORM = (String) value;
        } else if (primaryKey.equals("quantizing_levels")) {
            parsedArgs.QUANTIZING_LEVELS = Integer.parseInt((String) value);
        } else if (primaryKey.equals("keep_intermediate_files")) {
            parsedArgs.KEEP_INTERMEDIATE_FILES = Boolean.parseBoolean((String) value);
        } else if (primaryKey.equals("no_plots")) {
            parsedArgs.NO_PLOTS = Boolean.parseBoolean((String) value);
        } else if (primaryKey.equals("recalibration_report")) {
            parsedArgs.recalibrationReport = (value == null) ? null : new File((String) value);
        }
    }
    return parsedArgs;
}
/**
 * Recomputes the derived values of this report in one pass: calls calcCombinedEmpiricalQuality() on every
 * datum of every table, then rebuilds the quantization information from the tables using the number of
 * quantizing levels of the argument collection.
 *
 * combine() deliberately skips this work, so call this once after the last combine() (useful for the
 * BQSRGatherer).
 */
public void calculateEmpiricalAndQuantizedQualities() {
    for (Map<BitSet, RecalDatum> recalTable : keysAndTablesMap.values()) {
        for (RecalDatum recalDatum : recalTable.values()) {
            recalDatum.calcCombinedEmpiricalQuality();
        }
    }
    this.quantizationInfo = new QuantizationInfo(keysAndTablesMap, RAC.QUANTIZING_LEVELS);
}
/**
 * Writes this report to the given stream by handing the argument table, the quantization information and
 * the recal tables to RecalDataManager.outputRecalibrationReport.
 *
 * @param output the stream to write the report to
 */
public void output(PrintStream output) {
    RecalDataManager.outputRecalibrationReport(this.argumentTable, this.quantizationInfo, this.keysAndTablesMap, output);
}
/**
 * @return the argument collection associated with this report
 */
public RecalibrationArgumentCollection getRAC() {
    return this.RAC;
}
/**
 * Two reports are considered equal when their recal tables are equal; the argument table, argument
 * collection, quantization information and requested covariates are not compared.
 *
 * NOTE(review): this class does not override hashCode() alongside equals().
 *
 * @param o the object to compare with
 * @return true if o is this report, or another RecalibrationReport whose tables compare equal
 */
@Override
public boolean equals(Object o) {
    if (this == o) {
        return true;
    }
    if (!(o instanceof RecalibrationReport)) {
        return false;
    }
    final RecalibrationReport that = (RecalibrationReport) o;
    return isEqualTable(this.keysAndTablesMap, that.keysAndTablesMap);
}
/**
 * Compares two ordered collections of recal tables.
 *
 * The collections are equal when they hold the same number of tables and, walking both in order, each
 * pair of tables has equal key managers, the same number of rows, and an equal datum under every bit key.
 *
 * The per-table row count check makes the comparison symmetric: without it a table of t2 holding extra
 * rows would still compare equal to its t1 counterpart, so a.equals(b) could differ from b.equals(a).
 *
 * @param t1 the first collection of key managers and tables
 * @param t2 the second collection of key managers and tables
 * @return true if both collections hold equal tables in the same order
 */
private boolean isEqualTable(LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>> t1, LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>> t2) {
    if (t1.size() != t2.size())
        return false;

    final Iterator<Map.Entry<BQSRKeyManager, Map<BitSet, RecalDatum>>> t1Iterator = t1.entrySet().iterator();
    final Iterator<Map.Entry<BQSRKeyManager, Map<BitSet, RecalDatum>>> t2Iterator = t2.entrySet().iterator();
    while (t1Iterator.hasNext() && t2Iterator.hasNext()) {
        final Map.Entry<BQSRKeyManager, Map<BitSet, RecalDatum>> t1MapEntry = t1Iterator.next();
        final Map.Entry<BQSRKeyManager, Map<BitSet, RecalDatum>> t2MapEntry = t2Iterator.next();
        if (!(t1MapEntry.getKey().equals(t2MapEntry.getKey())))
            return false;

        final Map<BitSet, RecalDatum> table1 = t1MapEntry.getValue();
        final Map<BitSet, RecalDatum> table2 = t2MapEntry.getValue();
        // same number of rows + every row of table1 found in table2 => same set of rows in both directions
        if (table1.size() != table2.size())
            return false;

        for (Map.Entry<BitSet, RecalDatum> t1TableEntry : table1.entrySet()) {
            final BitSet t1Key = t1TableEntry.getKey();
            if (!table2.containsKey(t1Key))
                return false;
            final RecalDatum t1Datum = t1TableEntry.getValue();
            if (!t1Datum.equals(table2.get(t1Key)))
                return false;
        }
    }
    return true;
}
}

View File

@ -1,23 +1,25 @@
/*
* Copyright (c) 2009 The Broad Institute
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* * OTHER DEALINGS IN THE SOFTWARE.
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.coverage;
@ -42,40 +44,40 @@ import java.io.PrintStream;
/**
* Emits a data file containing information about callable, uncallable, poorly mapped, and other parts of the genome
*
* <p/>
* <p>
* A very common question about a NGS set of reads is what areas of the genome are considered callable. The system
* considers the coverage at each locus and emits either a per base state or a summary interval BED file that
* partitions the genomic intervals into the following callable states:
* <dl>
* <dt>REF_N</dt>
* <dd>the reference base was an N, which is not considered callable the GATK</dd>
* <dt>CALLABLE</dt>
* <dd>the base satisfied the min. depth for calling but had less than maxDepth to avoid having EXCESSIVE_COVERAGE</dd>
* <dt>NO_COVERAGE</dt>
* <dd>absolutely no reads were seen at this locus, regardless of the filtering parameters</dd>
* <dt>LOW_COVERAGE</dt>
* <dd>there were less than min. depth bases at the locus, after applying filters</dd>
* <dt>EXCESSIVE_COVERAGE</dt>
* <dd>more than -maxDepth read at the locus, indicating some sort of mapping problem</dd>
* <dt>POOR_MAPPING_QUALITY</dt>
* <dd>more than --maxFractionOfReadsWithLowMAPQ at the locus, indicating a poor mapping quality of the reads</dd>
* <dt>REF_N</dt>
* <dd>the reference base was an N, which is not considered callable by the GATK</dd>
* <dt>PASS</dt>
* <dd>the base satisfied the min. depth for calling but had less than maxDepth to avoid having EXCESSIVE_COVERAGE</dd>
* <dt>NO_COVERAGE</dt>
* <dd>absolutely no reads were seen at this locus, regardless of the filtering parameters</dd>
* <dt>LOW_COVERAGE</dt>
* <dd>there were less than min. depth bases at the locus, after applying filters</dd>
* <dt>EXCESSIVE_COVERAGE</dt>
* <dd>more than -maxDepth reads at the locus, indicating some sort of mapping problem</dd>
* <dt>POOR_MAPPING_QUALITY</dt>
* <dd>more than --maxFractionOfReadsWithLowMAPQ at the locus, indicating a poor mapping quality of the reads</dd>
* </dl>
* </p>
*
* <p/>
* <h2>Input</h2>
* <p>
* A BAM file containing <b>exactly one sample</b>.
* A BAM file containing <b>exactly one sample</b>.
* </p>
*
* <p/>
* <h2>Output</h2>
* <p>
* <ul>
* <li>-o: a OutputFormatted (recommended BED) file with the callable status covering each base</li>
* <li>-summary: a table of callable status x count of all examined bases</li>
* <li>-o: a OutputFormatted (recommended BED) file with the callable status covering each base</li>
* <li>-summary: a table of callable status x count of all examined bases</li>
* </ul>
* </p>
*
* <p/>
* <h2>Examples</h2>
* <pre>
* -T CallableLociWalker \
@ -83,31 +85,31 @@ import java.io.PrintStream;
* -summary my.summary \
* -o my.bed
* </pre>
*
* <p/>
* would produce a BED file (my.bed) that looks like:
*
* <p/>
* <pre>
* 20 10000000 10000864 CALLABLE
* 20 10000000 10000864 PASS
* 20 10000865 10000985 POOR_MAPPING_QUALITY
* 20 10000986 10001138 CALLABLE
* 20 10000986 10001138 PASS
* 20 10001139 10001254 POOR_MAPPING_QUALITY
* 20 10001255 10012255 CALLABLE
* 20 10001255 10012255 PASS
* 20 10012256 10012259 POOR_MAPPING_QUALITY
* 20 10012260 10012263 CALLABLE
* 20 10012260 10012263 PASS
* 20 10012264 10012328 POOR_MAPPING_QUALITY
* 20 10012329 10012550 CALLABLE
* 20 10012329 10012550 PASS
* 20 10012551 10012551 LOW_COVERAGE
* 20 10012552 10012554 CALLABLE
* 20 10012552 10012554 PASS
* 20 10012555 10012557 LOW_COVERAGE
* 20 10012558 10012558 CALLABLE
* 20 10012558 10012558 PASS
* et cetera...
* </pre>
* as well as a summary table that looks like:
*
* <p/>
* <pre>
* state nBases
* REF_N 0
* CALLABLE 996046
* PASS 996046
* NO_COVERAGE 121
* LOW_COVERAGE 928
* EXCESSIVE_COVERAGE 0
@ -139,21 +141,21 @@ public class CallableLociWalker extends LocusWalker<CallableLociWalker.CallableB
byte maxLowMAPQ = 1;
/**
* Reads with MAPQ > minMappingQuality are treated as usable for variation detection, contributing to the CALLABLE
* Reads with MAPQ > minMappingQuality are treated as usable for variation detection, contributing to the PASS
* state.
*/
@Argument(fullName = "minMappingQuality", shortName = "mmq", doc = "Minimum mapping quality of reads to count towards depth.", required = false)
byte minMappingQuality = 10;
/**
* Bases with less than minBaseQuality are viewed as not sufficiently high quality to contribute to the CALLABLE state
* Bases with less than minBaseQuality are viewed as not sufficiently high quality to contribute to the PASS state
*/
@Argument(fullName = "minBaseQuality", shortName = "mbq", doc = "Minimum quality of bases to count towards depth.", required = false)
byte minBaseQuality = 20;
/**
* If the number of QC+ bases (on reads with MAPQ > minMappingQuality and with base quality > minBaseQuality) exceeds this
* value and is less than maxDepth the site is considered CALLABLE.
* value and is less than maxDepth the site is considered PASS.
*/
@Advanced
@Argument(fullName = "minDepth", shortName = "minDepth", doc = "Minimum QC+ read depth before a locus is considered callable", required = false)
@ -191,7 +193,7 @@ public class CallableLociWalker extends LocusWalker<CallableLociWalker.CallableB
public enum OutputFormat {
/**
* The output will be written as a BED file. There's a BED element for each
* continuous run of callable states (i.e., CALLABLE, REF_N, etc). This is the recommended
* continuous run of callable states (i.e., PASS, REF_N, etc). This is the recommended
* format
*/
BED,
@ -204,17 +206,29 @@ public class CallableLociWalker extends LocusWalker<CallableLociWalker.CallableB
}
public enum CalledState {
/** the reference base was an N, which is not considered callable the GATK */
/**
* the reference base was an N, which is not considered callable by the GATK
*/
REF_N,
/** the base satisfied the min. depth for calling but had less than maxDepth to avoid having EXCESSIVE_COVERAGE */
/**
* the base satisfied the min. depth for calling but had less than maxDepth to avoid having EXCESSIVE_COVERAGE
*/
CALLABLE,
/** absolutely no reads were seen at this locus, regardless of the filtering parameters */
/**
* absolutely no reads were seen at this locus, regardless of the filtering parameters
*/
NO_COVERAGE,
/** there were less than min. depth bases at the locus, after applying filters */
/**
* there were less than min. depth bases at the locus, after applying filters
*/
LOW_COVERAGE,
/** more than -maxDepth read at the locus, indicating some sort of mapping problem */
/**
* more than -maxDepth reads at the locus, indicating some sort of mapping problem
*/
EXCESSIVE_COVERAGE,
/** more than --maxFractionOfReadsWithLowMAPQ at the locus, indicating a poor mapping quality of the reads */
/**
* more than --maxFractionOfReadsWithLowMAPQ at the locus, indicating a poor mapping quality of the reads
*/
POOR_MAPPING_QUALITY
}
@ -223,11 +237,13 @@ public class CallableLociWalker extends LocusWalker<CallableLociWalker.CallableB
////////////////////////////////////////////////////////////////////////////////////
@Override
public boolean includeReadsWithDeletionAtLoci() { return true; }
public boolean includeReadsWithDeletionAtLoci() {
return true;
}
@Override
public void initialize() {
if ( getSampleDB().getSamples().size() != 1 ) {
if (getSampleDB().getSamples().size() != 1) {
throw new UserException.BadArgumentValue("-I", "CallableLoci only works for a single sample, but multiple samples were found in the provided BAM files: " + getSampleDB().getSamples());
}
@ -249,7 +265,7 @@ public class CallableLociWalker extends LocusWalker<CallableLociWalker.CallableB
public GenomeLoc loc;
final public CalledState state;
public CallableBaseState(GenomeLocParser genomeLocParser,GenomeLoc loc, CalledState state) {
public CallableBaseState(GenomeLocParser genomeLocParser, GenomeLoc loc, CalledState state) {
this.genomeLocParser = genomeLocParser;
this.loc = loc;
this.state = state;
@ -264,12 +280,13 @@ public class CallableLociWalker extends LocusWalker<CallableLociWalker.CallableB
}
// update routines
public boolean changingState( CalledState newState ) {
public boolean changingState(CalledState newState) {
return state != newState;
}
/**
* Updating the location of this CalledBaseState by the new stop location
*
* @param newStop
*/
public void update(GenomeLoc newStop) {
@ -285,7 +302,7 @@ public class CallableLociWalker extends LocusWalker<CallableLociWalker.CallableB
public CallableBaseState map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
CalledState state;
if ( BaseUtils.isNBase(ref.getBase()) ) {
if (BaseUtils.isNBase(ref.getBase())) {
state = CalledState.REF_N;
} else {
// count up the depths of all and QC+ bases
@ -293,29 +310,29 @@ public class CallableLociWalker extends LocusWalker<CallableLociWalker.CallableB
for (PileupElement e : context.getBasePileup()) {
rawDepth++;
if ( e.getMappingQual() <= maxLowMAPQ )
if (e.getMappingQual() <= maxLowMAPQ)
lowMAPQDepth++;
if ( e.getMappingQual() >= minMappingQuality && ( e.getQual() >= minBaseQuality || e.isDeletion() ) ) {
if (e.getMappingQual() >= minMappingQuality && (e.getQual() >= minBaseQuality || e.isDeletion())) {
QCDepth++;
}
}
//System.out.printf("%s rawdepth = %d QCDepth = %d lowMAPQ = %d%n", context.getLocation(), rawDepth, QCDepth, lowMAPQDepth);
if ( rawDepth == 0 ) {
if (rawDepth == 0) {
state = CalledState.NO_COVERAGE;
} else if ( rawDepth >= minDepthLowMAPQ && MathUtils.ratio( lowMAPQDepth, rawDepth ) >= maxLowMAPQFraction ) {
} else if (rawDepth >= minDepthLowMAPQ && MathUtils.ratio(lowMAPQDepth, rawDepth) >= maxLowMAPQFraction) {
state = CalledState.POOR_MAPPING_QUALITY;
} else if ( QCDepth < minDepth ) {
} else if (QCDepth < minDepth) {
state = CalledState.LOW_COVERAGE;
} else if ( rawDepth >= maxDepth && maxDepth != -1 ) {
} else if (rawDepth >= maxDepth && maxDepth != -1) {
state = CalledState.EXCESSIVE_COVERAGE;
} else {
state = CalledState.CALLABLE;
}
}
return new CallableBaseState(getToolkit().getGenomeLocParser(),context.getLocation(), state);
return new CallableBaseState(getToolkit().getGenomeLocParser(), context.getLocation(), state);
}
@Override
@ -328,15 +345,15 @@ public class CallableLociWalker extends LocusWalker<CallableLociWalker.CallableB
// update counts
integrator.counts[state.getState().ordinal()]++;
if ( outputFormat == OutputFormat.STATE_PER_BASE ) {
if (outputFormat == OutputFormat.STATE_PER_BASE) {
out.println(state.toString());
}
// format is integrating
if ( integrator.state == null )
if (integrator.state == null)
integrator.state = state;
else if ( state.getLocation().getStart() != integrator.state.getLocation().getStop() + 1 ||
integrator.state.changingState(state.getState()) ) {
else if (state.getLocation().getStart() != integrator.state.getLocation().getStop() + 1 ||
integrator.state.changingState(state.getState())) {
out.println(integrator.state.toString());
integrator.state = state;
} else {
@ -354,14 +371,14 @@ public class CallableLociWalker extends LocusWalker<CallableLociWalker.CallableB
@Override
public void onTraversalDone(Integrator result) {
// print out the last state
if ( result != null ) {
if ( outputFormat == OutputFormat.BED ) // get the last interval
if (result != null) {
if (outputFormat == OutputFormat.BED) // get the last interval
out.println(result.state.toString());
try {
PrintStream summaryOut = new PrintStream(summaryFile);
summaryOut.printf("%30s %s%n", "state", "nBases");
for ( CalledState state : CalledState.values() ) {
for (CalledState state : CalledState.values()) {
summaryOut.printf("%30s %d%n", state, result.counts[state.ordinal()]);
}
summaryOut.close();

View File

@ -1,6 +1,7 @@
package org.broadinstitute.sting.gatk.walkers.coverage;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.exceptions.UserException;
import java.util.HashMap;
import java.util.Map;
@ -45,8 +46,7 @@ public class DepthOfCoverageStats {
public static int[] calculateBinEndpoints(int lower, int upper, int bins) {
if ( bins > upper - lower || lower < 1 ) {
throw new IllegalArgumentException("Illegal argument to calculateBinEndpoints; "+
"lower bound must be at least 1, and number of bins may not exceed stop - start");
throw new UserException.BadInput("the start must be at least 1 and the number of bins may not exceed stop - start");
}
int[] binLeftEndpoints = new int[bins+1];

View File

@ -74,10 +74,6 @@ public class GCContentByIntervalWalker extends LocusWalker<Long, Long> {
// No initialization work is done here; the body is empty.
public void initialize() {
}
// Always returns false.
// NOTE(review): presumably tells the engine not to build extended-event pileups for this walker -- confirm against the walker base class.
public boolean generateExtendedEvents() {
return false;
}
// Returns 0L; presumably the starting value of the accumulator later passed to reduce() -- confirm against the walker base class.
public Long reduceInit() {
return 0L;
}

View File

@ -91,6 +91,25 @@ public class ErrorRatePerCycle extends LocusWalker<Integer, Integer> {
this.cycle = cycle;
}
// Must override hashCode and equals to work properly with GATKReportColumn
// Hash is built from the same two fields that equals() compares (readGroup and cycle),
// so keys that are equal always produce the same hash.
@Override
public int hashCode() {
return readGroup.hashCode() + 33 * cycle;
}
/**
 * Two keys are equal when they are of exactly the same class and agree on both
 * the cycle and the read group.
 *
 * @param o the object to compare with this key
 * @return true if o is a key with the same cycle and an equal read group
 */
@Override
public boolean equals(final Object o) {
    if (o == this)
        return true;
    if (o == null || o.getClass() != getClass())
        return false;

    final TableKey other = (TableKey) o;
    // cheap primitive comparison first; the read group is only compared when the cycles match
    return cycle == other.cycle && readGroup.equals(other.readGroup);
}
@Override
public int compareTo(final TableKey tableKey) {
final int scmp = readGroup.compareTo(tableKey.readGroup);

View File

@ -1,3 +1,27 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.diagnostics.targets;
/**
@ -7,16 +31,40 @@ package org.broadinstitute.sting.gatk.walkers.diagnostics.targets;
* @since 2/1/12
*/
public enum CallableStatus {
/** the reference base was an N, which is not considered callable the GATK */
REF_N,
/** the base satisfied the min. depth for calling but had less than maxDepth to avoid having EXCESSIVE_COVERAGE */
CALLABLE,
/** absolutely no reads were seen at this locus, regardless of the filtering parameters */
NO_COVERAGE,
/** there were less than min. depth bases at the locus, after applying filters */
LOW_COVERAGE,
/** more than -maxDepth read at the locus, indicating some sort of mapping problem */
EXCESSIVE_COVERAGE,
/** more than --maxFractionOfReadsWithLowMAPQ at the locus, indicating a poor mapping quality of the reads */
POOR_QUALITY
/**
* the reference base was an N, which is not considered callable by the GATK
*/
// todo -- implement this status
REF_N("the reference base was an N, which is not considered callable the GATK"),
/**
* the base satisfied the min. depth for calling but had less than maxDepth to avoid having EXCESSIVE_COVERAGE
*/
PASS("the base satisfied the min. depth for calling but had less than maxDepth to avoid having EXCESSIVE_COVERAGE"),
/**
* absolutely no reads were seen at this locus, regardless of the filtering parameters
*/
NO_COVERAGE("absolutely no reads were seen at this locus, regardless of the filtering parameters"),
/**
* there were less than min. depth bases at the locus, after applying filters
*/
LOW_COVERAGE("there were less than min. depth bases at the locus, after applying filters"),
/**
* more than -maxDepth reads at the locus, indicating some sort of mapping problem
*/
EXCESSIVE_COVERAGE("more than -maxDepth read at the locus, indicating some sort of mapping problem"),
/**
* more than --maxFractionOfReadsWithLowMAPQ at the locus, indicating a poor mapping quality of the reads
*/
POOR_QUALITY("more than --maxFractionOfReadsWithLowMAPQ at the locus, indicating a poor mapping quality of the reads"),
BAD_MATE(""),
INCONSISTENT_COVERAGE("");
public String description;
private CallableStatus(String description) {
this.description = description;
}
}

View File

@ -1,45 +1,66 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.diagnostics.targets;
import net.sf.picard.util.PeekableIterator;
import org.broad.tribble.Feature;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Input;
import org.broadinstitute.sting.commandline.IntervalBinding;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.By;
import org.broadinstitute.sting.gatk.walkers.DataSource;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocComparator;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.codecs.vcf.*;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder;
import java.io.PrintStream;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.TreeSet;
import java.util.*;
/**
* Short one line description of the walker.
*
* <p/>
* <p>
* [Long description of the walker]
* </p>
*
*
* <p/>
* <p/>
* <h2>Input</h2>
* <p>
* [Description of the Input]
* </p>
*
* <p/>
* <h2>Output</h2>
* <p>
* [Description of the Output]
* </p>
*
* <p/>
* <h2>Examples</h2>
* <pre>
* java
@ -51,15 +72,13 @@ import java.util.TreeSet;
* @since 2/1/12
*/
@By(value = DataSource.READS)
public class DiagnoseTargets extends LocusWalker<Long, Long> {
@PartitionBy(PartitionType.INTERVAL)
public class DiagnoseTargets extends LocusWalker<Long, Long> implements AnnotatorCompatibleWalker {
@Input(fullName = "interval_track", shortName = "int", doc = "", required = true)
private IntervalBinding<Feature> intervalTrack = null;
@Output
private PrintStream out = System.out;
@Argument(fullName = "expand_interval", shortName = "exp", doc = "", required = false)
private int expandInterval = 50;
@Output(doc = "File to which variants should be written", required = true)
private VCFWriter vcfWriter = null;
@Argument(fullName = "minimum_base_quality", shortName = "mbq", doc = "", required = false)
private int minimumBaseQuality = 20;
@ -73,13 +92,11 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
@Argument(fullName = "maximum_coverage", shortName = "maxcov", doc = "", required = false)
private int maximumCoverage = 700;
private TreeSet<GenomeLoc> intervalList = null; // The list of intervals of interest (plus expanded intervals if user wants them)
private HashMap<GenomeLoc, IntervalStatistics> intervalMap = null; // interval => statistics
private Iterator<GenomeLoc> intervalListIterator; // An iterator to go over all the intervals provided as we traverse the genome
private GenomeLoc currentInterval = null; // The "current" interval loaded and being filled with statistics
private IntervalStatistics currentIntervalStatistics = null; // The "current" interval loaded and being filled with statistics
private HashMap<GenomeLoc, IntervalStatistics> intervalMap = null; // interval => statistics
private PeekableIterator<GenomeLoc> intervalListIterator; // an iterator to go over all the intervals provided as we traverse the genome
private Set<String> samples = null; // all the samples being processed
private GenomeLocParser parser; // just an object to allow us to create genome locs (for the expanded intervals)
private final Allele SYMBOLIC_ALLELE = Allele.create("<DT>", false); // avoid creating the symbolic allele multiple times
@Override
public void initialize() {
@ -88,38 +105,22 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
if (intervalTrack == null)
throw new UserException("This tool currently only works if you provide an interval track");
parser = new GenomeLocParser(getToolkit().getMasterSequenceDictionary()); // Important to initialize the parser before creating the intervals below
intervalMap = new HashMap<GenomeLoc, IntervalStatistics>();
intervalListIterator = new PeekableIterator<GenomeLoc>(intervalTrack.getIntervals(getToolkit()).listIterator());
List<GenomeLoc> originalList = intervalTrack.getIntervals(getToolkit()); // The original list of targets provided by the user that will be expanded or not depending on the options provided
intervalList = new TreeSet<GenomeLoc>(new GenomeLocComparator());
intervalMap = new HashMap<GenomeLoc, IntervalStatistics>(originalList.size() * 2);
for (GenomeLoc interval : originalList)
addAndExpandIntervalToLists(interval);
intervalListIterator = intervalList.iterator();
samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()); // get all of the unique sample names for the VCF Header
vcfWriter.writeHeader(new VCFHeader(getHeaderInfo(), samples)); // initialize the VCF header
}
@Override
public Long map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
GenomeLoc refLocus = ref.getLocus();
while (currentInterval == null || currentInterval.isBefore(refLocus)) {
if (!intervalListIterator.hasNext())
return 0L;
currentInterval = intervalListIterator.next();
currentIntervalStatistics = intervalMap.get(currentInterval);
}
removePastIntervals(refLocus, ref.getBase()); // process and remove any intervals in the map that are don't overlap the current locus anymore
addNewOverlappingIntervals(refLocus); // add all new intervals that may overlap this reference locus
if (currentInterval.isPast(refLocus))
return 0L;
byte[] mappingQualities = context.getBasePileup().getMappingQuals();
byte[] baseQualities = context.getBasePileup().getQuals();
int coverage = context.getBasePileup().getBaseAndMappingFilteredPileup(minimumBaseQuality, minimumMappingQuality).depthOfCoverage();
int rawCoverage = context.size();
IntervalStatisticLocus locusData = new IntervalStatisticLocus(mappingQualities, baseQualities, coverage, rawCoverage);
currentIntervalStatistics.addLocus(refLocus, locusData);
for (IntervalStatistics intervalStatistics : intervalMap.values())
intervalStatistics.addLocus(context); // Add current locus to stats
return 1L;
}
@ -129,44 +130,159 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
return 0L;
}
/**
 * Adds the value returned by map to the running total.
 *
 * @param value result of the map.
 * @param sum   accumulator for the reduce.
 * @return the accumulator plus the new value
 */
@Override
public Long reduce(Long value, Long sum) {
return sum + value;
}
/**
* Process all remaining intervals
*
* @param result number of loci processed by the walker
*/
@Override
public void onTraversalDone(Long result) {
super.onTraversalDone(result);
out.println("Interval\tCallStatus\tCOV\tAVG");
for (GenomeLoc interval : intervalList) {
IntervalStatistics stats = intervalMap.get(interval);
out.println(String.format("%s\t%s\t%d\t%f", interval, stats.callableStatus(), stats.totalCoverage(), stats.averageCoverage()));
for (GenomeLoc interval : intervalMap.keySet())
processIntervalStats(intervalMap.get(interval), Allele.create("A"));
}
// Overrides required because this walker implements AnnotatorCompatibleWalker.
// Every binding accessor returns null and alwaysAppendDbsnpId() returns false.
// NOTE(review): the two List-returning accessors return null rather than an empty list;
// confirm that callers of the interface tolerate a null list.
@Override
public RodBinding<VariantContext> getSnpEffRodBinding() {return null;}
@Override
public RodBinding<VariantContext> getDbsnpRodBinding() {return null;}
@Override
public List<RodBinding<VariantContext>> getCompRodBindings() {return null;}
@Override
public List<RodBinding<VariantContext>> getResourceRodBindings() {return null;}
@Override
public boolean alwaysAppendDbsnpId() {return false;}
/**
 * Flushes every interval that lies entirely before the current reference locus.
 *
 * Intervals already tracked in intervalMap are written out through processIntervalStats
 * and then dropped from the map. Intervals still waiting in intervalListIterator that are
 * also before the locus are written out with freshly created (empty) statistics.
 *
 * @param refLocus the current reference locus
 * @param refBase  the reference base used to build the reference allele of each emitted record
 */
private void removePastIntervals(GenomeLoc refLocus, byte refBase) {
    // collect first, remove afterwards: the key set must not be modified while iterating over it
    final List<GenomeLoc> finished = new LinkedList<GenomeLoc>();
    for (final GenomeLoc tracked : intervalMap.keySet()) {
        if (!tracked.isBefore(refLocus))
            continue;
        processIntervalStats(intervalMap.get(tracked), Allele.create(refBase, true));
        finished.add(tracked);
    }
    for (final GenomeLoc done : finished)
        intervalMap.remove(done);

    // clean up all intervals that we might have skipped because there was no data
    for (GenomeLoc pending = intervalListIterator.peek();
         pending != null && pending.isBefore(refLocus);
         pending = intervalListIterator.peek()) {
        final GenomeLoc skipped = intervalListIterator.next();
        processIntervalStats(createIntervalStatistic(skipped), Allele.create(refBase, true));
    }
}
/**
 * Builds the flanking interval of up to expandInterval bases that ends on the base
 * immediately before the given interval. Both coordinates are clamped so they never
 * drop below 0.
 *
 * NOTE(review): if genome coordinates are 1-based, a clamp to 0 yields an invalid position -- confirm.
 *
 * @param interval the interval to flank
 * @return the upstream flanking interval on the same contig
 */
private GenomeLoc createIntervalBefore(GenomeLoc interval) {
    final int intervalStart = interval.getStart();
    final int flankStart = Math.max(intervalStart - expandInterval, 0);
    final int flankStop = Math.max(intervalStart - 1, 0);
    return parser.createGenomeLoc(interval.getContig(), interval.getContigIndex(), flankStart, flankStop);
}
/**
 * Builds the flanking interval of up to expandInterval bases that starts on the base
 * immediately after the given interval. Both coordinates are clamped to the length of
 * the contig as reported by the SAM header's sequence dictionary.
 *
 * @param interval the interval to flank
 * @return the downstream flanking interval on the same contig
 */
private GenomeLoc createIntervalAfter(GenomeLoc interval) {
    final int contigLength = getToolkit().getSAMFileHeader().getSequenceDictionary().getSequence(interval.getContigIndex()).getSequenceLength();
    final int intervalStop = interval.getStop();
    final int flankStart = Math.min(intervalStop + 1, contigLength);
    final int flankStop = Math.min(intervalStop + expandInterval, contigLength);
    return parser.createGenomeLoc(interval.getContig(), interval.getContigIndex(), flankStart, flankStop);
}
private void addAndExpandIntervalToLists(GenomeLoc interval) {
if (expandInterval > 0) {
GenomeLoc before = createIntervalBefore(interval);
GenomeLoc after = createIntervalAfter(interval);
intervalList.add(before);
intervalList.add(after);
intervalMap.put(before, new IntervalStatistics(before, minimumCoverage, maximumCoverage, minimumMappingQuality, minimumBaseQuality));
intervalMap.put(after, new IntervalStatistics(after, minimumCoverage, maximumCoverage, minimumMappingQuality, minimumBaseQuality));
/**
* Adds all intervals that overlap the current reference locus to the intervalMap
*
* @param refLocus the current reference locus
*/
private void addNewOverlappingIntervals(GenomeLoc refLocus) {
GenomeLoc interval = intervalListIterator.peek();
while (interval != null && !interval.isPast(refLocus)) {
System.out.println("LOCUS : " + refLocus + " -- " + interval);
intervalMap.put(interval, createIntervalStatistic(interval));
intervalListIterator.next(); // discard the interval (we've already added it to the map)
interval = intervalListIterator.peek();
}
intervalList.add(interval);
intervalMap.put(interval, new IntervalStatistics(interval, minimumCoverage, maximumCoverage, minimumMappingQuality, minimumBaseQuality));
}
/**
 * Builds one VCF record summarizing an interval and hands it to the VCF writer.
 *
 * The record is positioned at the interval start (start and stop given to the builder are both
 * the interval start), lists the supplied reference allele followed by the symbolic allele,
 * stores the interval stop under VCFConstants.END_KEY and the interval-wide average coverage
 * under VCFConstants.DEPTH_KEY, and uses the names of the interval's callable statuses as site
 * filters. One genotype is added per sample, carrying that sample's average coverage and the
 * names of its callable statuses as filters.
 *
 * @param stats     The statistics of the interval
 * @param refAllele the reference allele
 */
private void processIntervalStats(IntervalStatistics stats, Allele refAllele) {
    final GenomeLoc loc = stats.getInterval();

    final List<Allele> siteAlleles = new ArrayList<Allele>();
    siteAlleles.add(refAllele);
    siteAlleles.add(SYMBOLIC_ALLELE);

    VariantContextBuilder builder = new VariantContextBuilder("DiagnoseTargets", loc.getContig(), loc.getStart(), loc.getStart(), siteAlleles);
    builder = builder.log10PError(VariantContext.NO_LOG10_PERROR); // QUAL field makes no sense in our VCF
    builder.filters(statusesToStrings(stats.callableStatuses()));

    // site-level INFO attributes
    final Map<String, Object> siteInfo = new HashMap<String, Object>();
    siteInfo.put(VCFConstants.END_KEY, loc.getStop());
    siteInfo.put(VCFConstants.DEPTH_KEY, stats.averageCoverage());
    builder = builder.attributes(siteInfo);

    // one genotype per sample, with no called alleles
    final ArrayList<Genotype> sampleGenotypes = new ArrayList<Genotype>();
    for (final String sample : samples) {
        final Map<String, Object> sampleInfo = new HashMap<String, Object>();
        sampleInfo.put(VCFConstants.DEPTH_KEY, stats.getSample(sample).averageCoverage());

        final Set<String> sampleFilters = new HashSet<String>();
        sampleFilters.addAll(statusesToStrings(stats.getSample(sample).getCallableStatuses()));

        sampleGenotypes.add(new Genotype(sample, null, VariantContext.NO_LOG10_PERROR, sampleFilters, sampleInfo, false));
    }
    builder = builder.genotypes(sampleGenotypes);

    vcfWriter.add(builder.make());
}
/**
 * Gets the header lines for the VCF writer: the INFO and FORMAT declarations used by the
 * records this walker emits, plus one FILTER declaration per CallableStatus value.
 *
 * @return A set of VCF header lines
 */
private static Set<VCFHeaderLine> getHeaderInfo() {
    Set<VCFHeaderLine> headerLines = new HashSet<VCFHeaderLine>();

    // INFO fields for overall data
    headerLines.add(new VCFInfoHeaderLine(VCFConstants.END_KEY, 1, VCFHeaderLineType.Integer, "Stop position of the interval"));
    headerLines.add(new VCFInfoHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Float, "Average depth across the interval. Sum of the depth in a lci divided by interval size."));
    // NOTE(review): this INFO ID contains a space, which the VCF specification does not allow in keys,
    // and "lci" in the descriptions looks like a typo for "loci" -- both left unchanged here; confirm and fix.
    headerLines.add(new VCFInfoHeaderLine("Diagnose Targets", 0, VCFHeaderLineType.Flag, "DiagnoseTargets mode"));

    // FORMAT fields for each genotype
    headerLines.add(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Float, "Average depth across the interval. Sum of the depth in a lci divided by interval size."));

    // FILTER fields: declared as structured FILTER lines. A plain key/value VCFHeaderLine would be
    // written as "##<STATUS>=<description>", which does not declare the filter names used in the records.
    for (CallableStatus stat : CallableStatus.values())
        headerLines.add(new VCFFilterHeaderLine(stat.name(), stat.description));

    return headerLines;
}
/**
 * Converts a set of callable statuses into the set of their enum constant names.
 *
 * @param statuses the statuses to convert
 * @return a new set holding name() of every status in the input
 */
private static Set<String> statusesToStrings(Set<CallableStatus> statuses) {
    final Set<String> names = new HashSet<String>(statuses.size());
    final Iterator<CallableStatus> cursor = statuses.iterator();
    while (cursor.hasNext())
        names.add(cursor.next().name());
    return names;
}
// Builds a new IntervalStatistics for the given interval, passing along this walker's
// sample set and its coverage / mapping-quality / base-quality threshold fields.
private IntervalStatistics createIntervalStatistic(GenomeLoc interval) {
return new IntervalStatistics(samples, interval, minimumCoverage, maximumCoverage, minimumMappingQuality, minimumBaseQuality);
}
}

View File

@ -1,34 +0,0 @@
package org.broadinstitute.sting.gatk.walkers.diagnostics.targets;
/**
* The definition of a locus for the DiagnoseTargets walker statistics calculation
*
* @author Mauricio Carneiro
* @since 2/3/12
*/
class IntervalStatisticLocus {
    // The two quality arrays are stored as given (no copy) and are not exposed by any accessor of this class.
    private final byte[] mappingQuality;
    private final byte[] baseQuality;
    private final int coverage;
    private final int rawCoverage;

    /**
     * Creates a placeholder locus: zero coverage, zero raw coverage and one-element quality arrays.
     */
    public IntervalStatisticLocus() {
        this(new byte[1], new byte[1], 0, 0);
    }

    /**
     * Creates a locus from explicit values.
     *
     * @param mappingQuality mapping qualities recorded for the locus
     * @param baseQuality    base qualities recorded for the locus
     * @param coverage       value later returned by getCoverage()
     * @param rawCoverage    value later returned by getRawCoverage()
     */
    public IntervalStatisticLocus(byte[] mappingQuality, byte[] baseQuality, int coverage, int rawCoverage) {
        this.coverage = coverage;
        this.rawCoverage = rawCoverage;
        this.mappingQuality = mappingQuality;
        this.baseQuality = baseQuality;
    }

    /** @return the coverage given at construction */
    public int getCoverage() {
        return this.coverage;
    }

    /** @return the raw coverage given at construction */
    public int getRawCoverage() {
        return this.rawCoverage;
    }
}

View File

@ -1,122 +1,105 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.diagnostics.targets;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
/**
* Short one line description of the walker.
*
* @author Mauricio Carneiro
* @since 2/1/12
*/
class IntervalStatistics {
public class IntervalStatistics {
private final Map<String, SampleStatistics> samples;
private final GenomeLoc interval;
private final ArrayList<IntervalStatisticLocus> loci;
private final int minimumCoverageThreshold;
private final int maximumCoverageThreshold;
private final int minimumMappingQuality;
private final int minimumBaseQuality;
private int preComputedTotalCoverage = -1; // avoids re-calculating the total sum (-1 means we haven't pre-computed it yet)
private int preComputedTotalCoverage = -1; // avoids re-calculating the total sum (-1 means we haven't pre-computed it yet)
private IntervalStatistics(GenomeLoc interval, ArrayList<IntervalStatisticLocus> loci, int minimumCoverageThreshold, int maximumCoverageThreshold, int minimumMappingQuality, int minimumBaseQuality) {
public IntervalStatistics(Set<String> samples, GenomeLoc interval, int minimumCoverageThreshold, int maximumCoverageThreshold, int minimumMappingQuality, int minimumBaseQuality) {
this.interval = interval;
this.loci = loci;
this.minimumCoverageThreshold = minimumCoverageThreshold;
this.maximumCoverageThreshold = maximumCoverageThreshold;
this.minimumMappingQuality = minimumMappingQuality;
this.minimumBaseQuality = minimumBaseQuality;
this.samples = new HashMap<String, SampleStatistics>(samples.size());
for (String sample : samples)
this.samples.put(sample, new SampleStatistics(interval, minimumCoverageThreshold, maximumCoverageThreshold, minimumMappingQuality, minimumBaseQuality));
}
public IntervalStatistics(GenomeLoc interval, int minimumCoverageThreshold, int maximumCoverageThreshold, int minimumMappingQuality, int minimumBaseQuality) {
this(interval, new ArrayList<IntervalStatisticLocus>(interval.size()), minimumCoverageThreshold, maximumCoverageThreshold, minimumMappingQuality, minimumBaseQuality);
public SampleStatistics getSample(String sample) {
return samples.get(sample);
}
// Initialize every loci (this way we don't have to worry about non-existent loci in the object
for (int i = 0; i < interval.size(); i++)
this.loci.add(i, new IntervalStatisticLocus());
public GenomeLoc getInterval() {
return interval;
}
public void addLocus(AlignmentContext context) {
ReadBackedPileup pileup = context.getBasePileup();
Map<String, ReadBackedPileup> samplePileups = pileup.getPileupsForSamples(samples.keySet());
for (Map.Entry<String, ReadBackedPileup> entry : samplePileups.entrySet()) {
String sample = entry.getKey();
ReadBackedPileup samplePileup = entry.getValue();
SampleStatistics sampleStatistics = samples.get(sample);
if (sampleStatistics == null)
throw new ReviewedStingException(String.format("Trying to add locus statistics to a sample (%s) that doesn't exist in the Interval.", sample));
sampleStatistics.addLocus(context.getLocation(), samplePileup);
}
}
public long totalCoverage() {
if (preComputedTotalCoverage < 0)
calculateTotalCoverage();
return preComputedTotalCoverage;
}
public double averageCoverage() {
if (preComputedTotalCoverage < 0)
calculateTotalCoverage();
return (double) preComputedTotalCoverage / loci.size();
}
/**
* Calculates the callable status of the entire interval
*
* @return the callable status of the entire interval
*/
public CallableStatus callableStatus() {
long max = -1;
CallableStatus maxCallableStatus = null;
HashMap<CallableStatus, Integer> statusCounts = new HashMap<CallableStatus, Integer>(CallableStatus.values().length);
// initialize the statusCounts with all callable states
for (CallableStatus key : CallableStatus.values())
statusCounts.put(key, 0);
// calculate the callable status for each locus
for (int i = 0; i < loci.size(); i++) {
CallableStatus status = callableStatus(i);
int count = statusCounts.get(status) + 1;
statusCounts.put(status, count);
if (count > max) {
max = count;
maxCallableStatus = status;
}
}
return maxCallableStatus;
}
public void addLocus(GenomeLoc locus, IntervalStatisticLocus locusData) {
if (!interval.containsP(locus))
throw new ReviewedStingException(String.format("Locus %s is not part of the Interval", locus));
int locusIndex = locus.getStart() - interval.getStart();
loci.add(locusIndex, locusData);
}
/**
* returns the callable status of this locus without taking the reference base into account.
*
* @param locusIndex location in the genome to inquire (only one locus)
* @return the callable status of a locus
*/
private CallableStatus callableStatus(int locusIndex) {
if (loci.get(locusIndex).getCoverage() > maximumCoverageThreshold)
return CallableStatus.EXCESSIVE_COVERAGE;
if (loci.get(locusIndex).getCoverage() >= minimumCoverageThreshold)
return CallableStatus.CALLABLE;
if (loci.get(locusIndex).getRawCoverage() >= minimumCoverageThreshold)
return CallableStatus.POOR_QUALITY;
if (loci.get(locusIndex).getRawCoverage() > 0)
return CallableStatus.LOW_COVERAGE;
return CallableStatus.NO_COVERAGE;
return (double) preComputedTotalCoverage / interval.size();
}
private void calculateTotalCoverage() {
preComputedTotalCoverage = 0;
for (IntervalStatisticLocus locus : loci)
preComputedTotalCoverage += locus.getCoverage();
for (SampleStatistics sample : samples.values())
preComputedTotalCoverage += sample.totalCoverage();
}
/**
* Return the Callable statuses for the interval as a whole
* todo -- add a voting system for sample flags and add interval specific statuses
*
* @return the callable status(es) for the whole interval
*/
public Set<CallableStatus> callableStatuses() {
Set<CallableStatus> output = new HashSet<CallableStatus>();
for (SampleStatistics sample : samples.values())
output.addAll(sample.getCallableStatuses());
return output;
}
}

View File

@ -0,0 +1,83 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.diagnostics.targets;
import java.util.HashSet;
import java.util.Set;
/**
 * Immutable pair of coverage counts for a single locus, with the logic that turns
 * those counts into callable statuses.
 */
public class LocusStatistics {
    final int coverage;
    final int rawCoverage;

    /**
     * Creates a locus with zero coverage and zero raw coverage.
     */
    public LocusStatistics() {
        this(0, 0);
    }

    /**
     * @param coverage    the coverage compared against the thresholds in callableStatuses
     * @param rawCoverage the raw coverage, consulted only when coverage is below the minimum
     */
    public LocusStatistics(int coverage, int rawCoverage) {
        this.coverage = coverage;
        this.rawCoverage = rawCoverage;
    }

    public int getCoverage() {
        return this.coverage;
    }

    public int getRawCoverage() {
        return this.rawCoverage;
    }

    /**
     * Generates all applicable statuses from the coverages in this locus.
     *
     * EXCESSIVE_COVERAGE is reported when the coverage is above the maximum. When the coverage is
     * below the minimum, exactly one of POOR_QUALITY (raw coverage reaches the minimum),
     * LOW_COVERAGE (some raw coverage) or NO_COVERAGE (no raw coverage) is reported as well.
     * A locus whose coverage lies within both thresholds yields an empty set.
     *
     * @param minimumCoverageThreshold the minimum threshold for determining low coverage/poor quality
     * @param maximumCoverageThreshold the maximum threshold for determining excessive coverage
     * @return a set of all statuses that apply
     */
    public Set<CallableStatus> callableStatuses(int minimumCoverageThreshold, int maximumCoverageThreshold) {
        final Set<CallableStatus> statuses = new HashSet<CallableStatus>();

        // too much coverage
        if (getCoverage() > maximumCoverageThreshold)
            statuses.add(CallableStatus.EXCESSIVE_COVERAGE);

        // enough coverage: nothing more to flag
        if (getCoverage() >= minimumCoverageThreshold)
            return statuses;

        // not enough coverage: decide why, based on the raw coverage
        final CallableStatus shortfall;
        if (getRawCoverage() >= minimumCoverageThreshold)
            shortfall = CallableStatus.POOR_QUALITY;   // plenty of raw coverage, but it did not count towards coverage
        else if (getRawCoverage() > 0)
            shortfall = CallableStatus.LOW_COVERAGE;   // some raw coverage, just not enough
        else
            shortfall = CallableStatus.NO_COVERAGE;    // nothing at all
        statuses.add(shortfall);

        return statuses;
    }
}

View File

@ -0,0 +1,175 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.diagnostics.targets;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import java.util.*;
/**
 * Coverage statistics of one sample over a single interval.
 *
 * Holds one {@link LocusStatistics} per position of the interval (index 0 is the
 * interval start). Every position starts out with zero coverage and is replaced
 * when {@link #addLocus} is called for it.
 *
 * @author Mauricio Carneiro
 * @since 2/1/12
 */
class SampleStatistics {
    private final GenomeLoc interval;
    private final ArrayList<LocusStatistics> loci;

    private final int minimumCoverageThreshold;
    private final int maximumCoverageThreshold;
    private final int minimumMappingQuality;
    private final int minimumBaseQuality;

    // avoids re-calculating the total sum (-1 means we haven't pre-computed it yet);
    // kept as a long so that it matches the return type of totalCoverage()
    private long preComputedTotalCoverage = -1;

    private SampleStatistics(GenomeLoc interval, ArrayList<LocusStatistics> loci, int minimumCoverageThreshold, int maximumCoverageThreshold, int minimumMappingQuality, int minimumBaseQuality) {
        this.interval = interval;
        this.loci = loci;
        this.minimumCoverageThreshold = minimumCoverageThreshold;
        this.maximumCoverageThreshold = maximumCoverageThreshold;
        this.minimumMappingQuality = minimumMappingQuality;
        this.minimumBaseQuality = minimumBaseQuality;
    }

    /**
     * Creates the statistics of an interval with every locus initialized to zero coverage.
     *
     * @param interval                 the interval covered by this object
     * @param minimumCoverageThreshold the minimum threshold for determining low coverage/poor quality
     * @param maximumCoverageThreshold the maximum threshold for determining excessive coverage
     * @param minimumMappingQuality    mapping quality cutoff passed to the pileup filter in {@link #addLocus}
     * @param minimumBaseQuality       base quality cutoff passed to the pileup filter in {@link #addLocus}
     */
    public SampleStatistics(GenomeLoc interval, int minimumCoverageThreshold, int maximumCoverageThreshold, int minimumMappingQuality, int minimumBaseQuality) {
        this(interval, new ArrayList<LocusStatistics>(interval.size()), minimumCoverageThreshold, maximumCoverageThreshold, minimumMappingQuality, minimumBaseQuality);

        // Initialize every locus (this way we don't have to worry about non-existent loci in the object)
        for (int i = 0; i < interval.size(); i++)
            this.loci.add(new LocusStatistics());
    }

    /**
     * @return the sum of the (filtered) coverage over all loci of the interval
     */
    public long totalCoverage() {
        if (preComputedTotalCoverage < 0)
            calculateTotalCoverage();
        return preComputedTotalCoverage;
    }

    /**
     * @return the total coverage divided by the number of loci in the interval
     */
    public double averageCoverage() {
        if (preComputedTotalCoverage < 0)
            calculateTotalCoverage();
        return (double) preComputedTotalCoverage / loci.size();
    }

    /**
     * Calculates the callable statuses of the entire interval
     *
     * A status is reported when the fraction of loci carrying it exceeds a fixed
     * threshold; PASS is reported only when no other status applies.
     *
     * @return the callable statuses of the entire interval
     */
    public Set<CallableStatus> getCallableStatuses() {
        Map<CallableStatus, Integer> totals = new HashMap<CallableStatus, Integer>(CallableStatus.values().length);

        // initialize map
        for (CallableStatus status : CallableStatus.values())
            totals.put(status, 0);

        // sum up all the callable statuses for each locus
        for (int i = 0; i < interval.size(); i++) {
            for (CallableStatus status : callableStatus(i)) {
                int count = totals.get(status);
                totals.put(status, count + 1);
            }
        }

        Set<CallableStatus> output = new HashSet<CallableStatus>();

        // double to avoid type casting
        double intervalSize = interval.size();

        double coverageStatusThreshold = 0.20;
        if ((totals.get(CallableStatus.NO_COVERAGE) / intervalSize) > coverageStatusThreshold)
            output.add(CallableStatus.NO_COVERAGE);

        if ((totals.get(CallableStatus.LOW_COVERAGE) / intervalSize) > coverageStatusThreshold)
            output.add(CallableStatus.LOW_COVERAGE);

        double excessiveCoverageThreshold = 0.20;
        if ((totals.get(CallableStatus.EXCESSIVE_COVERAGE) / intervalSize) > excessiveCoverageThreshold)
            output.add(CallableStatus.EXCESSIVE_COVERAGE);

        double qualityStatusThreshold = 0.50;
        if ((totals.get(CallableStatus.POOR_QUALITY) / intervalSize) > qualityStatusThreshold)
            output.add(CallableStatus.POOR_QUALITY);

        if (totals.get(CallableStatus.REF_N) > 0)
            output.add(CallableStatus.REF_N);

        if (output.isEmpty()) {
            output.add(CallableStatus.PASS);
        }

        return output;
    }

    /**
     * Adds a locus to the interval wide stats
     *
     * @param locus  The locus given as a GenomeLoc
     * @param pileup The pileup of that locus (null means there is nothing to add)
     * @throws ReviewedStingException if the locus is not part of the interval
     */
    public void addLocus(GenomeLoc locus, ReadBackedPileup pileup) {
        if (!interval.containsP(locus))
            throw new ReviewedStingException(String.format("Locus %s is not part of the Interval", locus));

        // a null pileup means there is nothing to add
        if (pileup != null) {
            int locusIndex = locus.getStart() - interval.getStart();
            int rawCoverage = pileup.depthOfCoverage();
            int coverage = pileup.getBaseAndMappingFilteredPileup(minimumBaseQuality, minimumMappingQuality).depthOfCoverage();
            LocusStatistics locusData = new LocusStatistics(coverage, rawCoverage);

            // Replace the placeholder created by the constructor. ArrayList.add(index, element)
            // would insert and shift every later locus, growing the list beyond the interval size.
            loci.set(locusIndex, locusData);

            // any previously cached total no longer reflects the data
            preComputedTotalCoverage = -1;
        }
    }

    /**
     * returns the callable status of this locus without taking the reference base into account.
     *
     * @param locusIndex index of the locus relative to the start of the interval
     * @return the callable status of a locus
     */
    private Set<CallableStatus> callableStatus(int locusIndex) {
        LocusStatistics locus = loci.get(locusIndex);
        return locus.callableStatuses(minimumCoverageThreshold, maximumCoverageThreshold);
    }

    /** Sums the coverage of every locus into the cached total. */
    private void calculateTotalCoverage() {
        preComputedTotalCoverage = 0;
        for (LocusStatistics locus : loci)
            preComputedTotalCoverage += locus.getCoverage();
    }
}

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2011, The Broad Institute
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
@ -244,7 +244,8 @@ public class DiffEngine {
table.set(diff.getPath(), "NumberOfOccurrences", diff.getCount());
table.set(diff.getPath(), "ExampleDifference", diff.valueDiffString());
}
table.write(params.out);
GATKReport output = new GATKReport(table);
output.print(params.out);
}
protected static int longestCommonPostfix(String[] diffPath1, String[] diffPath2) {

View File

@ -36,9 +36,14 @@ import java.io.IOException;
/**
* Class implementing diffnode reader for GATKReports
*/
// TODO Version check to be added at the report level
public class GATKReportDiffableReader implements DiffableReader {
@Override
public String getName() { return "GATKReport"; }
public String getName() {
return "GATKReport";
}
@Override
public DiffElement readFromFile(File file, int maxElementsToRead) {
@ -47,12 +52,12 @@ public class GATKReportDiffableReader implements DiffableReader {
// one line reads the whole thing into memory
GATKReport report = new GATKReport(file);
for (GATKReportTable table : report.getTables() ) {
for (GATKReportTable table : report.getTables()) {
root.add(tableToNode(table, root));
}
return root.getBinding();
} catch ( Exception e ) {
} catch (Exception e) {
return null;
}
}
@ -62,9 +67,8 @@ public class GATKReportDiffableReader implements DiffableReader {
tableRoot.add("Description", table.getTableDescription());
tableRoot.add("NumberOfRows", table.getNumRows());
tableRoot.add("Version", table.getVersion());
for ( GATKReportColumn column : table.getColumns().values() ) {
for (GATKReportColumn column : table.getColumns().values()) {
DiffNode columnRoot = DiffNode.empty(column.getColumnName(), tableRoot);
columnRoot.add("Width", column.getColumnFormat().getWidth());
@ -72,7 +76,7 @@ public class GATKReportDiffableReader implements DiffableReader {
columnRoot.add("Displayable", column.isDisplayable());
int n = 1;
for ( Object elt : column.values() ) {
for (Object elt : column.values()) {
String name = column.getColumnName() + n++;
columnRoot.add(name, elt.toString());
}
@ -91,7 +95,7 @@ public class GATKReportDiffableReader implements DiffableReader {
new FileReader(file).read(buff, 0, HEADER.length());
String firstLine = new String(buff);
return firstLine.startsWith(HEADER);
} catch ( IOException e ) {
} catch (IOException e) {
return false;
}
}

View File

@ -68,8 +68,8 @@ public class VCFDiffableReader implements DiffableReader {
VCFHeader header = (VCFHeader)vcfCodec.readHeader(lineReader);
for ( VCFHeaderLine headerLine : header.getMetaData() ) {
String key = headerLine.getKey();
if ( headerLine instanceof VCFNamedHeaderLine )
key += "_" + ((VCFNamedHeaderLine) headerLine).getName();
if ( headerLine instanceof VCFIDHeaderLine)
key += "_" + ((VCFIDHeaderLine) headerLine).getID();
if ( root.hasElement(key) )
logger.warn("Skipping duplicate header line: file=" + file + " line=" + headerLine.toString());
else

View File

@ -26,10 +26,14 @@
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.GenotypesContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.List;
@ -41,7 +45,8 @@ public abstract class AlleleFrequencyCalculationModel implements Cloneable {
public enum Model {
/** The default model with the best performance in all cases */
EXACT
EXACT,
POOL
}
protected int N;
@ -61,6 +66,42 @@ public abstract class AlleleFrequencyCalculationModel implements Cloneable {
this.verboseWriter = verboseWriter;
}
/**
 * Pairs an allele with a running likelihood sum. The natural ordering is
 * descending by sum, so a sorted collection lists the largest sum first.
 */
protected static final class LikelihoodSum implements Comparable<LikelihoodSum> {
    public double sum = 0.0;
    public Allele allele;

    public LikelihoodSum(Allele allele) {
        this.allele = allele;
    }

    public int compareTo(LikelihoodSum other) {
        final double delta = sum - other.sum;
        if ( delta < 0.0 )
            return 1;   // this sum is smaller, so it sorts after the other
        if ( delta > 0.0 )
            return -1;  // this sum is larger, so it sorts before the other
        return 0;
    }
}
/**
 * Unpack a GenotypesContext into a list of genotype likelihood vectors.
 *
 * The first element of the result is always a dummy all-zero vector. Samples are
 * visited in sample name order; a sample contributes its vector only if it has
 * likelihoods and their sum is below UnifiedGenotyperEngine.SUM_GL_THRESH_NOCALL.
 *
 * @param GLs Input genotype context
 * @return ArrayList of double arrays corresponding to GL vectors
 */
protected static ArrayList<double[]> getGLs(GenotypesContext GLs) {
    final ArrayList<double[]> likelihoodVectors = new ArrayList<double[]>(GLs.size());
    likelihoodVectors.add(new double[]{0.0,0.0,0.0}); // dummy

    for ( final Genotype genotype : GLs.iterateInSampleNameOrder() ) {
        if ( !genotype.hasLikelihoods() )
            continue;

        final double[] vector = genotype.getLikelihoods().getAsVector();
        if ( MathUtils.sum(vector) < UnifiedGenotyperEngine.SUM_GL_THRESH_NOCALL )
            likelihoodVectors.add(vector);
    }

    return likelihoodVectors;
}
/**
* Must be overridden by concrete subclasses
* @param vc variant context with alleles and genotype likelihoods
@ -69,6 +110,19 @@ public abstract class AlleleFrequencyCalculationModel implements Cloneable {
* @return the alleles used for genotyping
*/
protected abstract List<Allele> getLog10PNonRef(final VariantContext vc,
final double[][] log10AlleleFrequencyPriors,
final double[] log10AlleleFrequencyPriors,
final AlleleFrequencyCalculationResult result);
/**
* Must be overridden by concrete subclasses
* @param vc variant context with alleles and genotype likelihoods
* @param allelesToUse alleles to subset
* @param assignGenotypes presumably whether genotypes should be assigned in the returned context -- TODO confirm against the implementations
* @param ploidy presumably the ploidy of the samples being subset -- TODO confirm against the implementations
* @return GenotypesContext object
*/
protected abstract GenotypesContext subsetAlleles(final VariantContext vc,
final List<Allele> allelesToUse,
final boolean assignGenotypes,
final int ploidy);
}

View File

@ -25,6 +25,10 @@
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.utils.MathUtils;
import java.util.Arrays;
/**
* Created by IntelliJ IDEA.
* User: ebanks
@ -34,23 +38,50 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
*/
public class AlleleFrequencyCalculationResult {
// IMPORTANT NOTE:
// These 2 arrays are intended to contain the likelihoods/posterior probabilities for each alternate allele over each possible frequency (from 0 to 2N).
// For any given alternate allele and frequency, the likelihoods are marginalized over values for all other alternate alleles. What this means is that
// the likelihoods at cell index zero (AF=0) in the array is actually that of the site's being polymorphic (because although this alternate allele may
// be at AF=0, it is marginalized over all other alternate alleles which are not necessarily at AF=0).
// In the bi-allelic case (where there are no other alternate alleles over which to marginalize),
// the value at cell index zero will be equal to AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED.
final double[][] log10AlleleFrequencyLikelihoods;
final double[][] log10AlleleFrequencyPosteriors;
// These variables are intended to contain the MLE and MAP (and their corresponding allele counts) of the site over all alternate alleles
private double log10MLE;
private double log10MAP;
private final int[] alleleCountsOfMLE;
private final int[] alleleCountsOfMAP;
// These 2 variables are intended to contain the likelihood/posterior probability for the site's being monomorphic (i.e. AF=0 for all alternate alleles)
double log10LikelihoodOfAFzero = 0.0;
double log10PosteriorOfAFzero = 0.0;
// The posteriors seen, not including that of AF=0
private static final int POSTERIORS_CACHE_SIZE = 5000;
private final double[] log10PosteriorMatrixValues = new double[POSTERIORS_CACHE_SIZE];
private int currentPosteriorsCacheIndex = 0;
private Double log10PosteriorMatrixSum = null;
public AlleleFrequencyCalculationResult(int maxAltAlleles, int numChr) {
log10AlleleFrequencyLikelihoods = new double[maxAltAlleles][numChr+1];
log10AlleleFrequencyPosteriors = new double[maxAltAlleles][numChr+1];
// These variables are intended to contain the likelihood/posterior probability for the site's being monomorphic (i.e. AF=0 for all alternate alleles)
private double log10LikelihoodOfAFzero;
private double log10PosteriorOfAFzero;
/**
* Creates a result holder and puts it into its initial state via reset().
*
* @param maxAltAlleles length of the MLE and MAP allele count arrays
*/
public AlleleFrequencyCalculationResult(final int maxAltAlleles) {
alleleCountsOfMLE = new int[maxAltAlleles];
alleleCountsOfMAP = new int[maxAltAlleles];
reset();
}
/** @return the largest log10 likelihood recorded so far */
public double getLog10MLE() {
return log10MLE;
}
/** @return the largest log10 posterior recorded so far */
public double getLog10MAP() {
return log10MAP;
}
/**
* Returns the log10 sum of the cached posteriors. The sum is computed on the
* first call and the stored value is returned on later calls.
*
* @return log10 of the sum of the posteriors currently held in the cache
*/
public double getLog10PosteriorsMatrixSumWithoutAFzero() {
// compute lazily; a null sum means it has not been calculated yet
if ( log10PosteriorMatrixSum == null ) {
log10PosteriorMatrixSum = MathUtils.log10sumLog10(log10PosteriorMatrixValues, 0, currentPosteriorsCacheIndex);
}
return log10PosteriorMatrixSum;
}
/** @return the internal array of allele counts recorded with the MLE (not a copy) */
public int[] getAlleleCountsOfMLE() {
return alleleCountsOfMLE;
}
/** @return the internal array of allele counts recorded with the MAP (not a copy) */
public int[] getAlleleCountsOfMAP() {
return alleleCountsOfMAP;
}
public double getLog10LikelihoodOfAFzero() {
@ -60,4 +91,60 @@ public class AlleleFrequencyCalculationResult {
public double getLog10PosteriorOfAFzero() {
return log10PosteriorOfAFzero;
}
/**
 * Returns this object to its initial state: all four log10 values are set to
 * VALUE_NOT_CALCULATED, both allele count arrays are zeroed and the posteriors
 * cache is emptied.
 */
public void reset() {
    final double notCalculated = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED;
    log10PosteriorOfAFzero = notCalculated;
    log10LikelihoodOfAFzero = notCalculated;
    log10MAP = notCalculated;
    log10MLE = notCalculated;

    Arrays.fill(alleleCountsOfMLE, 0);
    Arrays.fill(alleleCountsOfMAP, 0);

    log10PosteriorMatrixSum = null;
    currentPosteriorsCacheIndex = 0;
}
/**
 * Records a new maximum likelihood and its allele counts if the given
 * likelihood is greater than the current one.
 *
 * @param log10LofK        candidate log10 likelihood
 * @param alleleCountsForK allele counts belonging to the candidate
 */
public void updateMLEifNeeded(final double log10LofK, final int[] alleleCountsForK) {
    if ( !(log10LofK > log10MLE) )
        return;

    log10MLE = log10LofK;
    int position = 0;
    for ( final int count : alleleCountsForK )
        alleleCountsOfMLE[position++] = count;
}
/**
 * Adds the given posterior to the posteriors cache and, if it is greater than
 * the current maximum, records it and its allele counts as the new MAP.
 *
 * @param log10LofK        candidate log10 posterior
 * @param alleleCountsForK allele counts belonging to the candidate
 */
public void updateMAPifNeeded(final double log10LofK, final int[] alleleCountsForK) {
    // every posterior is cached, whether or not it becomes the new maximum
    addToPosteriorsCache(log10LofK);

    if ( !(log10LofK > log10MAP) )
        return;

    log10MAP = log10LofK;
    int position = 0;
    for ( final int count : alleleCountsForK )
        alleleCountsOfMAP[position++] = count;
}
/**
 * Appends a posterior to the fixed-size cache, condensing the cache into a
 * single summed value whenever it becomes full.
 *
 * @param log10LofK the log10 posterior to add
 */
private void addToPosteriorsCache(final double log10LofK) {
    // add to the cache
    log10PosteriorMatrixValues[currentPosteriorsCacheIndex++] = log10LofK;

    // a sum computed before this value was added is now stale; force it to be recomputed
    log10PosteriorMatrixSum = null;

    // if we've filled up the cache, then condense by summing up all of the values and placing the sum back into the first cell
    if ( currentPosteriorsCacheIndex == POSTERIORS_CACHE_SIZE ) {
        final double temporarySum = MathUtils.log10sumLog10(log10PosteriorMatrixValues, 0, currentPosteriorsCacheIndex);
        log10PosteriorMatrixValues[0] = temporarySum;
        currentPosteriorsCacheIndex = 1;
    }
}
/**
 * Stores the likelihood of AF=0. If it is greater than the current MLE it
 * becomes the MLE and the MLE allele counts are all set to zero.
 *
 * @param log10LikelihoodOfAFzero the log10 likelihood of AF=0
 */
public void setLog10LikelihoodOfAFzero(final double log10LikelihoodOfAFzero) {
    this.log10LikelihoodOfAFzero = log10LikelihoodOfAFzero;

    if ( !(log10LikelihoodOfAFzero > log10MLE) )
        return;

    Arrays.fill(alleleCountsOfMLE, 0);
    log10MLE = log10LikelihoodOfAFzero;
}
/**
 * Stores the posterior of AF=0. If it is greater than the current MAP it
 * becomes the MAP and the MAP allele counts are all set to zero.
 *
 * @param log10PosteriorOfAFzero the log10 posterior of AF=0
 */
public void setLog10PosteriorOfAFzero(final double log10PosteriorOfAFzero) {
    this.log10PosteriorOfAFzero = log10PosteriorOfAFzero;

    if ( !(log10PosteriorOfAFzero > log10MAP) )
        return;

    Arrays.fill(alleleCountsOfMAP, 0);
    log10MAP = log10PosteriorOfAFzero;
}
}

View File

@ -0,0 +1,294 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.clipping.ReadClipper;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import org.broadinstitute.sting.utils.variantcontext.*;
import java.util.*;
/**
* Code for determining which indels are segregating among the samples.
*
* This code is just a refactor of the original code from Guillermo in the UG.
*
* @author Mark DePristo
* @since 3/26/12
*/
public class ConsensusAlleleCounter {
final protected static Logger logger = Logger.getLogger(ConsensusAlleleCounter.class);
// minimum number of supporting observations, applied to the overall pileup and to each consensus allele
private final int minIndelCountForGenotyping;
// when true, every allele passing the count threshold is kept (up to a maximum); otherwise only the most frequent one
private final boolean doMultiAllelicCalls;
// minimum fraction of indel-containing elements a context's pileup needs before it is examined
private final double minFractionInOneSample;
// handed to VariantContextUtils.simpleMerge when the per-allele contexts are merged
private final GenomeLocParser locParser;
/**
* Creates a counter with the given thresholds.
*
* @param locParser parser passed on to VariantContextUtils.simpleMerge
* @param doMultiAllelicCalls keep several alleles (true) or only the one with the highest count (false)
* @param minIndelCountForGenotyping minimum count an indel needs to be considered
* @param minFractionInOneSample minimum fraction of indel observations in a pileup for it to be examined
*/
public ConsensusAlleleCounter(final GenomeLocParser locParser,
final boolean doMultiAllelicCalls,
final int minIndelCountForGenotyping,
final double minFractionInOneSample) {
this.minIndelCountForGenotyping = minIndelCountForGenotyping;
this.doMultiAllelicCalls = doMultiAllelicCalls;
this.minFractionInOneSample = minFractionInOneSample;
this.locParser = locParser;
}
/**
* Returns a list of Alleles at this locus that may be segregating
*
* @param ref the reference context of the locus being examined
* @param contexts the alignment contexts to examine (keys are presumably sample names -- TODO confirm)
* @param contextType the read orientation used to stratify each alignment context
* @return the alleles of the merged consensus indels, or an empty list if no indel passes the thresholds
*/
public List<Allele> computeConsensusAlleles(ReferenceContext ref,
Map<String, AlignmentContext> contexts,
AlignmentContextUtils.ReadOrientation contextType) {
final Map<String, Integer> consensusIndelStrings = countConsensusAlleles(ref, contexts, contextType);
// logger.info("Alleles at " + ref.getLocus());
// for ( Map.Entry<String, Integer> elt : consensusIndelStrings.entrySet() ) {
// logger.info(" " + elt.getValue() + " => " + elt.getKey());
// }
return consensusCountsToAlleles(ref, consensusIndelStrings);
}
//
// TODO -- WARNING DOESN'T WORK WITH REDUCED READS
//
// Counts the indel events observed at this locus. Keys of the returned map are the
// inserted bases for insertions and "D" followed by the event length for deletions;
// values are the number of supporting observations.
private Map<String, Integer> countConsensusAlleles(ReferenceContext ref,
Map<String, AlignmentContext> contexts,
AlignmentContextUtils.ReadOrientation contextType) {
final GenomeLoc loc = ref.getLocus();
HashMap<String, Integer> consensusIndelStrings = new HashMap<String, Integer>();
int insCount = 0, delCount = 0;
// quick check of total number of indels in pileup
for ( Map.Entry<String, AlignmentContext> sample : contexts.entrySet() ) {
final AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType);
if ( context.hasBasePileup() ) {
final ReadBackedPileup indelPileup = context.getBasePileup();
insCount += indelPileup.getNumberOfInsertionsAfterThisElement();
delCount += indelPileup.getNumberOfDeletionsAfterThisElement();
}
}
// bail out early when neither insertions nor deletions reach the minimum count
if ( insCount < minIndelCountForGenotyping && delCount < minIndelCountForGenotyping )
return Collections.emptyMap();
for (Map.Entry<String, AlignmentContext> sample : contexts.entrySet()) {
// todo -- warning, can be duplicating expensive partition here
AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType);
if ( !context.hasBasePileup() )
continue;
final ReadBackedPileup indelPileup = context.getBasePileup();
final int nIndelReads = indelPileup.getNumberOfInsertionsAfterThisElement() + indelPileup.getNumberOfDeletionsAfterThisElement();
final int nReadsOverall = indelPileup.getNumberOfElements();
// skip pileups without indels or with too small a fraction of indel observations
if ( nIndelReads == 0 || (nIndelReads / (1.0 * nReadsOverall)) < minFractionInOneSample ) {
// if ( nIndelReads > 0 )
// logger.info("Skipping sample " + sample.getKey() + " with nIndelReads " + nIndelReads + " nReads " + nReadsOverall);
continue;
// } else {
// logger.info("### Keeping sample " + sample.getKey() + " with nIndelReads " + nIndelReads + " nReads " + nReadsOverall);
}
for (PileupElement p : indelPileup) {
final GATKSAMRecord read = ReadClipper.hardClipAdaptorSequence(p.getRead());
if (read == null)
continue;
// 454 reads are not used for building the consensus
if (ReadUtils.is454Read(read)) {
continue;
}
/* if (DEBUG && p.isIndel()) {
System.out.format("Read: %s, cigar: %s, aln start: %d, aln end: %d, p.len:%d, Type:%s, EventBases:%s\n",
read.getReadName(),read.getCigar().toString(),read.getAlignmentStart(),read.getAlignmentEnd(),
p.getEventLength(),p.getType().toString(), p.getEventBases());
}
*/
String indelString = p.getEventBases();
if ( p.isBeforeInsertion() ) {
// edge case: ignore a deletion immediately preceding an insertion as p.getEventBases() returns null [EB]
if ( indelString == null )
continue;
boolean foundKey = false;
// copy of hashmap into temp arrayList
ArrayList<Pair<String,Integer>> cList = new ArrayList<Pair<String,Integer>>();
for (String s : consensusIndelStrings.keySet()) {
cList.add(new Pair<String, Integer>(s,consensusIndelStrings.get(s)));
}
if (read.getAlignmentEnd() == loc.getStart()) {
// first corner condition: a read has an insertion at the end, and we're right at the insertion.
// In this case, the read could have any of the inserted bases and we need to build a consensus
for (int k=0; k < cList.size(); k++) {
String s = cList.get(k).getFirst();
int cnt = cList.get(k).getSecond();
// case 1: current insertion is prefix of indel in hash map
if (s.startsWith(indelString)) {
cList.set(k,new Pair<String, Integer>(s,cnt+1));
foundKey = true;
}
else if (indelString.startsWith(s)) {
// case 2: indel stored in hash table is prefix of current insertion
// In this case, new bases are new key.
foundKey = true;
cList.set(k,new Pair<String, Integer>(indelString,cnt+1));
}
}
if (!foundKey)
// none of the above: event bases not supported by previous table, so add new key
cList.add(new Pair<String, Integer>(indelString,1));
}
else if (read.getAlignmentStart() == loc.getStart()+1) {
// opposite corner condition: read will start at current locus with an insertion
for (int k=0; k < cList.size(); k++) {
String s = cList.get(k).getFirst();
int cnt = cList.get(k).getSecond();
if (s.endsWith(indelString)) {
// case 1: current insertion (indelString) is suffix of indel in hash map (s)
cList.set(k,new Pair<String, Integer>(s,cnt+1));
foundKey = true;
}
else if (indelString.endsWith(s)) {
// case 2: indel stored in hash table (s) is suffix of current insertion (indelString)
// In this case, new bases are new key.
foundKey = true;
cList.set(k,new Pair<String, Integer>(indelString,cnt+1));
}
}
if (!foundKey)
// none of the above: event bases not supported by previous table, so add new key
cList.add(new Pair<String, Integer>(indelString,1));
}
else {
// normal case: insertion somewhere in the middle of a read: add count to arrayList
// (the new pair is appended after any existing pair with the same key, so it wins in the copy back below)
int cnt = consensusIndelStrings.containsKey(indelString)? consensusIndelStrings.get(indelString):0;
cList.add(new Pair<String, Integer>(indelString,cnt+1));
}
// copy back arrayList into hashMap
consensusIndelStrings.clear();
for (Pair<String,Integer> pair : cList) {
consensusIndelStrings.put(pair.getFirst(),pair.getSecond());
}
}
else if ( p.isBeforeDeletedBase() ) {
// deletions are keyed by their length only, e.g. "D3"
indelString = String.format("D%d",p.getEventLength());
int cnt = consensusIndelStrings.containsKey(indelString)? consensusIndelStrings.get(indelString):0;
consensusIndelStrings.put(indelString,cnt+1);
}
}
}
return consensusIndelStrings;
}
// Converts the indel counts into alleles: builds one VariantContext per indel that passes the
// count threshold, merges them with VariantContextUtils.simpleMerge and returns the merged alleles.
private List<Allele> consensusCountsToAlleles(final ReferenceContext ref,
final Map<String, Integer> consensusIndelStrings) {
final GenomeLoc loc = ref.getLocus();
final Collection<VariantContext> vcs = new ArrayList<VariantContext>();
int maxAlleleCnt = 0;
Allele refAllele, altAllele;
for (final Map.Entry<String, Integer> elt : consensusIndelStrings.entrySet()) {
final String s = elt.getKey();
final int curCnt = elt.getValue();
int stop = 0;
// only alleles whose observed count reaches the minimum threshold are genotyped
if (curCnt < minIndelCountForGenotyping)
continue;
if (s.startsWith("D")) {
// get deletion length
final int dLen = Integer.valueOf(s.substring(1));
// get ref bases of accurate deletion
final int startIdxInReference = 1 + loc.getStart() - ref.getWindow().getStart();
stop = loc.getStart() + dLen;
final byte[] refBases = Arrays.copyOfRange(ref.getBases(), startIdxInReference, startIdxInReference + dLen);
if (Allele.acceptableAlleleBases(refBases)) {
refAllele = Allele.create(refBases, true);
altAllele = Allele.create(Allele.NULL_ALLELE_STRING, false);
}
else continue; // don't go on with this allele if refBases are non-standard
} else {
// insertion case
if (Allele.acceptableAlleleBases(s)) {
refAllele = Allele.create(Allele.NULL_ALLELE_STRING, true);
altAllele = Allele.create(s, false);
stop = loc.getStart();
}
else continue; // go on to next allele if consensus insertion has any non-standard base.
}
final VariantContextBuilder builder = new VariantContextBuilder().source("");
builder.loc(loc.getContig(), loc.getStart(), stop);
builder.alleles(Arrays.asList(refAllele, altAllele));
builder.referenceBaseForIndel(ref.getBase());
builder.noGenotypes();
if (doMultiAllelicCalls) {
// keep every passing allele, but stop once the maximum number of genotypable alleles is reached
vcs.add(builder.make());
if (vcs.size() >= GenotypeLikelihoods.MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED)
break;
} else if (curCnt > maxAlleleCnt) {
// keep only the allele with the highest count seen so far
maxAlleleCnt = curCnt;
vcs.clear();
vcs.add(builder.make());
}
}
if (vcs.isEmpty())
return Collections.emptyList(); // nothing else to do, no alleles passed minimum count criterion
final VariantContext mergedVC = VariantContextUtils.simpleMerge(locParser, vcs, null, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, VariantContextUtils.GenotypeMergeType.UNSORTED, false, false, null, false, false);
return mergedVC.getAlleles();
}
}

View File

@ -27,6 +27,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.utils.BaseUtils;
@Deprecated
public enum DiploidGenotype {
AA ('A', 'A'),
AC ('A', 'C'),

View File

@ -0,0 +1,125 @@
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.utils.BaseUtils;
public enum DiploidGenotypeWithCorrectAlleleOrdering {
AA ('A', 'A'),
AC ('A', 'C'),
CC ('C', 'C'),
AG ('A', 'G'),
CG ('C', 'G'),
GG ('G', 'G'),
AT ('A', 'T'),
CT ('C', 'T'),
GT ('G', 'T'),
TT ('T', 'T');
public byte base1, base2;
@Deprecated
private DiploidGenotypeWithCorrectAlleleOrdering(char base1, char base2) {
this((byte)base1, (byte)base2);
}
private DiploidGenotypeWithCorrectAlleleOrdering(byte base1, byte base2) {
this.base1 = base1;
this.base2 = base2;
}
public boolean isHomRef(byte r) {
return isHom() && r == base1;
}
public boolean isHomVar(byte r) {
return isHom() && r != base1;
}
public boolean isHetRef(byte r) {
if ( base1 == r )
return r != base2;
else
return base2 == r;
}
public boolean isHom() {
return ! isHet();
}
public boolean isHet() {
return base1 != base2;
}
/**
* create a diploid genotype, given a character to make into a hom genotype
* @param hom the character to turn into a hom genotype, i.e. if it is A, then returned will be AA
* @return the diploid genotype
*/
public static DiploidGenotypeWithCorrectAlleleOrdering createHomGenotype(byte hom) {
int index = BaseUtils.simpleBaseToBaseIndex(hom);
if ( index == -1 )
throw new IllegalArgumentException(hom + " is not a valid base character");
return conversionMatrix[index][index];
}
/**
* create a diploid genotype, given 2 chars which may not necessarily be ordered correctly
* @param base1 base1
* @param base2 base2
* @return the diploid genotype
*/
public static DiploidGenotypeWithCorrectAlleleOrdering createDiploidGenotype(byte base1, byte base2) {
int index1 = BaseUtils.simpleBaseToBaseIndex(base1);
if ( index1 == -1 )
throw new IllegalArgumentException(base1 + " is not a valid base character");
int index2 = BaseUtils.simpleBaseToBaseIndex(base2);
if ( index2 == -1 )
throw new IllegalArgumentException(base2 + " is not a valid base character");
return conversionMatrix[index1][index2];
}
/**
* create a diploid genotype, given 2 base indexes which may not necessarily be ordered correctly
* @param baseIndex1 base1
* @param baseIndex2 base2
* @return the diploid genotype
*/
public static DiploidGenotypeWithCorrectAlleleOrdering createDiploidGenotype(int baseIndex1, int baseIndex2) {
if ( baseIndex1 == -1 )
throw new IllegalArgumentException(baseIndex1 + " does not represent a valid base character");
if ( baseIndex2 == -1 )
throw new IllegalArgumentException(baseIndex2 + " does not represent a valid base character");
return conversionMatrix[baseIndex1][baseIndex2];
}
// Lookup table from a pair of base indexes to the corresponding genotype constant.
// The table is symmetric ([i][j] and [j][i] hold the same constant), so the order in
// which the two bases are supplied does not matter.
// Rows and columns are addressed with BaseUtils.simpleBaseToBaseIndex values by the factory
// methods; judging by the constant names the index order is A, C, G, T -- TODO confirm against BaseUtils.
private static final DiploidGenotypeWithCorrectAlleleOrdering[][] conversionMatrix = {
{ DiploidGenotypeWithCorrectAlleleOrdering.AA, DiploidGenotypeWithCorrectAlleleOrdering.AC, DiploidGenotypeWithCorrectAlleleOrdering.AG, DiploidGenotypeWithCorrectAlleleOrdering.AT },
{ DiploidGenotypeWithCorrectAlleleOrdering.AC, DiploidGenotypeWithCorrectAlleleOrdering.CC, DiploidGenotypeWithCorrectAlleleOrdering.CG, DiploidGenotypeWithCorrectAlleleOrdering.CT },
{ DiploidGenotypeWithCorrectAlleleOrdering.AG, DiploidGenotypeWithCorrectAlleleOrdering.CG, DiploidGenotypeWithCorrectAlleleOrdering.GG, DiploidGenotypeWithCorrectAlleleOrdering.GT },
{ DiploidGenotypeWithCorrectAlleleOrdering.AT, DiploidGenotypeWithCorrectAlleleOrdering.CT, DiploidGenotypeWithCorrectAlleleOrdering.GT, DiploidGenotypeWithCorrectAlleleOrdering.TT }
};
}

View File

@ -1,122 +0,0 @@
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.gatk.walkers.indels.HaplotypeIndelErrorModel;
import org.broadinstitute.sting.utils.MathUtils;
/**
* Created by IntelliJ IDEA.
* User: delangel
* Date: Sep 30, 2010
* Time: 1:47:55 PM
* To change this template use File | Settings | File Templates.
*/
public class DiploidIndelGenotypePriors implements GenotypePriors {
// --------------------------------------------------------------------------------------------------------------
//
// Constants and static information
//
// --------------------------------------------------------------------------------------------------------------
// Fixed heterozygosity value reported by getHeterozygosity().
public static final double INDEL_HETEROZYGOSITY = 1e-4;
// Template for flat priors, one slot per DiploidGenotype constant; the static initializer
// of this class fills every slot with log10(1 / number of genotypes).
private final static double[] flatPriors = new double[DiploidGenotype.values().length];
// --------------------------------------------------------------------------------------------------------------
//
// Diploid priors
//
// --------------------------------------------------------------------------------------------------------------
// log10 priors, read by getPrior() using DiploidGenotype.ordinal() as the index.
// Starts out null; constructors are responsible for installing an array.
private double[] priors = null;
/**
* Create a new DiploidGenotypePriors object with flat priors for each diploid genotype
*/
public DiploidIndelGenotypePriors() {
priors = flatPriors.clone();
}
public DiploidIndelGenotypePriors(double indelHeterozygosity, int eventLength, int haplotypeSize) {
double varPrior = getHaplotypePriors(indelHeterozygosity, eventLength, haplotypeSize);
priors[2] = Math.log10(varPrior*varPrior);
priors[1] = Math.log10(2*varPrior*(1-varPrior));
priors[0] = Math.log10((1-varPrior)*(1-varPrior));
}
/**
 * Returns the array of priors for each genotype, indexed by DiploidGenotype.ordinal() values.
 * The internal array itself is returned, not a copy.
 *
 * @return log10 priors as a double array
 */
public double[] getPriors() {
    return this.priors;
}
/**
 * Returns the prior associated with DiploidGenotype g.
 *
 * @param g the genotype whose prior is wanted; its ordinal() selects the array slot
 * @return log10 prior as a double
 */
public double getPrior(DiploidGenotype g) {
    final double[] allPriors = getPriors();
    return allPriors[g.ordinal()];
}
/**
 * @return the fixed INDEL_HETEROZYGOSITY constant
 */
public double getHeterozygosity() {
    return INDEL_HETEROZYGOSITY;
}
/**
 * Checks that every prior is a well-formed double that is negative or zero.
 *
 * @param throwException when true, the first bad prior is reported by throwing a RuntimeException
 *                       wrapping an IllegalStateException; when false, false is returned instead
 * @return true if all priors pass the checks, false if one fails and throwException is false
 */
public boolean validate(boolean throwException) {
    try {
        for ( final DiploidGenotype genotype : DiploidGenotype.values() ) {
            final int idx = genotype.ordinal();
            if ( MathUtils.wellFormedDouble(priors[idx]) && MathUtils.isNegativeOrZero(priors[idx]) )
                continue; // this prior is fine
            final String problem = String.format("Prior %f is badly formed %b", priors[idx], MathUtils.isNegativeOrZero(priors[idx]));
            throw new IllegalStateException(String.format("At %s: %s", genotype.toString(), problem));
        }
    } catch ( IllegalStateException e ) {
        if ( !throwException )
            return false;
        throw new RuntimeException(e);
    }
    return true;
}
/**
 * Computes the haplotype prior term consumed by the three-argument constructor.
 *
 * @param indelHeterozygosity indel heterozygosity (theta in the notes below)
 * @param eventLength         length of the indel event
 * @param haplotypeSize       size of the haplotype interval
 * @return the sum of the probToQual-based terms described in the method body
 */
public double getHaplotypePriors(double indelHeterozygosity, int eventLength, int haplotypeSize) {
    // Prior model on haplotypes.
    // Assumptions: indels are spread evenly through the genome (not true, but a simplifying assumption) and the
    // spread is memoryless, i.e. the probability that an indel lies in an interval A is independent of the
    // probability of another indel lying in interval B iff A and B do not overlap. Inter-indel distances can then
    // be approximated by an exponential distribution of mean 1/theta (theta = heterozygosity), and the number of
    // indels on an interval of size L is Poisson-distributed with parameter lambda = theta*L.
    // For small haplotype sizes and human heterozygosity lambda is typically << 1, so we further assume that only
    // one indel can occur in a given interval, with Pr(indel present) = lambda*exp(-lambda) and
    // Pr(no indel) = 1 - lambda*exp(-lambda) ~= exp(-lambda) for small lambda.
    // A deletion is taken to be as likely as an insertion (empirical observation, see e.g. Mills et al,
    // Genome Research 2006), and indel sizes follow the frequency spectrum
    //     Pr(event length = L) = K * abs(L)^(-1.89) * 10^(-0.015*abs(L)),
    // taking positive L = insertions, negative L = deletions. K is about 1.5716 for the probabilities to sum to one,
    // so -10*log10(Pr(event length = L)) = -10*log10(K) + 18.9*log10(abs(L)) + 0.15*abs(L).
    // Hence Pr(observe event size L in interval) ~ Pr(observe event L | event present) * Pr(event present in interval),
    // and -10*log10 of that is -10*log10(K) + 18.9*log10(abs(L)) + 0.15*abs(L) - 10*log10(theta*L); terms that
    // would be added to the ref hypothesis are ignored.
    final double lambda = (double)haplotypeSize * indelHeterozygosity;

    // the terms are evaluated and combined strictly left to right
    final double lambdaTerm = HaplotypeIndelErrorModel.probToQual(lambda);
    final double sizeSpectrumTerm = HaplotypeIndelErrorModel.probToQual(eventLength)*1.89;
    final double lengthPenalty = 0.15*eventLength;
    final double normalisationTerm = HaplotypeIndelErrorModel.probToQual(1.5716);
    final double insertionOrDeletionTerm = HaplotypeIndelErrorModel.probToQual(0.5);

    return lambdaTerm - sizeSpectrumTerm + lengthPenalty + normalisationTerm + insertionOrDeletionTerm;
}
static {
for ( DiploidGenotype g : DiploidGenotype.values() ) {
flatPriors[g.ordinal()] = Math.log10(1.0 / DiploidGenotype.values().length);
}
}
}

View File

@ -70,6 +70,7 @@ import static java.lang.Math.pow;
* From then on, you can call any of the add() routines to update the likelihoods and posteriors in the above
* model.
*/
@Deprecated
public class DiploidSNPGenotypeLikelihoods implements Cloneable {
public final static double DEFAULT_PCR_ERROR_RATE = 1e-4;

View File

@ -0,0 +1,489 @@
/*
* Copyright (c) 2010.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.genotyper;
import net.sf.samtools.SAMUtils;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.fragments.FragmentCollection;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import java.util.List;
import static java.lang.Math.log10;
import static java.lang.Math.pow;
/**
* Stable, error checking version of the Bayesian genotyper. Useful for calculating the likelihoods, priors,
* and posteriors given a pile of bases and quality scores
*
* Suppose we have bases b1, b2, ..., bN with qualities scores q1, q2, ..., qN. This object
* calculates:
*
* P(G | D) is proportional to P(G) * P(D | G)
*
* where, working in log10 space,
*
* log10 P(D | G) = sum_i log10 P(bi | G)
*
* and
*
* P(bi | G) = 1 - P(error | qi) if bi is in G
* = P(error | qi) / 3 if bi is not in G
*
* for homozygous genotypes and for heterozygous genotypes:
*
* P(bi | G) = (1 - P(error | qi)) / 2 + P(error | qi) / 6 if bi is in G
* = P(error | qi) / 3 if bi is not in G
*
* for each of the 10 unique diploid genotypes AA, AC, AG, .., TT
*
* Everything is stored as arrays indexed by DiploidGenotypeWithCorrectAlleleOrdering.ordinal() values in log10 space.
*
* This class accumulates only the likelihoods: it is constructed from a PCR error rate and holds no priors
* or posteriors of its own. After construction, you can call any of the add() routines to update the
* likelihoods in the above model.
*/
public class DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering implements Cloneable {
// Default PCR error rate offered to callers; the constructor itself always takes an explicit rate.
public final static double DEFAULT_PCR_ERROR_RATE = 1e-4;
// Ploidy value passed to every cache lookup made by the add() routines.
protected final static int FIXED_PLOIDY = 2;
// Size of the ploidy dimension of CACHE; one larger than FIXED_PLOIDY so that FIXED_PLOIDY is a valid index.
protected final static int MAX_PLOIDY = FIXED_PLOIDY + 1;
// log10(ploidy), subtracted in log space to divide each per-chromosome probability by the ploidy.
protected final static double ploidyAdjustment = log10(FIXED_PLOIDY);
// log10(3), used to spread an error probability over the three other bases.
protected final static double log10_3 = log10(3.0);
// When true, calculateGenotypeLikelihoods() prints the per-genotype likelihoods to System.out.
protected boolean VERBOSE = false;
//
// The fundamental data arrays associated with a Genotype Likelihoods object
//
// Accumulated log10 likelihoods, indexed by DiploidGenotypeWithCorrectAlleleOrdering.ordinal().
protected double[] log10Likelihoods = null;
// TODO: don't calculate this each time through
// log10(PCR error rate / 3), set by the constructor.
protected double log10_PCR_error_3;
// log10(1 - PCR error rate), set by the constructor.
protected double log10_1_minus_PCR_error;
/**
 * Create a new genotype likelihoods object for the given PCR error rate, with the
 * log10 likelihood of every diploid genotype reset to zero.
 *
 * @param PCR_error_rate the PCR error rate
 */
public DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering(double PCR_error_rate) {
    // log10(1 - rate): the fragment base matches the true base
    this.log10_1_minus_PCR_error = log10(1.0 - PCR_error_rate);
    // log10(rate / 3): the fragment base is one specific wrong base
    this.log10_PCR_error_3 = log10(PCR_error_rate) - log10_3;
    setToZero();
}
/**
 * Copies this object, giving the copy its own likelihoods array.
 *
 * @return the copy
 * @throws CloneNotSupportedException if super.clone() refuses to clone
 */
protected Object clone() throws CloneNotSupportedException {
    final DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering copy =
            (DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering)super.clone();
    // super.clone() shares array references, so duplicate the likelihoods explicitly
    copy.log10Likelihoods = this.log10Likelihoods.clone();
    return copy;
}
/**
 * Replaces the likelihoods array with a fresh copy of the all-zero template.
 */
protected void setToZero() {
    final double[] fresh = new double[genotypeZeros.length];
    System.arraycopy(genotypeZeros, 0, fresh, 0, genotypeZeros.length);
    log10Likelihoods = fresh; // likelihoods are all zeros
}
/**
 * Returns the array of log10 likelihoods for each genotype, indexed by the genotype's ordinal() value.
 * The internal array itself is returned, not a copy.
 *
 * @return likelihoods array
 */
public double[] getLikelihoods() {
    return this.log10Likelihoods;
}
// -------------------------------------------------------------------------------------
//
// add() routines. These are the workhorse routines for calculating the overall genotype
// likelihoods given observed bases and reads. Includes high-level operators all the
// way down to single base and qual functions.
//
// -------------------------------------------------------------------------------------
/**
 * Updates the likelihoods with every observation in the pileup. The pileup is first split into
 * fragments; singleton reads and overlapping read pairs are then handed to the matching add() overload.
 *
 * @param pileup                    read pileup
 * @param ignoreBadBases            should we ignore bad bases?
 * @param capBaseQualsAtMappingQual should we cap a base's quality by its read's mapping quality?
 * @param minBaseQual               the minimum base quality at which to consider a base valid
 * @return the sum of the counts returned by the per-read and per-pair add() calls
 */
public int add(ReadBackedPileup pileup, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual, int minBaseQual) {
    final FragmentCollection<PileupElement> fragments = pileup.toFragments();

    // reads with no overlapping mate
    int fromSingletons = 0;
    for ( final PileupElement singleton : fragments.getSingletonReads() )
        fromSingletons += add(singleton, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual);

    // read pairs that overlap at this position
    int fromPairs = 0;
    for ( final List<PileupElement> pair : fragments.getOverlappingPairs() )
        fromPairs += add(pair, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual);

    return fromSingletons + fromPairs;
}
/**
 * Updates the likelihoods with a single pileup element.
 *
 * @param elt                       the pileup element to add
 * @param ignoreBadBases            should we ignore bad bases?
 * @param capBaseQualsAtMappingQual should we cap the base quality at the read's mapping quality?
 * @param minBaseQual               the minimum base quality at which to consider a base valid
 * @return 0 when the element is unusable; the representative count for a usable reduced read;
 *         otherwise the result of the single-observation add
 */
public int add(PileupElement elt, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual, int minBaseQual) {
    final byte base = elt.getBase();
    final byte usableQual = qualToUse(elt, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual);
    if ( usableQual == 0 )
        return 0;

    // ordinary read: exactly one observation
    if ( !elt.getRead().isReducedRead() )
        return add(base, usableQual, (byte)0, (byte)0, 1);

    // reduced read representation: odd bases or deletions => don't use them
    if ( !BaseUtils.isRegularBase( base ) )
        return 0;

    final int observationCount = elt.getRepresentativeCount();
    add(base, usableQual, (byte)0, (byte)0, observationCount); // fast calculation of n identical likelihoods
    return observationCount; // we added nObs bases here
}
/**
 * Updates the likelihoods with an overlapping read pair taken from a single fragment.
 * Only the first two elements of the list are consulted.
 *
 * @param overlappingPair           the two pileup elements of the fragment
 * @param ignoreBadBases            should we ignore bad bases?
 * @param capBaseQualsAtMappingQual should we cap the base quality at the read's mapping quality?
 * @param minBaseQual               the minimum base quality at which to consider a base valid
 * @return 0 when neither base is usable, otherwise the result of the underlying add
 */
public int add(List<PileupElement> overlappingPair, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual, int minBaseQual) {
    final PileupElement first = overlappingPair.get(0);
    final PileupElement second = overlappingPair.get(1);

    final byte firstBase = first.getBase();
    final byte firstQual = qualToUse(first, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual);
    final byte secondBase = second.getBase();
    final byte secondQual = qualToUse(second, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual);

    // first read usable: add both (a zero second quality means "no second observation")
    if ( firstQual != 0 )
        return add(firstBase, firstQual, secondBase, secondQual);

    // only the second read is usable: treat it as a lone observation
    if ( secondQual != 0 )
        return add(secondBase, secondQual, (byte)0, (byte)0);

    // abort early if we didn't see any good bases
    return 0;
}
/**
 * Adds the cached (or freshly computed and cached) per-fragment likelihoods to the running totals.
 *
 * @param obsBase1 first observed base
 * @param qual1    base qual of first observed base
 * @param obsBase2 second observed base
 * @param qual2    base qual of second observed base; can be 0, indicating no second base was observed for this fragment
 * @param nObs     the number of times this quad of values was seen. Generally 1, but reduced reads can have nObs > 1 for synthetic reads
 * @return 0 if no likelihoods object is available for the observation, 1 otherwise
 */
private int add(byte obsBase1, byte qual1, byte obsBase2, byte qual2, int nObs) {
    // TODO-- Right now we assume that there are at most 2 reads per fragment. This assumption is fine
    // TODO-- given the current state of next-gen sequencing, but may need to be fixed in the future.
    // TODO-- However, when that happens, we'll need to be a lot smarter about the caching we do here.

    // Fetch the cached result when there is one, otherwise compute it and store it
    final DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering perFragment =
            inCache(obsBase1, qual1, obsBase2, qual2, FIXED_PLOIDY)
                    ? getCachedGenotypeLikelihoods(obsBase1, qual1, obsBase2, qual2, FIXED_PLOIDY)
                    : calculateCachedGenotypeLikelihoods(obsBase1, qual1, obsBase2, qual2, FIXED_PLOIDY);

    // for bad bases, there are no likelihoods
    if ( perFragment == null )
        return 0;

    final double[] contribution = perFragment.getLikelihoods();
    for ( final DiploidGenotypeWithCorrectAlleleOrdering genotype : DiploidGenotypeWithCorrectAlleleOrdering.values() ) {
        final int slot = genotype.ordinal();
        // nObs identical observations multiply the log10 likelihood
        log10Likelihoods[slot] += contribution[slot] * nObs;
    }
    return 1;
}
/**
 * Convenience overload: adds the observation exactly once.
 *
 * @return whatever the five-argument add returns for nObs = 1
 */
private int add(byte obsBase1, byte qual1, byte obsBase2, byte qual2) {
    final int singleObservation = 1;
    return add(obsBase1, qual1, obsBase2, qual2, singleObservation);
}
// -------------------------------------------------------------------------------------
//
// Dealing with the cache routines
//
// -------------------------------------------------------------------------------------
// Class-wide cache of per-fragment likelihoods. As used by setCache()/getCache() the dimensions are:
//   [index of base 1][qual 1][index of base 2, or BaseUtils.BASES.length when qual 2 is 0][qual 2][ploidy]
// NOTE(review): the cache is static and keyed only by bases, quals and ploidy, while the cached values
// are computed from the instance's PCR error terms; instances built with different PCR error rates
// would therefore share entries -- confirm that a single rate is used per JVM.
// NOTE(review): no synchronization is visible around reads or writes of this array.
static DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering[][][][][] CACHE = new DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering[BaseUtils.BASES.length][QualityUtils.MAX_QUAL_SCORE+1][BaseUtils.BASES.length+1][QualityUtils.MAX_QUAL_SCORE+1][MAX_PLOIDY];
/**
 * @return true when the cache already holds a likelihoods object for this observation
 */
protected boolean inCache(byte observedBase1, byte qualityScore1, byte observedBase2, byte qualityScore2, int ploidy) {
    final DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering cached =
            getCache(CACHE, observedBase1, qualityScore1, observedBase2, qualityScore2, ploidy);
    return cached != null;
}
/**
 * Fetches the cached likelihoods for an observation that is expected to be in the cache already.
 *
 * @return the cached likelihoods object
 * @throws RuntimeException if the cache slot is empty
 */
protected DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering getCachedGenotypeLikelihoods(byte observedBase1, byte qualityScore1, byte observedBase2, byte qualityScore2, int ploidy) {
    final DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering cached =
            getCache(CACHE, observedBase1, qualityScore1, observedBase2, qualityScore2, ploidy);
    if ( cached != null )
        return cached;

    final String message = String.format("BUG: trying to fetch an unset cached genotype likelihood at base1=%c, qual1=%d, base2=%c, qual2=%d, ploidy=%d",
            observedBase1, qualityScore1, observedBase2, qualityScore2, ploidy);
    throw new RuntimeException(message);
}
/**
 * Computes the likelihoods for an observation, stores them in the cache and returns them.
 *
 * @return the newly computed likelihoods object
 */
protected DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering calculateCachedGenotypeLikelihoods(byte observedBase1, byte qualityScore1, byte observedBase2, byte qualityScore2, int ploidy) {
    final DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering computed =
            calculateGenotypeLikelihoods(observedBase1, qualityScore1, observedBase2, qualityScore2);
    // remember the result for subsequent identical observations
    setCache(CACHE, observedBase1, qualityScore1, observedBase2, qualityScore2, ploidy, computed);
    return computed;
}
/**
 * Stores a likelihoods object in the given cache.
 *
 * The slot is addressed by [base 1 index][qual 1][base 2 index][qual 2][ploidy]; when qualityScore2
 * is 0 (no second observation) the extra index BaseUtils.BASES.length is used for base 2.
 */
protected void setCache( DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering[][][][][] cache,
                         byte observedBase1, byte qualityScore1, byte observedBase2, byte qualityScore2, int ploidy,
                         DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering val ) {
    final int base1Index = BaseUtils.simpleBaseToBaseIndex(observedBase1);
    final int base2Index;
    if ( qualityScore2 == 0 )
        base2Index = BaseUtils.BASES.length; // reserved slot: no second base observed
    else
        base2Index = BaseUtils.simpleBaseToBaseIndex(observedBase2);

    cache[base1Index][qualityScore1][base2Index][qualityScore2][ploidy] = val;
}
/**
 * Reads a likelihoods object from the given cache.
 *
 * The slot is addressed by [base 1 index][qual 1][base 2 index][qual 2][ploidy]; when qualityScore2
 * is 0 (no second observation) the extra index BaseUtils.BASES.length is used for base 2.
 *
 * @return the cached object, or null when the slot has not been filled
 */
protected DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering getCache(DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering[][][][][] cache,
                                                                        byte observedBase1, byte qualityScore1, byte observedBase2, byte qualityScore2, int ploidy) {
    final int base1Index = BaseUtils.simpleBaseToBaseIndex(observedBase1);
    final int base2Index;
    if ( qualityScore2 == 0 )
        base2Index = BaseUtils.BASES.length; // reserved slot: no second base observed
    else
        base2Index = BaseUtils.simpleBaseToBaseIndex(observedBase2);

    return cache[base1Index][qualityScore1][base2Index][qualityScore2][ploidy];
}
/**
 * Builds a fresh likelihoods object for a single fragment observation.
 *
 * The per-base log10 likelihoods are computed first; each genotype's likelihood is then the
 * average (over its two alleles) of the corresponding per-base probabilities, stored as log10.
 *
 * @return a clone of this object whose likelihoods hold the values for this observation only
 * @throws RuntimeException if cloning is not supported
 */
protected DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering calculateGenotypeLikelihoods(byte observedBase1, byte qualityScore1, byte observedBase2, byte qualityScore2) {
    final double[] log10PerBase = computeLog10Likelihoods(observedBase1, qualityScore1, observedBase2, qualityScore2);

    final DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering result;
    try {
        result = (DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering)this.clone();
    } catch ( CloneNotSupportedException e ) {
        throw new RuntimeException(e);
    }
    result.setToZero();

    // we need to adjust for ploidy. We take the raw p(obs | chrom) / ploidy, which is -log10(ploidy) in log space
    for ( final DiploidGenotypeWithCorrectAlleleOrdering genotype : DiploidGenotypeWithCorrectAlleleOrdering.values() ) {
        // todo assumes ploidy is 2 -- should be generalized to a loop over the alleles
        final double viaFirstAllele = pow(10, log10PerBase[BaseUtils.simpleBaseToBaseIndex(genotype.base1)] - ploidyAdjustment);
        final double viaSecondAllele = pow(10, log10PerBase[BaseUtils.simpleBaseToBaseIndex(genotype.base2)] - ploidyAdjustment);
        result.log10Likelihoods[genotype.ordinal()] += log10(viaFirstAllele + viaSecondAllele);
    }

    if ( VERBOSE ) {
        // header row of genotype names, then a row of their log10 likelihoods
        for ( final DiploidGenotypeWithCorrectAlleleOrdering genotype : DiploidGenotypeWithCorrectAlleleOrdering.values() )
            System.out.printf("%s\t", genotype);
        System.out.println();
        for ( final DiploidGenotypeWithCorrectAlleleOrdering genotype : DiploidGenotypeWithCorrectAlleleOrdering.values() )
            System.out.printf("%.2f\t", result.log10Likelihoods[genotype.ordinal()]);
        System.out.println();
    }

    return result;
}
/**
 * Computes, for each possible true base, the log10 likelihood of the observed fragment.
 *
 * For every true base the method sums over all possible fragment bases the product of the PCR
 * term (1 - PCR error when the fragment base equals the true base, PCR error / 3 otherwise) and
 * the probability of each observed base given the fragment base. A quality of zero marks an
 * observation as absent and leaves it out of the product.
 *
 * @param observedBase1 the base observed on the 1st read of the fragment
 * @param qualityScore1 the qual of the base on the 1st read of the fragment, or zero if NA
 * @param observedBase2 the base observed on the 2nd read of the fragment
 * @param qualityScore2 the qual of the base on the 2nd read of the fragment, or zero if NA
 * @return log10 likelihoods indexed by BaseUtils.simpleBaseToBaseIndex of the true base
 */
protected double[] computeLog10Likelihoods(byte observedBase1, byte qualityScore1, byte observedBase2, byte qualityScore2) {
    final double[] log10PerTrueBase = baseZeros.clone();
    final boolean haveFirstObservation = qualityScore1 != 0;
    final boolean haveSecondObservation = qualityScore2 != 0;

    for ( final byte trueBase : BaseUtils.BASES ) {
        double summedProbability = 0.0;

        for ( final byte fragmentBase : BaseUtils.BASES ) {
            // PCR step: the fragment either kept the true base or was mutated into this one
            double log10Fragment;
            if ( trueBase == fragmentBase )
                log10Fragment = log10_1_minus_PCR_error;
            else
                log10Fragment = log10_PCR_error_3;

            // sequencing step: each available read observation of the fragment base
            if ( haveFirstObservation )
                log10Fragment += log10PofObservingBaseGivenChromosome(observedBase1, fragmentBase, qualityScore1);
            if ( haveSecondObservation )
                log10Fragment += log10PofObservingBaseGivenChromosome(observedBase2, fragmentBase, qualityScore2);

            summedProbability += pow(10, log10Fragment);
        }

        log10PerTrueBase[BaseUtils.simpleBaseToBaseIndex(trueBase)] = log10(summedProbability);
    }

    return log10PerTrueBase;
}
/**
 * log10 probability of reading observedBase when the underlying base is chromBase.
 *
 * @param observedBase observed base
 * @param chromBase    target base
 * @param qual         phred-scaled base quality
 * @return log10(1 - e) when the bases agree, log10(e / 3) when they differ, with e = 10^(-qual/10)
 */
protected double log10PofObservingBaseGivenChromosome(byte observedBase, byte chromBase, byte qual) {
    if ( observedBase != chromBase ) {
        // the base is inconsistent with the chromosome -- it's e * P(chromBase | observedBase is an error)
        return qual / -10.0 + (-log10_3);
    }

    // the base is consistent with the chromosome -- it's 1 - e
    final double errorProbability = pow(10, (qual / -10.0));
    return log10(1.0 - errorProbability);
}
/**
 * Picks the phred-scaled base quality to use for a pileup element when computing likelihoods.
 * A result of 0 marks the observation as unusable.
 *
 * @param p                         Pileup element
 * @param ignoreBadBases            Should non-regular bases be rejected (quality 0)?
 * @param capBaseQualsAtMappingQual Should the base quality be capped at the read's mapping quality?
 * @param minBaseQual               Qualities below this value are replaced by 0
 * @return the base quality to use, possibly capped, or 0
 * @throws UserException.MalformedBAM if the element's quality exceeds SAMUtils.MAX_PHRED_SCORE
 */
private static byte qualToUse(PileupElement p, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual, int minBaseQual) {
    final boolean rejectAsBadBase = ignoreBadBases && !BaseUtils.isRegularBase( p.getBase() );
    if ( rejectAsBadBase )
        return 0;

    final byte rawQual = p.getQual();
    if ( rawQual > SAMUtils.MAX_PHRED_SCORE )
        throw new UserException.MalformedBAM(p.getRead(), String.format("the maximum allowed quality score is %d, but a quality of %d was observed in read %s. Perhaps your BAM incorrectly encodes the quality scores in Sanger format; see http://en.wikipedia.org/wiki/FASTQ_format for more details", SAMUtils.MAX_PHRED_SCORE, rawQual, p.getRead().getReadName()));

    // optionally cap by the mapping quality of the read
    final byte cappedQual = capBaseQualsAtMappingQual
            ? (byte)Math.min((int)rawQual, p.getMappingQual())
            : rawQual;

    // anything under the minimum counts as no observation
    return ( (int)cappedQual < minBaseQual ) ? (byte)0 : cappedQual;
}
// -----------------------------------------------------------------------------------------------------------------
//
//
// helper routines
//
//
// -----------------------------------------------------------------------------------------------------------------
/**
 * Renders each genotype with its log10 likelihood, followed by the sum of the
 * likelihoods converted back out of log10 space.
 *
 * @return string representation
 */
public String toString() {
    final StringBuilder text = new StringBuilder();
    double linearTotal = 0;
    for ( final DiploidGenotypeWithCorrectAlleleOrdering genotype : DiploidGenotypeWithCorrectAlleleOrdering.values() ) {
        final double log10Value = log10Likelihoods[genotype.ordinal()];
        text.append(String.format("%s %.10f ", genotype, log10Value));
        linearTotal += Math.pow(10,log10Value);
    }
    return text.append(String.format(" %f", linearTotal)).toString();
}
// -----------------------------------------------------------------------------------------------------------------
//
//
// Validation routines
//
//
// -----------------------------------------------------------------------------------------------------------------
/**
 * Validates the likelihoods, throwing on the first badly formed value.
 *
 * @return true when every likelihood passes validation
 */
public boolean validate() {
    final boolean throwOnFailure = true;
    return validate(throwOnFailure);
}
/**
 * Checks that every log10 likelihood is a well-formed double that is negative or zero.
 *
 * @param throwException when true, the first bad likelihood is reported by throwing a RuntimeException
 *                       wrapping an IllegalStateException; when false, false is returned instead
 * @return true if all likelihoods pass the checks, false if one fails and throwException is false
 */
public boolean validate(boolean throwException) {
    try {
        for ( final DiploidGenotypeWithCorrectAlleleOrdering genotype : DiploidGenotypeWithCorrectAlleleOrdering.values() ) {
            final int slot = genotype.ordinal();
            if ( MathUtils.wellFormedDouble(log10Likelihoods[slot]) && MathUtils.isNegativeOrZero(log10Likelihoods[slot]) )
                continue; // this likelihood is fine
            final String problem = String.format("Likelihood %f is badly formed", log10Likelihoods[slot]);
            throw new IllegalStateException(String.format("At %s: %s", genotype.toString(), problem));
        }
    } catch ( IllegalStateException e ) {
        if ( !throwException )
            return false;
        throw new RuntimeException(e);
    }
    return true;
}
//
// Constant static data
//
// All-zero template with one slot per genotype; cloned whenever likelihoods are reset.
private final static double[] genotypeZeros = new double[DiploidGenotypeWithCorrectAlleleOrdering.values().length];
// All-zero template with one slot per base; cloned as the starting point for per-base likelihoods.
private final static double[] baseZeros = new double[BaseUtils.BASES.length];

// Explicitly zero both templates.
static {
    for ( int slot = 0; slot < genotypeZeros.length; slot++ )
        genotypeZeros[slot] = 0.0;

    for ( int b = 0; b < BaseUtils.BASES.length; b++ )
        baseZeros[BaseUtils.simpleBaseToBaseIndex(BaseUtils.BASES[b])] = 0.0;
}
}

View File

@ -29,6 +29,7 @@ import org.broadinstitute.sting.utils.MathUtils;
import java.util.Arrays;
@Deprecated
public class DiploidSNPGenotypePriors implements GenotypePriors {
// --------------------------------------------------------------------------------------------------------------
//

View File

@ -43,7 +43,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
}
public List<Allele> getLog10PNonRef(final VariantContext vc,
final double[][] log10AlleleFrequencyPriors,
final double[] log10AlleleFrequencyPriors,
final AlleleFrequencyCalculationResult result) {
GenotypesContext GLs = vc.getGenotypes();
@ -56,26 +56,14 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
alleles = new ArrayList<Allele>(MAX_ALTERNATE_ALLELES_TO_GENOTYPE + 1);
alleles.add(vc.getReference());
alleles.addAll(chooseMostLikelyAlternateAlleles(vc, MAX_ALTERNATE_ALLELES_TO_GENOTYPE));
GLs = UnifiedGenotyperEngine.subsetAlleles(vc, alleles, false);
GLs = VariantContextUtils.subsetDiploidAlleles(vc, alleles, false);
}
//linearExact(GLs, log10AlleleFrequencyPriors[0], log10AlleleFrequencyLikelihoods, log10AlleleFrequencyPosteriors);
linearExactMultiAllelic(GLs, alleles.size() - 1, log10AlleleFrequencyPriors, result);
return alleles;
}
/**
 * Pairs an allele with a running likelihood sum. The natural ordering is descending by sum:
 * an instance with a larger sum sorts before one with a smaller sum.
 */
private static final class LikelihoodSum implements Comparable<LikelihoodSum> {
    public double sum = 0.0;
    public Allele allele;

    public LikelihoodSum(Allele allele) {
        this.allele = allele;
    }

    public int compareTo(LikelihoodSum other) {
        final double delta = this.sum - other.sum;
        if ( delta < 0.0 )
            return 1;   // smaller sum goes later
        if ( delta > 0.0 )
            return -1;  // larger sum goes earlier
        return 0;
    }
}
private static final int PL_INDEX_OF_HOM_REF = 0;
private static final List<Allele> chooseMostLikelyAlternateAlleles(VariantContext vc, int numAllelesToChoose) {
@ -113,22 +101,6 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
return orderedBestAlleles;
}
/**
 * Collects the genotype likelihood vectors of the samples, in sample-name order.
 *
 * Samples without likelihoods are skipped, as are samples whose likelihoods sum to
 * UnifiedGenotyperEngine.SUM_GL_THRESH_NOCALL or more. Index 0 of the result is a
 * placeholder, so the first real sample sits at index 1.
 *
 * @param GLs the genotypes to read likelihoods from
 * @return the placeholder followed by one likelihood vector per retained sample
 */
private static final ArrayList<double[]> getGLs(GenotypesContext GLs) {
    final ArrayList<double[]> collected = new ArrayList<double[]>(GLs.size());
    collected.add(new double[]{0.0,0.0,0.0}); // dummy

    for ( final Genotype sample : GLs.iterateInSampleNameOrder() ) {
        if ( !sample.hasLikelihoods() )
            continue;

        final double[] vector = sample.getLikelihoods().getAsVector();
        if ( MathUtils.sum(vector) < UnifiedGenotyperEngine.SUM_GL_THRESH_NOCALL )
            collected.add(vector);
    }

    return collected;
}
// -------------------------------------------------------------------------------------
//
// Multi-allelic implementation.
@ -153,7 +125,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
@Override
public boolean equals(Object obj) {
return (obj instanceof ExactACcounts) ? Arrays.equals(counts, ((ExactACcounts)obj).counts) : false;
return (obj instanceof ExactACcounts) && Arrays.equals(counts, ((ExactACcounts)obj).counts);
}
@Override
@ -203,24 +175,13 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
}
public boolean equals(Object obj) {
return (obj instanceof ExactACset) ? ACcounts.equals(((ExactACset)obj).ACcounts) : false;
return (obj instanceof ExactACset) && ACcounts.equals(((ExactACset)obj).ACcounts);
}
}
// TODO -- remove me
// Overload that accepts an extra boolean and ignores it: the call is forwarded unchanged
// to the four-argument linearExactMultiAllelic.
public static void linearExactMultiAllelic(final GenotypesContext GLs,
final int numAlternateAlleles,
final double[][] log10AlleleFrequencyPriors,
final AlleleFrequencyCalculationResult result,
final boolean foo) {
linearExactMultiAllelic(GLs, numAlternateAlleles, log10AlleleFrequencyPriors, result);
}
public static void linearExactMultiAllelic(final GenotypesContext GLs,
final int numAlternateAlleles,
final double[][] log10AlleleFrequencyPriors,
final double[] log10AlleleFrequencyPriors,
final AlleleFrequencyCalculationResult result) {
final ArrayList<double[]> genotypeLikelihoods = getGLs(GLs);
@ -272,7 +233,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
final int numChr,
final LinkedList<ExactACset> ACqueue,
final HashMap<ExactACcounts, ExactACset> indexesToACset,
final double[][] log10AlleleFrequencyPriors,
final double[] log10AlleleFrequencyPriors,
final AlleleFrequencyCalculationResult result) {
//if ( DEBUG )
@ -360,7 +321,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
private static void computeLofK(final ExactACset set,
final ArrayList<double[]> genotypeLikelihoods,
final double[][] log10AlleleFrequencyPriors,
final double[] log10AlleleFrequencyPriors,
final AlleleFrequencyCalculationResult result) {
set.log10Likelihoods[0] = 0.0; // the zero case
@ -370,47 +331,39 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
if ( totalK == 0 ) {
for ( int j = 1; j < set.log10Likelihoods.length; j++ )
set.log10Likelihoods[j] = set.log10Likelihoods[j-1] + genotypeLikelihoods.get(j)[HOM_REF_INDEX];
final double log10Lof0 = set.log10Likelihoods[set.log10Likelihoods.length-1];
result.setLog10LikelihoodOfAFzero(log10Lof0);
result.setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]);
return;
}
// k > 0 for at least one k
else {
// the non-AA possible conformations were dealt with by pushes from dependent sets;
// now deal with the AA case (which depends on previous cells in this column) and then update the L(j,k) value
for ( int j = 1; j < set.log10Likelihoods.length; j++ ) {
if ( totalK < 2*j-1 ) {
final double[] gl = genotypeLikelihoods.get(j);
final double conformationValue = MathUtils.log10Cache[2*j-totalK] + MathUtils.log10Cache[2*j-totalK-1] + set.log10Likelihoods[j-1] + gl[HOM_REF_INDEX];
set.log10Likelihoods[j] = MathUtils.approximateLog10SumLog10(set.log10Likelihoods[j], conformationValue);
}
// if we got here, then k > 0 for at least one k.
// the non-AA possible conformations were already dealt with by pushes from dependent sets;
// now deal with the AA case (which depends on previous cells in this column) and then update the L(j,k) value
for ( int j = 1; j < set.log10Likelihoods.length; j++ ) {
final double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1];
set.log10Likelihoods[j] = set.log10Likelihoods[j] - logDenominator;
if ( totalK < 2*j-1 ) {
final double[] gl = genotypeLikelihoods.get(j);
final double conformationValue = MathUtils.log10Cache[2*j-totalK] + MathUtils.log10Cache[2*j-totalK-1] + set.log10Likelihoods[j-1] + gl[HOM_REF_INDEX];
set.log10Likelihoods[j] = MathUtils.approximateLog10SumLog10(set.log10Likelihoods[j], conformationValue);
}
final double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1];
set.log10Likelihoods[j] = set.log10Likelihoods[j] - logDenominator;
}
final double log10LofK = set.log10Likelihoods[set.log10Likelihoods.length-1];
double log10LofK = set.log10Likelihoods[set.log10Likelihoods.length-1];
// determine the power of theta to use
int nonRefAlleles = 0;
for ( int i = 0; i < set.ACcounts.getCounts().length; i++ ) {
if ( set.ACcounts.getCounts()[i] > 0 )
nonRefAlleles++;
}
// for k=0, we don't want to put that value into the likelihoods/posteriors matrix, but instead want to set the value in the results object
if ( nonRefAlleles == 0 ) {
result.log10LikelihoodOfAFzero = log10LofK;
result.log10PosteriorOfAFzero = log10LofK + log10AlleleFrequencyPriors[0][0];
} else {
// update the likelihoods/posteriors vectors which are collapsed views of each of the various ACs
for ( int i = 0; i < set.ACcounts.getCounts().length; i++ ) {
int AC = set.ACcounts.getCounts()[i];
result.log10AlleleFrequencyLikelihoods[i][AC] = MathUtils.approximateLog10SumLog10(result.log10AlleleFrequencyLikelihoods[i][AC], log10LofK);
final double prior = log10AlleleFrequencyPriors[nonRefAlleles-1][AC];
result.log10AlleleFrequencyPosteriors[i][AC] = MathUtils.approximateLog10SumLog10(result.log10AlleleFrequencyPosteriors[i][AC], log10LofK + prior);
}
// update the MLE if necessary
result.updateMLEifNeeded(log10LofK, set.ACcounts.counts);
// apply the priors over each alternate allele
for ( final int ACcount : set.ACcounts.getCounts() ) {
if ( ACcount > 0 )
log10LofK += log10AlleleFrequencyPriors[ACcount];
}
result.updateMAPifNeeded(log10LofK, set.ACcounts.counts);
}
private static void pushData(final ExactACset targetSet,
@ -466,6 +419,12 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
return coeff;
}
/**
 * Restricts the genotypes of a variant context to a chosen set of alleles.
 *
 * This implementation hands the work straight to
 * {@code VariantContextUtils.subsetDiploidAlleles}; the {@code ploidy}
 * argument is accepted but is not forwarded to that call.
 *
 * @param vc              variant context whose genotypes are to be subset
 * @param allelesToUse    alleles to keep
 * @param assignGenotypes passed through unchanged to the diploid subsetting helper
 * @param ploidy          not used by this implementation
 * @return whatever {@code VariantContextUtils.subsetDiploidAlleles} returns for these arguments
 */
public GenotypesContext subsetAlleles(final VariantContext vc,
                                      final List<Allele> allelesToUse,
                                      final boolean assignGenotypes,
                                      final int ploidy) {
    final GenotypesContext subsetGenotypes =
            VariantContextUtils.subsetDiploidAlleles(vc, allelesToUse, assignGenotypes);
    return subsetGenotypes;
}
// -------------------------------------------------------------------------------------
//

View File

@ -47,9 +47,17 @@ import java.util.Map;
*/
public abstract class GenotypeLikelihoodsCalculationModel implements Cloneable {
/* public enum Model {
SNP,
INDEL,
BOTH
}
*/
/**
 * Selectable kinds of genotype-likelihoods calculation model.
 *
 * Compared with the commented-out declaration directly above, this version
 * adds POOLSNP and POOLINDEL to the SNP / INDEL / BOTH set.
 * NOTE(review): the POOL* constants are presumably the pooled-sample
 * counterparts of SNP and INDEL -- confirm against the code that switches on them.
 */
public enum Model {
SNP,
INDEL,
POOLSNP,
POOLINDEL,
BOTH
}
@ -60,7 +68,7 @@ public abstract class GenotypeLikelihoodsCalculationModel implements Cloneable {
GENOTYPE_GIVEN_ALLELES
}
protected UnifiedArgumentCollection UAC;
protected final UnifiedArgumentCollection UAC;
protected Logger logger;
/**
@ -70,7 +78,7 @@ public abstract class GenotypeLikelihoodsCalculationModel implements Cloneable {
*/
protected GenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) {
// both collaborators are mandatory: reject a missing logger or argument collection up front
if ( logger == null || UAC == null ) throw new ReviewedStingException("Bad arguments");
// NOTE(review): two assignments to this.UAC appear back to back. This looks like the removed
// side (private copy via clone()) and the added side (shared reference to the caller's object)
// of a diff hunk rendered without +/- markers -- confirm which one is live before relying on
// whether later changes to the caller's UAC are visible to this model.
this.UAC = UAC.clone();
this.UAC = UAC;
this.logger = logger;
}
@ -81,7 +89,6 @@ public abstract class GenotypeLikelihoodsCalculationModel implements Cloneable {
* @param ref reference context
* @param contexts stratified alignment contexts
* @param contextType stratified context type
* @param priors priors to use for GLs
* @param alternateAllelesToUse the alternate allele to use, null if not set
* @param useBAQedPileup should we use the BAQed pileup or the raw one?
* @param locParser Genome Loc Parser
@ -91,7 +98,6 @@ public abstract class GenotypeLikelihoodsCalculationModel implements Cloneable {
final ReferenceContext ref,
final Map<String, AlignmentContext> contexts,
final AlignmentContextUtils.ReadOrientation contextType,
final GenotypePriors priors,
final List<Allele> alternateAllelesToUse,
final boolean useBAQedPileup,
final GenomeLocParser locParser);

View File

@ -35,16 +35,9 @@ import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.Haplotype;
import org.broadinstitute.sting.utils.clipping.ReadClipper;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.StingException;
import org.broadinstitute.sting.utils.pileup.ExtendedEventPileupElement;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import org.broadinstitute.sting.utils.variantcontext.*;
import java.util.*;
@ -52,11 +45,9 @@ import java.util.*;
public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel {
private final int HAPLOTYPE_SIZE;
private final int minIndelCountForGenotyping;
private final boolean getAlleleListFromVCF;
private boolean DEBUG = false;
private final boolean doMultiAllelicCalls = true;
private boolean ignoreSNPAllelesWhenGenotypingIndels = false;
private PairHMMIndelErrorModel pairModel;
@ -72,7 +63,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
// gdebug removeme
// todo -cleanup
private GenomeLoc lastSiteVisited;
private ArrayList<Allele> alleleList;
private List<Allele> alleleList = new ArrayList<Allele>();
static {
indelLikelihoodMap.set(new HashMap<PileupElement, LinkedHashMap<Allele, Double>>());
@ -83,204 +74,19 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
super(UAC, logger);
pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY, UAC.INDEL_GAP_CONTINUATION_PENALTY,
UAC.OUTPUT_DEBUG_INDEL_INFO, !UAC.DONT_DO_BANDED_INDEL_COMPUTATION);
alleleList = new ArrayList<Allele>();
getAlleleListFromVCF = UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES;
minIndelCountForGenotyping = UAC.MIN_INDEL_COUNT_FOR_GENOTYPING;
HAPLOTYPE_SIZE = UAC.INDEL_HAPLOTYPE_SIZE;
DEBUG = UAC.OUTPUT_DEBUG_INDEL_INFO;
haplotypeMap = new LinkedHashMap<Allele, Haplotype>();
ignoreSNPAllelesWhenGenotypingIndels = UAC.IGNORE_SNP_ALLELES;
}
private ArrayList<Allele> computeConsensusAlleles(ReferenceContext ref,
Map<String, AlignmentContext> contexts,
AlignmentContextUtils.ReadOrientation contextType, GenomeLocParser locParser) {
Allele refAllele = null, altAllele = null;
GenomeLoc loc = ref.getLocus();
ArrayList<Allele> aList = new ArrayList<Allele>();
HashMap<String, Integer> consensusIndelStrings = new HashMap<String, Integer>();
int insCount = 0, delCount = 0;
// quick check of total number of indels in pileup
for (Map.Entry<String, AlignmentContext> sample : contexts.entrySet()) {
AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType);
final ReadBackedExtendedEventPileup indelPileup = context.getExtendedEventPileup();
insCount += indelPileup.getNumberOfInsertions();
delCount += indelPileup.getNumberOfDeletions();
}
if (insCount < minIndelCountForGenotyping && delCount < minIndelCountForGenotyping)
return aList;
for (Map.Entry<String, AlignmentContext> sample : contexts.entrySet()) {
// todo -- warning, can be duplicating expensive partition here
AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType);
final ReadBackedExtendedEventPileup indelPileup = context.getExtendedEventPileup();
for (ExtendedEventPileupElement p : indelPileup.toExtendedIterable()) {
//SAMRecord read = p.getRead();
GATKSAMRecord read = ReadClipper.hardClipAdaptorSequence(p.getRead());
if (read == null)
continue;
if (ReadUtils.is454Read(read)) {
continue;
}
/* if (DEBUG && p.isIndel()) {
System.out.format("Read: %s, cigar: %s, aln start: %d, aln end: %d, p.len:%d, Type:%s, EventBases:%s\n",
read.getReadName(),read.getCigar().toString(),read.getAlignmentStart(),read.getAlignmentEnd(),
p.getEventLength(),p.getType().toString(), p.getEventBases());
}
*/
String indelString = p.getEventBases();
if (p.isInsertion()) {
boolean foundKey = false;
// copy of hashmap into temp arrayList
ArrayList<Pair<String,Integer>> cList = new ArrayList<Pair<String,Integer>>();
for (String s : consensusIndelStrings.keySet()) {
cList.add(new Pair<String, Integer>(s,consensusIndelStrings.get(s)));
}
if (read.getAlignmentEnd() == loc.getStart()) {
// first corner condition: a read has an insertion at the end, and we're right at the insertion.
// In this case, the read could have any of the inserted bases and we need to build a consensus
for (int k=0; k < cList.size(); k++) {
String s = cList.get(k).getFirst();
int cnt = cList.get(k).getSecond();
// case 1: current insertion is prefix of indel in hash map
if (s.startsWith(indelString)) {
cList.set(k,new Pair<String, Integer>(s,cnt+1));
foundKey = true;
}
else if (indelString.startsWith(s)) {
// case 2: indel stored in hash table is prefix of current insertion
// In this case, new bases are new key.
foundKey = true;
cList.set(k,new Pair<String, Integer>(indelString,cnt+1));
}
}
if (!foundKey)
// none of the above: event bases not supported by previous table, so add new key
cList.add(new Pair<String, Integer>(indelString,1));
}
else if (read.getAlignmentStart() == loc.getStart()+1) {
// opposite corner condition: read will start at current locus with an insertion
for (int k=0; k < cList.size(); k++) {
String s = cList.get(k).getFirst();
int cnt = cList.get(k).getSecond();
if (s.endsWith(indelString)) {
// case 1: current insertion (indelString) is suffix of indel in hash map (s)
cList.set(k,new Pair<String, Integer>(s,cnt+1));
foundKey = true;
}
else if (indelString.endsWith(s)) {
// case 2: indel stored in hash table (s) is suffix of current insertion (indelString)
// In this case, new bases are new key.
foundKey = true;
cList.set(k,new Pair<String, Integer>(indelString,cnt+1));
}
}
if (!foundKey)
// none of the above: event bases not supported by previous table, so add new key
cList.add(new Pair<String, Integer>(indelString,1));
}
else {
// normal case: insertion somewhere in the middle of a read: add count to arrayList
int cnt = consensusIndelStrings.containsKey(indelString)? consensusIndelStrings.get(indelString):0;
cList.add(new Pair<String, Integer>(indelString,cnt+1));
}
// copy back arrayList into hashMap
consensusIndelStrings.clear();
for (Pair<String,Integer> pair : cList) {
consensusIndelStrings.put(pair.getFirst(),pair.getSecond());
}
}
else if (p.isDeletion()) {
indelString = String.format("D%d",p.getEventLength());
int cnt = consensusIndelStrings.containsKey(indelString)? consensusIndelStrings.get(indelString):0;
consensusIndelStrings.put(indelString,cnt+1);
}
}
}
Collection<VariantContext> vcs = new ArrayList<VariantContext>();
int maxAlleleCnt = 0;
String bestAltAllele = "";
for (String s : consensusIndelStrings.keySet()) {
int curCnt = consensusIndelStrings.get(s), stop = 0;
// if the observed count is at or above the minimum threshold, we will genotype this allele
if (curCnt < minIndelCountForGenotyping)
continue;
if (s.startsWith("D")) {
// get deletion length
int dLen = Integer.valueOf(s.substring(1));
// get ref bases of accurate deletion
int startIdxInReference = 1 + loc.getStart() - ref.getWindow().getStart();
stop = loc.getStart() + dLen;
byte[] refBases = Arrays.copyOfRange(ref.getBases(), startIdxInReference, startIdxInReference + dLen);
if (Allele.acceptableAlleleBases(refBases)) {
refAllele = Allele.create(refBases, true);
altAllele = Allele.create(Allele.NULL_ALLELE_STRING, false);
}
else continue; // don't go on with this allele if refBases are non-standard
} else {
// insertion case
if (Allele.acceptableAlleleBases(s)) {
refAllele = Allele.create(Allele.NULL_ALLELE_STRING, true);
altAllele = Allele.create(s, false);
stop = loc.getStart();
}
else continue; // go on to next allele if consensus insertion has any non-standard base.
}
ArrayList vcAlleles = new ArrayList<Allele>();
vcAlleles.add(refAllele);
vcAlleles.add(altAllele);
final VariantContextBuilder builder = new VariantContextBuilder().source("");
builder.loc(loc.getContig(), loc.getStart(), stop);
builder.alleles(vcAlleles);
builder.referenceBaseForIndel(ref.getBase());
builder.noGenotypes();
if (doMultiAllelicCalls) {
vcs.add(builder.make());
if (vcs.size() >= GenotypeLikelihoods.MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED)
break;
} else if (curCnt > maxAlleleCnt) {
maxAlleleCnt = curCnt;
vcs.clear();
vcs.add(builder.make());
}
}
if (vcs.isEmpty())
return aList; // nothing else to do, no alleles passed minimum count criterion
VariantContext mergedVC = VariantContextUtils.simpleMerge(locParser, vcs, null, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, VariantContextUtils.GenotypeMergeType.UNSORTED, false, false, null, false, false);
aList = new ArrayList<Allele>(mergedVC.getAlleles());
return aList;
/**
 * Computes the consensus indel alleles at the current locus by delegating to a
 * freshly constructed {@code ConsensusAlleleCounter}.
 *
 * The counter is configured from {@code UAC.MIN_INDEL_COUNT_FOR_GENOTYPING} and
 * {@code UAC.MIN_INDEL_FRACTION_PER_SAMPLE}.
 *
 * @param ref         reference context for the locus
 * @param contexts    per-sample alignment contexts, keyed by a String (presumably the sample name -- confirm)
 * @param contextType read-orientation stratification forwarded to the counter
 * @param locParser   genome location parser handed to the counter's constructor
 * @return the allele list produced by {@code ConsensusAlleleCounter.computeConsensusAlleles}
 */
protected List<Allele> computeConsensusAlleles(ReferenceContext ref,
                                               Map<String, AlignmentContext> contexts,
                                               AlignmentContextUtils.ReadOrientation contextType,
                                               GenomeLocParser locParser) {
    // NOTE(review): the literal `true` is the counter's second constructor argument; its meaning
    // is not visible here (possibly the multi-allelic switch) -- confirm against ConsensusAlleleCounter.
    return new ConsensusAlleleCounter(locParser,
                                      true,
                                      UAC.MIN_INDEL_COUNT_FOR_GENOTYPING,
                                      UAC.MIN_INDEL_FRACTION_PER_SAMPLE)
            .computeConsensusAlleles(ref, contexts, contextType);
}
private final static EnumSet<VariantContext.Type> allowableTypes = EnumSet.of(VariantContext.Type.INDEL, VariantContext.Type.MIXED);
@ -289,7 +95,6 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
final ReferenceContext ref,
final Map<String, AlignmentContext> contexts,
final AlignmentContextUtils.ReadOrientation contextType,
final GenotypePriors priors,
final List<Allele> alternateAllelesToUse,
final boolean useBAQedPileup,
final GenomeLocParser locParser) {
@ -348,8 +153,6 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
// check if there is enough reference window to create haplotypes (can be an issue at end of contigs)
if (ref.getWindow().getStop() < loc.getStop() + HAPLOTYPE_SIZE)
return null;
if (!(priors instanceof DiploidIndelGenotypePriors))
throw new StingException("Only diploid-based Indel priors are supported in the INDEL GL model");
if (alleleList.isEmpty())
return null;
@ -370,7 +173,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
}
final int eventLength = altAllele.getBaseString().length() - refAllele.getBaseString().length();
final int hsize = (int) ref.getWindow().size() - Math.abs(eventLength) - 1;
final int hsize = ref.getWindow().size() - Math.abs(eventLength) - 1;
final int numPrefBases = ref.getLocus().getStart() - ref.getWindow().getStart() + 1;
if (hsize <= 0) {
@ -395,26 +198,23 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
for (Map.Entry<String, AlignmentContext> sample : contexts.entrySet()) {
AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType);
ReadBackedPileup pileup = null;
if (context.hasExtendedEventPileup())
pileup = context.getExtendedEventPileup();
else if (context.hasBasePileup())
pileup = context.getBasePileup();
if (context.hasBasePileup()) {
final ReadBackedPileup pileup = context.getBasePileup();
if (pileup != null) {
final double[] genotypeLikelihoods = pairModel.computeReadHaplotypeLikelihoods(pileup, haplotypeMap, ref, eventLength, getIndelLikelihoodMap());
GenotypeLikelihoods likelihoods = GenotypeLikelihoods.fromLog10Likelihoods(genotypeLikelihoods);
if (pileup != null) {
final double[] genotypeLikelihoods = pairModel.computeReadHaplotypeLikelihoods(pileup, haplotypeMap, ref, eventLength, getIndelLikelihoodMap());
GenotypeLikelihoods likelihoods = GenotypeLikelihoods.fromLog10Likelihoods(genotypeLikelihoods);
HashMap<String, Object> attributes = new HashMap<String, Object>();
attributes.put(VCFConstants.DEPTH_KEY, getFilteredDepth(pileup));
attributes.put(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, likelihoods);
genotypes.add(new Genotype(sample.getKey(), noCall, Genotype.NO_LOG10_PERROR, null, attributes, false));
HashMap<String, Object> attributes = new HashMap<String, Object>();
attributes.put(VCFConstants.DEPTH_KEY, getFilteredDepth(pileup));
attributes.put(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, likelihoods);
genotypes.add(new Genotype(sample.getKey(), noCall, Genotype.NO_LOG10_PERROR, null, attributes, false));
if (DEBUG) {
System.out.format("Sample:%s Alleles:%s GL:", sample.getKey(), alleleList.toString());
for (int k = 0; k < genotypeLikelihoods.length; k++)
System.out.format("%1.4f ", genotypeLikelihoods[k]);
System.out.println();
if (DEBUG) {
System.out.format("Sample:%s Alleles:%s GL:", sample.getKey(), alleleList.toString());
for (int k = 0; k < genotypeLikelihoods.length; k++)
System.out.format("%1.4f ", genotypeLikelihoods[k]);
System.out.println();
}
}
}
}

Some files were not shown because too many files have changed in this diff Show More