Bugfix for minBaseQuality to ignore deletion reads. LocusMismatch walker now allows us to skip every nths eligable site

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2357 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2009-12-15 14:38:39 +00:00
parent bf7bab754e
commit 0d2a761460
3 changed files with 87 additions and 44 deletions

View File

@ -1,4 +1,4 @@
package org.broadinstitute.sting.gatk.walkers.coverage;
package org.broadinstitute.sting.playground.gatk.walkers;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.gatk.walkers.genotyper.*;
@ -31,6 +31,9 @@ public class LocusMismatchWalker extends LocusWalker<String,Integer> implements
@Argument(fullName="minMismatches", doc = "Minimum number of mismatches at a locus before a site is displayed", required = false)
int minMismatches = 1;
@Argument(fullName="skip", doc = "Only display every skip eligable sites. Defaults to all sites", required = false)
int skip = 1;
private UnifiedGenotyper ug;
public void initialize() {
@ -54,9 +57,12 @@ public class LocusMismatchWalker extends LocusWalker<String,Integer> implements
}
public Integer reduce( String map, Integer reduce ) {
if ( map != null )
if ( map != null && (reduce % skip == 0) )
out.println(map);
return reduce;
//if (reduce % skip == 0) System.out.printf("Keeping %d%n", reduce);
return reduce + (map != null ? 1 : 0);
}
public Integer treeReduce( Integer reduce1, Integer reduce2 ) {
@ -65,7 +71,7 @@ public class LocusMismatchWalker extends LocusWalker<String,Integer> implements
public Integer reduceInit() {
out.printf("loc ref depth nMM qSumMM A C G T%n");
return null;
return 1;
}
private String errorCounts( ReferenceContext ref, ReadBackedPileup pileup ) {

View File

@ -169,7 +169,7 @@ public class ReadBackedPileup implements Iterable<PileupElement> {
ArrayList<PileupElement> filteredPileup = new ArrayList<PileupElement>();
for ( PileupElement p : pileup ) {
if ( p.getRead().getMappingQuality() >= minMapQ && p.getQual() >= minBaseQ ) {
if ( p.getRead().getMappingQuality() >= minMapQ && (p.isDeletion() || p.getQual() >= minBaseQ) ) {
filteredPileup.add(p);
}
}

View File

@ -3,34 +3,58 @@ import sys
from optparse import OptionParser
from vcfReader import *
#import pylab
import operator
from itertools import *
import math
import random
DEBUG = False
class CallCovariate:
def __init__(self, feature, left, right, FPRate = None, cumulative = False):
self.feature = feature
class Range:
ANY = '*'
def __init__(self, left = ANY, right = ANY, leftOpen = False, rightOpen = False):
self.left = left
self.right = right
self.leftOpen = leftOpen
self.rightOpen = rightOpen
def __str__(self):
leftB, rightB = '[', ']'
if self.leftOpen: leftB = '('
if self.rightOpen: rightB = ')'
return '%s%s, %s%s' % (leftB, self.left, self.right, rightB)
__repr__ = __str__
def dashedString(self):
return str(self).replace(", ", "-")
def contains(self, v):
def test(r, op, open):
return r == Range.ANY or op(v, r) or (not open and v == r)
return test(self.left, operator.__gt__, self.leftOpen) and test(self.right, operator.__lt__, self.rightOpen)
if cumulative:
self.right = '*'
else:
self.right = right
class CallCovariate:
def __init__(self, feature, featureRange, qualRange, FPRate = None):
self.feature = feature
self.featureRange = featureRange
self.qualRange = qualRange
self.FPRate = FPRate
def containsVariant(self, call):
fieldVal = call.getField(self.feature)
return fieldVal >= self.left and (self.right == '*' or fieldVal <= self.right)
inFeature = self.featureRange.contains(call.getField(self.feature))
inQual = self.qualRange.contains(call.getQual())
#print 'inFeature, inQual', inFeature, inQual
return inFeature and inQual
def getFPRate(self): return self.FPRate
def getFeature(self): return self.feature
def getCovariateField(self): return self.getFeature() + '_RQ'
def __str__(self): return "[CC feature=%s left=%s right=%s]" % (self.feature, self.left, self.right)
def __str__(self): return "[CC feature=%s range=%s qualRange=%s]" % (self.feature, self.featureRange, self.qualRange)
class RecalibratedCall:
def __init__(self, call, features):
@ -110,7 +134,7 @@ def titv(variants):
def dbSNPRate(variants):
inDBSNP = len(filter(VCFRecord.isKnown, variants))
return float(inDBSNP) / len(variants)
return float(inDBSNP) / max(len(variants),1)
def gaussian(x, mu, sigma):
constant = 1 / math.sqrt(2 * math.pi * sigma**2)
@ -257,11 +281,11 @@ def printFieldQualHeader():
more = CallCmp.HEADER
def p(stream):
if stream <> None:
print >> stream, ' %20s %20s left right nVariants nNovels titv titvNovels dbSNP FPEstimate Q' % ("category", "field"), more
print >> stream, ' %20s %20s left right %15s nVariants nNovels titv titvNovels dbSNP Q' % ("category", "field", "qRange"), more
p(sys.stdout)
p(RECAL_LOG)
def printFieldQual( category, field, left, right, variants, FPRate ):
def printFieldQual( category, field, cc, variants ):
more = ""
if TRUTH_CALLS <> None:
callComparison, theseFPs = sensitivitySpecificity(variants, TRUTH_CALLS)
@ -269,7 +293,7 @@ def printFieldQual( category, field, left, right, variants, FPRate ):
novels = selectVariants(variants, VCFRecord.isNovel)
def p(stream):
if stream <> None:
print >> stream, ' %20s %20s %s %8d %8d %.2f %.2f %.2f %.2e %3d' % (category, field, binString(field, left, right), len(variants), len(novels), titv(variants), titv(novels), dbSNPRate(variants), FPRate, phredScale(FPRate)), more
print >> stream, ' %20s %20s %15s %15s %8d %8d %2.2f %2.2f %3.2f %3d' % (category, field, binString(field, cc.featureRange), cc.qualRange.dashedString(), len(variants), len(novels), titv(variants), titv(novels), dbSNPRate(variants) * 100, phredScale(cc.FPRate)), more
p(sys.stdout)
p(RECAL_LOG)
@ -286,8 +310,9 @@ def captureFieldRangeForPrinting(field, sortedValues):
def isNumber(x):
return isinstance(x, (int, long, float))
def binString(field, left, right):
def binString(field, cc):
epsilon = 1e-2
left, right = cc.left, cc.right
leftStr = str(left)
rightStr = "%5s" % str(right)
if OPTIONS.plottableNones and not isNumber(left) and not isNumber(right) and field in FIELD_RANGES:
@ -325,7 +350,7 @@ def recalibrateCalls(variants, fields, callCovariates):
#
#
def optimizeCalls(variants, fields, titvTarget):
callCovariates = calibrateFeatures(variants, fields, titvTarget, category = "covariates")
callCovariates = calibrateFeatures(variants, fields, titvTarget, category = "covariates", useBreaks=True)
recalCalls = recalibrateCalls(variants, fields, callCovariates)
return recalCalls, callCovariates
@ -341,26 +366,33 @@ def all( p, l ):
if not p(elt): return False
return True
def variantBinsForField(variants, field):
#if not all( lambda x: x.hasField(field), variants):
# raise Exception('Unknown field ' + field)
minValue, maxValue, bins = fieldRange(variants, field)
if DEBUG: print 'Field range', minValue, maxValue
if DEBUG: print 'Partitions', bins
return bins
def mapVariantBins(variants, field, cumulative = False):
bins = variantBinsForField(variants, field)
def variantsInBin(bin):
cc = CallCovariate(field, bin[0], bin[1], cumulative = cumulative)
def mapVariantBins(variants, field, cumulative = False, breakQuals = [Range()]):
minValue, maxValue, featureBins = fieldRange(variants, field)
return cc.left, cc.right, selectVariants(variants, lambda v: cc.containsVariant(v))
#print 'BREAKQuals', breakQuals[0]
bins = [(x,y) for x in featureBins for y in breakQuals]
#print 'BINS', bins
def variantsInBin(featureBin, qualRange):
right = featureBin[1]
if cumulative:
right = Range.ANY
cc = CallCovariate(field, Range(featureBin[0], right), qualRange)
return cc, selectVariants(variants, lambda v: cc.containsVariant(v))
return imap( variantsInBin, bins )
#sys.exit(1)
return starmap( variantsInBin, bins )
def calibrateFeatures(variants, fields, titvTarget, printCall = False, cumulative = False, forcePrint = False, prefix = '', printHeader = True, category = None ):
def qBreaksRanges(qBreaks, useBreaks):
if qBreaks == None or not useBreaks:
return [Range()] # include everything in a single range
else:
breaks = map(float, qBreaks.split(','))
return map(lambda x, y: Range(x,y, rightOpen = True), chain([Range.ANY], breaks), chain(breaks, [Range.ANY]))
def calibrateFeatures(variants, fields, titvTarget, printCall = False, cumulative = False, forcePrint = False, prefix = '', printHeader = True, category = None, useBreaks = False ):
covariates = []
if printHeader: printFieldQualHeader()
@ -370,14 +402,16 @@ def calibrateFeatures(variants, fields, titvTarget, printCall = False, cumulativ
titv, FPRate = titvFPRateEstimate(variants, titvTarget)
#print 'Overall FRRate:', FPRate, nErrors, phredScale(FPRate)
for left, right, selectedVariants in mapVariantBins(variants, field, cumulative = cumulative):
for cc, selectedVariants in mapVariantBins(variants, field, cumulative = cumulative, breakQuals = qBreaksRanges(OPTIONS.QBreaks, useBreaks and field <> 'QUAL')):
#print 'CC', cc, field, useBreaks
if len(selectedVariants) > max(OPTIONS.minVariantsPerBin,1) or forcePrint:
titv, FPRate = titvFPRateEstimate(selectedVariants, titvTarget)
dbsnp = dbSNPRate(selectedVariants)
covariates.append(CallCovariate(field, left, right, FPRate))
printFieldQual( category, prefix + field, left, right, selectedVariants, FPRate )
#dbsnp = dbSNPRate(selectedVariants)
cc.FPRate = FPRate
covariates.append(cc)
printFieldQual( category, prefix + field, cc, selectedVariants )
else:
print 'Not calibrating bin', left, right, 'because it contains too few variants:', len(selectedVariants)
print 'Not calibrating bin', cc, 'because it contains too few variants:', len(selectedVariants)
return covariates
@ -462,9 +496,9 @@ def compareCalls(calls, truthCalls):
def compare1(name, cumulative):
for field in ["QUAL", "OQ"]:
for left, right, selectedVariants in mapVariantBins(calls, field, cumulative = cumulative):
for cc, selectedVariants in mapVariantBins(calls, field, cumulative = cumulative):
#print selectedVariants[0]
printFieldQual("truth-comparison-" + name, field, left, right, selectedVariants, dephredScale(left))
printFieldQual("truth-comparison-" + name, field, cc, selectedVariants )
print 'PER BIN nCalls=', len(calls)
compare1('per-bin', False)
@ -519,6 +553,9 @@ def setup():
parser.add_option("-Q", "--qMin", dest="minQScore",
type='int', default=-1,
help="The minimum Q score of the initial SNP list to consider for selection [default: %default]")
parser.add_option("", "--QBreaks", dest="QBreaks",
type='string', default=None,
help="Breaks in QUAL for generating covarites [default: %default]")
parser.add_option("-q", "--qMax", dest="maxQScore",
type='int', default=60,
help="The maximum Q score allowed for both a single covariate and the overall QUAL score [default: %default]")