Bugfix for minBaseQuality filtering: the base-quality check is no longer applied to deletion reads. The LocusMismatch walker now allows us to display only every nth eligible site.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2357 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
bf7bab754e
commit
0d2a761460
|
|
@ -1,4 +1,4 @@
|
||||||
package org.broadinstitute.sting.gatk.walkers.coverage;
|
package org.broadinstitute.sting.playground.gatk.walkers;
|
||||||
|
|
||||||
import org.broadinstitute.sting.gatk.walkers.*;
|
import org.broadinstitute.sting.gatk.walkers.*;
|
||||||
import org.broadinstitute.sting.gatk.walkers.genotyper.*;
|
import org.broadinstitute.sting.gatk.walkers.genotyper.*;
|
||||||
|
|
@ -31,6 +31,9 @@ public class LocusMismatchWalker extends LocusWalker<String,Integer> implements
|
||||||
@Argument(fullName="minMismatches", doc = "Minimum number of mismatches at a locus before a site is displayed", required = false)
|
@Argument(fullName="minMismatches", doc = "Minimum number of mismatches at a locus before a site is displayed", required = false)
|
||||||
int minMismatches = 1;
|
int minMismatches = 1;
|
||||||
|
|
||||||
|
@Argument(fullName="skip", doc = "Only display every skip eligable sites. Defaults to all sites", required = false)
|
||||||
|
int skip = 1;
|
||||||
|
|
||||||
private UnifiedGenotyper ug;
|
private UnifiedGenotyper ug;
|
||||||
|
|
||||||
public void initialize() {
|
public void initialize() {
|
||||||
|
|
@ -54,9 +57,12 @@ public class LocusMismatchWalker extends LocusWalker<String,Integer> implements
|
||||||
}
|
}
|
||||||
|
|
||||||
public Integer reduce( String map, Integer reduce ) {
|
public Integer reduce( String map, Integer reduce ) {
|
||||||
if ( map != null )
|
if ( map != null && (reduce % skip == 0) )
|
||||||
out.println(map);
|
out.println(map);
|
||||||
return reduce;
|
|
||||||
|
//if (reduce % skip == 0) System.out.printf("Keeping %d%n", reduce);
|
||||||
|
|
||||||
|
return reduce + (map != null ? 1 : 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
public Integer treeReduce( Integer reduce1, Integer reduce2 ) {
|
public Integer treeReduce( Integer reduce1, Integer reduce2 ) {
|
||||||
|
|
@ -65,7 +71,7 @@ public class LocusMismatchWalker extends LocusWalker<String,Integer> implements
|
||||||
|
|
||||||
public Integer reduceInit() {
|
public Integer reduceInit() {
|
||||||
out.printf("loc ref depth nMM qSumMM A C G T%n");
|
out.printf("loc ref depth nMM qSumMM A C G T%n");
|
||||||
return null;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
private String errorCounts( ReferenceContext ref, ReadBackedPileup pileup ) {
|
private String errorCounts( ReferenceContext ref, ReadBackedPileup pileup ) {
|
||||||
|
|
|
||||||
|
|
@ -169,7 +169,7 @@ public class ReadBackedPileup implements Iterable<PileupElement> {
|
||||||
ArrayList<PileupElement> filteredPileup = new ArrayList<PileupElement>();
|
ArrayList<PileupElement> filteredPileup = new ArrayList<PileupElement>();
|
||||||
|
|
||||||
for ( PileupElement p : pileup ) {
|
for ( PileupElement p : pileup ) {
|
||||||
if ( p.getRead().getMappingQuality() >= minMapQ && p.getQual() >= minBaseQ ) {
|
if ( p.getRead().getMappingQuality() >= minMapQ && (p.isDeletion() || p.getQual() >= minBaseQ) ) {
|
||||||
filteredPileup.add(p);
|
filteredPileup.add(p);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -3,34 +3,58 @@ import sys
|
||||||
from optparse import OptionParser
|
from optparse import OptionParser
|
||||||
from vcfReader import *
|
from vcfReader import *
|
||||||
#import pylab
|
#import pylab
|
||||||
|
import operator
|
||||||
from itertools import *
|
from itertools import *
|
||||||
import math
|
import math
|
||||||
import random
|
import random
|
||||||
|
|
||||||
DEBUG = False
|
DEBUG = False
|
||||||
|
|
||||||
class CallCovariate:
|
class Range:
|
||||||
def __init__(self, feature, left, right, FPRate = None, cumulative = False):
|
ANY = '*'
|
||||||
self.feature = feature
|
def __init__(self, left = ANY, right = ANY, leftOpen = False, rightOpen = False):
|
||||||
self.left = left
|
self.left = left
|
||||||
|
self.right = right
|
||||||
|
self.leftOpen = leftOpen
|
||||||
|
self.rightOpen = rightOpen
|
||||||
|
|
||||||
|
def __str__(self):
|
||||||
|
leftB, rightB = '[', ']'
|
||||||
|
if self.leftOpen: leftB = '('
|
||||||
|
if self.rightOpen: rightB = ')'
|
||||||
|
return '%s%s, %s%s' % (leftB, self.left, self.right, rightB)
|
||||||
|
|
||||||
|
__repr__ = __str__
|
||||||
|
|
||||||
|
def dashedString(self):
|
||||||
|
return str(self).replace(", ", "-")
|
||||||
|
|
||||||
|
def contains(self, v):
|
||||||
|
def test(r, op, open):
|
||||||
|
return r == Range.ANY or op(v, r) or (not open and v == r)
|
||||||
|
return test(self.left, operator.__gt__, self.leftOpen) and test(self.right, operator.__lt__, self.rightOpen)
|
||||||
|
|
||||||
if cumulative:
|
class CallCovariate:
|
||||||
self.right = '*'
|
def __init__(self, feature, featureRange, qualRange, FPRate = None):
|
||||||
else:
|
self.feature = feature
|
||||||
self.right = right
|
|
||||||
|
self.featureRange = featureRange
|
||||||
|
|
||||||
|
self.qualRange = qualRange
|
||||||
self.FPRate = FPRate
|
self.FPRate = FPRate
|
||||||
|
|
||||||
def containsVariant(self, call):
|
def containsVariant(self, call):
|
||||||
fieldVal = call.getField(self.feature)
|
inFeature = self.featureRange.contains(call.getField(self.feature))
|
||||||
return fieldVal >= self.left and (self.right == '*' or fieldVal <= self.right)
|
inQual = self.qualRange.contains(call.getQual())
|
||||||
|
#print 'inFeature, inQual', inFeature, inQual
|
||||||
|
return inFeature and inQual
|
||||||
|
|
||||||
def getFPRate(self): return self.FPRate
|
def getFPRate(self): return self.FPRate
|
||||||
def getFeature(self): return self.feature
|
def getFeature(self): return self.feature
|
||||||
|
|
||||||
def getCovariateField(self): return self.getFeature() + '_RQ'
|
def getCovariateField(self): return self.getFeature() + '_RQ'
|
||||||
|
|
||||||
def __str__(self): return "[CC feature=%s left=%s right=%s]" % (self.feature, self.left, self.right)
|
def __str__(self): return "[CC feature=%s range=%s qualRange=%s]" % (self.feature, self.featureRange, self.qualRange)
|
||||||
|
|
||||||
class RecalibratedCall:
|
class RecalibratedCall:
|
||||||
def __init__(self, call, features):
|
def __init__(self, call, features):
|
||||||
|
|
@ -110,7 +134,7 @@ def titv(variants):
|
||||||
|
|
||||||
def dbSNPRate(variants):
|
def dbSNPRate(variants):
|
||||||
inDBSNP = len(filter(VCFRecord.isKnown, variants))
|
inDBSNP = len(filter(VCFRecord.isKnown, variants))
|
||||||
return float(inDBSNP) / len(variants)
|
return float(inDBSNP) / max(len(variants),1)
|
||||||
|
|
||||||
def gaussian(x, mu, sigma):
|
def gaussian(x, mu, sigma):
|
||||||
constant = 1 / math.sqrt(2 * math.pi * sigma**2)
|
constant = 1 / math.sqrt(2 * math.pi * sigma**2)
|
||||||
|
|
@ -257,11 +281,11 @@ def printFieldQualHeader():
|
||||||
more = CallCmp.HEADER
|
more = CallCmp.HEADER
|
||||||
def p(stream):
|
def p(stream):
|
||||||
if stream <> None:
|
if stream <> None:
|
||||||
print >> stream, ' %20s %20s left right nVariants nNovels titv titvNovels dbSNP FPEstimate Q' % ("category", "field"), more
|
print >> stream, ' %20s %20s left right %15s nVariants nNovels titv titvNovels dbSNP Q' % ("category", "field", "qRange"), more
|
||||||
p(sys.stdout)
|
p(sys.stdout)
|
||||||
p(RECAL_LOG)
|
p(RECAL_LOG)
|
||||||
|
|
||||||
def printFieldQual( category, field, left, right, variants, FPRate ):
|
def printFieldQual( category, field, cc, variants ):
|
||||||
more = ""
|
more = ""
|
||||||
if TRUTH_CALLS <> None:
|
if TRUTH_CALLS <> None:
|
||||||
callComparison, theseFPs = sensitivitySpecificity(variants, TRUTH_CALLS)
|
callComparison, theseFPs = sensitivitySpecificity(variants, TRUTH_CALLS)
|
||||||
|
|
@ -269,7 +293,7 @@ def printFieldQual( category, field, left, right, variants, FPRate ):
|
||||||
novels = selectVariants(variants, VCFRecord.isNovel)
|
novels = selectVariants(variants, VCFRecord.isNovel)
|
||||||
def p(stream):
|
def p(stream):
|
||||||
if stream <> None:
|
if stream <> None:
|
||||||
print >> stream, ' %20s %20s %s %8d %8d %.2f %.2f %.2f %.2e %3d' % (category, field, binString(field, left, right), len(variants), len(novels), titv(variants), titv(novels), dbSNPRate(variants), FPRate, phredScale(FPRate)), more
|
print >> stream, ' %20s %20s %15s %15s %8d %8d %2.2f %2.2f %3.2f %3d' % (category, field, binString(field, cc.featureRange), cc.qualRange.dashedString(), len(variants), len(novels), titv(variants), titv(novels), dbSNPRate(variants) * 100, phredScale(cc.FPRate)), more
|
||||||
p(sys.stdout)
|
p(sys.stdout)
|
||||||
p(RECAL_LOG)
|
p(RECAL_LOG)
|
||||||
|
|
||||||
|
|
@ -286,8 +310,9 @@ def captureFieldRangeForPrinting(field, sortedValues):
|
||||||
def isNumber(x):
|
def isNumber(x):
|
||||||
return isinstance(x, (int, long, float))
|
return isinstance(x, (int, long, float))
|
||||||
|
|
||||||
def binString(field, left, right):
|
def binString(field, cc):
|
||||||
epsilon = 1e-2
|
epsilon = 1e-2
|
||||||
|
left, right = cc.left, cc.right
|
||||||
leftStr = str(left)
|
leftStr = str(left)
|
||||||
rightStr = "%5s" % str(right)
|
rightStr = "%5s" % str(right)
|
||||||
if OPTIONS.plottableNones and not isNumber(left) and not isNumber(right) and field in FIELD_RANGES:
|
if OPTIONS.plottableNones and not isNumber(left) and not isNumber(right) and field in FIELD_RANGES:
|
||||||
|
|
@ -325,7 +350,7 @@ def recalibrateCalls(variants, fields, callCovariates):
|
||||||
#
|
#
|
||||||
#
|
#
|
||||||
def optimizeCalls(variants, fields, titvTarget):
|
def optimizeCalls(variants, fields, titvTarget):
|
||||||
callCovariates = calibrateFeatures(variants, fields, titvTarget, category = "covariates")
|
callCovariates = calibrateFeatures(variants, fields, titvTarget, category = "covariates", useBreaks=True)
|
||||||
recalCalls = recalibrateCalls(variants, fields, callCovariates)
|
recalCalls = recalibrateCalls(variants, fields, callCovariates)
|
||||||
return recalCalls, callCovariates
|
return recalCalls, callCovariates
|
||||||
|
|
||||||
|
|
@ -341,26 +366,33 @@ def all( p, l ):
|
||||||
if not p(elt): return False
|
if not p(elt): return False
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def variantBinsForField(variants, field):
|
|
||||||
#if not all( lambda x: x.hasField(field), variants):
|
|
||||||
# raise Exception('Unknown field ' + field)
|
|
||||||
|
|
||||||
minValue, maxValue, bins = fieldRange(variants, field)
|
|
||||||
if DEBUG: print 'Field range', minValue, maxValue
|
|
||||||
if DEBUG: print 'Partitions', bins
|
|
||||||
return bins
|
|
||||||
|
|
||||||
def mapVariantBins(variants, field, cumulative = False):
|
def mapVariantBins(variants, field, cumulative = False, breakQuals = [Range()]):
|
||||||
bins = variantBinsForField(variants, field)
|
minValue, maxValue, featureBins = fieldRange(variants, field)
|
||||||
|
|
||||||
def variantsInBin(bin):
|
|
||||||
cc = CallCovariate(field, bin[0], bin[1], cumulative = cumulative)
|
|
||||||
|
|
||||||
return cc.left, cc.right, selectVariants(variants, lambda v: cc.containsVariant(v))
|
#print 'BREAKQuals', breakQuals[0]
|
||||||
|
bins = [(x,y) for x in featureBins for y in breakQuals]
|
||||||
|
#print 'BINS', bins
|
||||||
|
|
||||||
|
def variantsInBin(featureBin, qualRange):
|
||||||
|
right = featureBin[1]
|
||||||
|
if cumulative:
|
||||||
|
right = Range.ANY
|
||||||
|
cc = CallCovariate(field, Range(featureBin[0], right), qualRange)
|
||||||
|
|
||||||
|
return cc, selectVariants(variants, lambda v: cc.containsVariant(v))
|
||||||
|
|
||||||
return imap( variantsInBin, bins )
|
#sys.exit(1)
|
||||||
|
return starmap( variantsInBin, bins )
|
||||||
|
|
||||||
def calibrateFeatures(variants, fields, titvTarget, printCall = False, cumulative = False, forcePrint = False, prefix = '', printHeader = True, category = None ):
|
def qBreaksRanges(qBreaks, useBreaks):
|
||||||
|
if qBreaks == None or not useBreaks:
|
||||||
|
return [Range()] # include everything in a single range
|
||||||
|
else:
|
||||||
|
breaks = map(float, qBreaks.split(','))
|
||||||
|
return map(lambda x, y: Range(x,y, rightOpen = True), chain([Range.ANY], breaks), chain(breaks, [Range.ANY]))
|
||||||
|
|
||||||
|
def calibrateFeatures(variants, fields, titvTarget, printCall = False, cumulative = False, forcePrint = False, prefix = '', printHeader = True, category = None, useBreaks = False ):
|
||||||
covariates = []
|
covariates = []
|
||||||
|
|
||||||
if printHeader: printFieldQualHeader()
|
if printHeader: printFieldQualHeader()
|
||||||
|
|
@ -370,14 +402,16 @@ def calibrateFeatures(variants, fields, titvTarget, printCall = False, cumulativ
|
||||||
titv, FPRate = titvFPRateEstimate(variants, titvTarget)
|
titv, FPRate = titvFPRateEstimate(variants, titvTarget)
|
||||||
#print 'Overall FRRate:', FPRate, nErrors, phredScale(FPRate)
|
#print 'Overall FRRate:', FPRate, nErrors, phredScale(FPRate)
|
||||||
|
|
||||||
for left, right, selectedVariants in mapVariantBins(variants, field, cumulative = cumulative):
|
for cc, selectedVariants in mapVariantBins(variants, field, cumulative = cumulative, breakQuals = qBreaksRanges(OPTIONS.QBreaks, useBreaks and field <> 'QUAL')):
|
||||||
|
#print 'CC', cc, field, useBreaks
|
||||||
if len(selectedVariants) > max(OPTIONS.minVariantsPerBin,1) or forcePrint:
|
if len(selectedVariants) > max(OPTIONS.minVariantsPerBin,1) or forcePrint:
|
||||||
titv, FPRate = titvFPRateEstimate(selectedVariants, titvTarget)
|
titv, FPRate = titvFPRateEstimate(selectedVariants, titvTarget)
|
||||||
dbsnp = dbSNPRate(selectedVariants)
|
#dbsnp = dbSNPRate(selectedVariants)
|
||||||
covariates.append(CallCovariate(field, left, right, FPRate))
|
cc.FPRate = FPRate
|
||||||
printFieldQual( category, prefix + field, left, right, selectedVariants, FPRate )
|
covariates.append(cc)
|
||||||
|
printFieldQual( category, prefix + field, cc, selectedVariants )
|
||||||
else:
|
else:
|
||||||
print 'Not calibrating bin', left, right, 'because it contains too few variants:', len(selectedVariants)
|
print 'Not calibrating bin', cc, 'because it contains too few variants:', len(selectedVariants)
|
||||||
|
|
||||||
return covariates
|
return covariates
|
||||||
|
|
||||||
|
|
@ -462,9 +496,9 @@ def compareCalls(calls, truthCalls):
|
||||||
|
|
||||||
def compare1(name, cumulative):
|
def compare1(name, cumulative):
|
||||||
for field in ["QUAL", "OQ"]:
|
for field in ["QUAL", "OQ"]:
|
||||||
for left, right, selectedVariants in mapVariantBins(calls, field, cumulative = cumulative):
|
for cc, selectedVariants in mapVariantBins(calls, field, cumulative = cumulative):
|
||||||
#print selectedVariants[0]
|
#print selectedVariants[0]
|
||||||
printFieldQual("truth-comparison-" + name, field, left, right, selectedVariants, dephredScale(left))
|
printFieldQual("truth-comparison-" + name, field, cc, selectedVariants )
|
||||||
|
|
||||||
print 'PER BIN nCalls=', len(calls)
|
print 'PER BIN nCalls=', len(calls)
|
||||||
compare1('per-bin', False)
|
compare1('per-bin', False)
|
||||||
|
|
@ -519,6 +553,9 @@ def setup():
|
||||||
parser.add_option("-Q", "--qMin", dest="minQScore",
|
parser.add_option("-Q", "--qMin", dest="minQScore",
|
||||||
type='int', default=-1,
|
type='int', default=-1,
|
||||||
help="The minimum Q score of the initial SNP list to consider for selection [default: %default]")
|
help="The minimum Q score of the initial SNP list to consider for selection [default: %default]")
|
||||||
|
parser.add_option("", "--QBreaks", dest="QBreaks",
|
||||||
|
type='string', default=None,
|
||||||
|
help="Breaks in QUAL for generating covarites [default: %default]")
|
||||||
parser.add_option("-q", "--qMax", dest="maxQScore",
|
parser.add_option("-q", "--qMax", dest="maxQScore",
|
||||||
type='int', default=60,
|
type='int', default=60,
|
||||||
help="The maximum Q score allowed for both a single covariate and the overall QUAL score [default: %default]")
|
help="The maximum Q score allowed for both a single covariate and the overall QUAL score [default: %default]")
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue