From 23dbc5ccf306763edeee9132130bc90f33b816c7 Mon Sep 17 00:00:00 2001 From: rpoplin Date: Wed, 5 Jan 2011 00:28:05 +0000 Subject: [PATCH] HaplotypeScore is revamped. It now uses reads' Cigar strings when building the haplotype blocks to skip over soft-clipped bases and factor in insertions and deletions. The statistic now uses only the reads from the filtered context to build the haplotypes but it scores all reads against the two best haplotypes. The score is now computed individually for each sample's reads and then averaged together. Bug fixes throughout. The math for the base quality and mapping quality rank sum tests is fixed. The annotations remain as ExperimentalAnnotations pending more investigation. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4934 348d0f76-0448-11de-a6fe-93d51630548a --- .../annotator/BaseQualityRankSumTest.java | 8 +- .../walkers/annotator/HaplotypeScore.java | 264 +++++++++--------- .../annotator/MappingQualityRankSumTest.java | 7 +- .../gatk/walkers/annotator/RankSumTest.java | 54 ++-- .../broadinstitute/sting/utils/MathUtils.java | 16 ++ .../sting/utils/WilcoxonRankSum.java | 12 +- .../sting/utils/genotype/Haplotype.java | 8 + .../sting/utils/pileup/PileupElement.java | 6 +- .../sting/utils/sam/AlignmentUtils.java | 145 +++++++++- .../VariantAnnotatorIntegrationTest.java | 10 +- .../UnifiedGenotyperIntegrationTest.java | 32 +-- 11 files changed, 362 insertions(+), 200 deletions(-) diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java index c76082e3e..3eb40685f 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java @@ -9,8 +9,7 @@ import java.util.List; import java.util.Arrays; -public class BaseQualityRankSumTest /*extends RankSumTest*/ { - // todo -- seems math in this test is dubious, need to recheck and verify (p-values wildly divergent from R or MATLAB) +public class BaseQualityRankSumTest extends RankSumTest { public List getKeyNames() { return Arrays.asList("BaseQRankSum"); } public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("BaseQRankSum", 1, VCFHeaderLineType.Float, "Phred-scaled p-value From Wilcoxon Rank Sum Test of Het Vs. Ref Base Qualities")); } @@ -21,10 +20,11 @@ public class BaseQualityRankSumTest /*extends RankSumTest*/ { if ( p.isDeletion() ) continue; - if ( p.getBase() == ref ) + if ( p.getBase() == ref ) { refQuals.add((int)p.getQual()); - else if ( (char)p.getBase() == alt ) + } else if ( (char)p.getBase() == alt ) { altQuals.add((int)p.getQual()); + } } } } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java index 36093f129..ad2bcdbbb 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java @@ -24,6 +24,7 @@ package org.broadinstitute.sting.gatk.walkers.annotator; +import org.broad.tribble.util.variantcontext.Genotype; import org.broad.tribble.util.variantcontext.VariantContext; import org.broad.tribble.vcf.VCFHeaderLineType; import org.broad.tribble.vcf.VCFInfoHeaderLine; @@ -39,49 +40,51 @@ import org.broadinstitute.sting.utils.pileup.*; import java.util.*; import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.utils.sam.ReadUtils; +import org.broadinstitute.sting.utils.sam.AlignmentUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; public class HaplotypeScore implements InfoFieldAnnotation, StandardAnnotation { private final static boolean DEBUG = false; - private final static int MIN_CONTEXT_WING_SIZE = 10; - private final static int MAX_CONSENSUS_HAPLOTYPES_TO_CONSIDER = 20; + private final static int MIN_CONTEXT_WING_SIZE = 20; + private final static int MAX_CONSENSUS_HAPLOTYPES_TO_CONSIDER = 50; private final static char REGEXP_WILDCARD = '.'; public boolean useRead(PileupElement p) { - return ! ReadUtils.is454Read(p.getRead()); - // TODO -- warning, min. mapping quality fixed to constant 1 here for indels. How can I determine the min. mapping quality? - // if ( ReadUtils.is454Read(p.getRead()) ) // we're not a 454 read - // return false; - // else if ( p.getOffset() == -1 ) // indel - // return p.getRead().getMappingQuality() > 0; - // else if ( ! (p.getRead() instanceof GATKSAMRecord) ) // we're not a GATKSamRecord, so we can't filter - // return true; - // else - // return ((GATKSAMRecord)p.getRead()).isGoodBase(p.getOffset()); // we used the base to call the variant + //return true; // Use all reads + //return !ReadUtils.is454Read(p.getRead()); // Use all non-454 reads + return p.getOffset() == -1 || ((GATKSAMRecord)p.getRead()).isGoodBase(p.getOffset()); // Use all reads from the filtered context } public Map annotate(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { if ( !vc.isBiallelic() || !vc.isSNP() || stratifiedContexts.size() == 0 ) // size 0 means that call was made by someone else and we have no data here return null; + + final AlignmentContext context = StratifiedAlignmentContext.joinContexts(stratifiedContexts.values()); - AlignmentContext context = StratifiedAlignmentContext.joinContexts(stratifiedContexts.values()); + final int contextWingSize = Math.min(((int)ref.getWindow().size() - 1)/2, MIN_CONTEXT_WING_SIZE); + final int contextSize = contextWingSize * 2 + 1; - int contextWingSize = Math.min(((int)ref.getWindow().size() - 1)/2, MIN_CONTEXT_WING_SIZE); - int contextSize = contextWingSize * 2 + 1; + final int locus = ref.getLocus().getStart() + (ref.getLocus().getStop() - ref.getLocus().getStart()) / 2; // Compute all haplotypes consistent with the current read pileup - List haplotypes = computeHaplotypes(context.getBasePileup(), contextSize); + final List haplotypes = computeHaplotypes(context.getBasePileup(), contextSize, locus); - // calculate the haplotype scores by walking over all reads and comparing them to the haplotypes - double score = 0.0; + final MathUtils.RunningAverage scoreRA = new MathUtils.RunningAverage(); + if (haplotypes != null) { + final Set> genotypes = vc.getGenotypes().entrySet(); + for ( final Map.Entry genotype : genotypes ) { + final StratifiedAlignmentContext thisContext = stratifiedContexts.get(genotype.getKey()); + if ( thisContext != null ) { + double thisScore = scoreReadsAgainstHaplotypes(haplotypes, thisContext.getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).getBasePileup(), contextSize, locus); + scoreRA.add(thisScore); // Taking the simple average of all sample's score since the score can be negative and the RMS doesn't make sense + } + } + } - if (haplotypes != null) - score = scoreReadsAgainstHaplotypes(haplotypes, context.getBasePileup(), contextSize); - - // return the score - Map map = new HashMap(); - map.put(getKeyNames().get(0), String.format("%.2f", score)); + // annotate the score in the info field + final Map map = new HashMap(); + map.put(getKeyNames().get(0), String.format("%.4f", scoreRA.mean())); return map; } @@ -97,102 +100,89 @@ public class HaplotypeScore implements InfoFieldAnnotation, StandardAnnotation { } } - private List computeHaplotypes(ReadBackedPileup pileup, int contextSize) { + private List computeHaplotypes(final ReadBackedPileup pileup, final int contextSize, final int locus) { // Compute all possible haplotypes consistent with current pileup - ArrayList haplotypeList = new ArrayList(); - PriorityQueue haplotypeQueue = new PriorityQueue(100, new HaplotypeComparator()); + final PriorityQueue candidateHaplotypeQueue = new PriorityQueue(100, new HaplotypeComparator()); + final PriorityQueue consensusHaplotypeQueue = new PriorityQueue(MAX_CONSENSUS_HAPLOTYPES_TO_CONSIDER, new HaplotypeComparator()); - for ( PileupElement p : pileup ) { + for ( final PileupElement p : pileup ) { if ( useRead(p) ) { - Haplotype haplotypeFromRead = getHaplotypeFromRead(p, contextSize); - haplotypeQueue.add(haplotypeFromRead); + final Haplotype haplotypeFromRead = getHaplotypeFromRead(p, contextSize, locus); + candidateHaplotypeQueue.add(haplotypeFromRead); } } // Now that priority queue has been built with all reads at context, we need to merge and find possible segregating haplotypes Haplotype elem; - while ((elem = haplotypeQueue.poll()) != null) { - //System.out.print("element: "+elem.toString()); - //System.out.format(" SumQual = %f\n", elem.getQualitySum()); + while ((elem = candidateHaplotypeQueue.poll()) != null) { boolean foundHaplotypeMatch = false; - //Haplotype[] remainingHaplotypes = haplotypeQueue.toArray(new Haplotype[haplotypeQueue.size()]); - for ( Haplotype haplotypeFromList : haplotypeList ) { - - Haplotype consensusHaplotype = getConsensusHaplotype(elem, haplotypeFromList); - //System.out.format("-Checking consensus for %s:", haplotypeFromList.toString()); + Haplotype lastCheckedHaplotype = null; + for ( final Haplotype haplotypeFromList : consensusHaplotypeQueue ) { + final Haplotype consensusHaplotype = getConsensusHaplotype(elem, haplotypeFromList); if (consensusHaplotype != null) { - //System.out.format("--Consensus haplotype = %s, qual = %f\n", consensusHaplotype.toString(), consensusHaplotype.getQualitySum()); foundHaplotypeMatch = true; if (consensusHaplotype.getQualitySum() > haplotypeFromList.getQualitySum()) { - haplotypeList.remove(haplotypeFromList); - haplotypeList.add(consensusHaplotype); + consensusHaplotypeQueue.remove(haplotypeFromList); + consensusHaplotypeQueue.add(consensusHaplotype); } break; } - /* else { - System.out.println("no consensus found"); + else { + lastCheckedHaplotype = haplotypeFromList; } - */ } - if (!foundHaplotypeMatch && haplotypeList.size() < MAX_CONSENSUS_HAPLOTYPES_TO_CONSIDER) { - haplotypeList.add(elem); + if (!foundHaplotypeMatch && consensusHaplotypeQueue.size() < MAX_CONSENSUS_HAPLOTYPES_TO_CONSIDER) { + consensusHaplotypeQueue.add(elem); + } else if (!foundHaplotypeMatch && lastCheckedHaplotype != null && elem.getQualitySum() > lastCheckedHaplotype.getQualitySum() ) { + consensusHaplotypeQueue.remove(lastCheckedHaplotype); + consensusHaplotypeQueue.add(elem); } } - // Now retrieve two most popular haplotypes - // TODO - quick and dirty solution, could use better data structures to do this automatically - int bestIdx=0, secondBestIdx=0; - double bestIdxVal=-1.0, secondBestIdxVal = -1.0; - for (int k=0; k < haplotypeList.size(); k++) { - - double qualSum = haplotypeList.get(k).getQualitySum(); - if (qualSum >= bestIdxVal) { - secondBestIdx = bestIdx; - secondBestIdxVal = bestIdxVal; - bestIdx = k; - bestIdxVal = qualSum; - } - else if (qualSum >= secondBestIdxVal) { - // check if current is second best - secondBestIdx = k; - secondBestIdxVal = qualSum; - } - } - if (haplotypeList.size() > 0) { - Haplotype haplotypeR = haplotypeList.get(bestIdx); - Haplotype haplotypeA = haplotypeList.get(secondBestIdx); - //System.out.format("%d %d\n",bestIdx, secondBestIdx); - // Temp hack to match old implementation's scaling, TBD better behavior - - return Arrays.asList(new Haplotype(haplotypeR.getBasesAsBytes(), 60), new Haplotype(haplotypeA.getBasesAsBytes(), contextSize)); - } - else + // Now retrieve the two most popular haplotypes + if (consensusHaplotypeQueue.size() > 0) { + // Since the consensus haplotypes are in a quality-ordered priority queue, the two best haplotypes are just the first two in the queue + final Haplotype haplotype1 = consensusHaplotypeQueue.poll(); + Haplotype haplotype2 = consensusHaplotypeQueue.poll(); + if(haplotype2 == null ) { haplotype2 = haplotype1; } // Sometimes only the reference haplotype can be found + return Arrays.asList(new Haplotype(haplotype1.getBasesAsBytes(), 60), new Haplotype(haplotype2.getBasesAsBytes(), 20)); // These qual values aren't used for anything + } else { return null; + } } - private Haplotype getHaplotypeFromRead(PileupElement p, int contextSize) { - SAMRecord read = p.getRead(); + private Haplotype getHaplotypeFromRead(final PileupElement p, final int contextSize, final int locus) { + final SAMRecord read = p.getRead(); int readOffsetFromPileup = p.getOffset(); - int baseOffsetStart = readOffsetFromPileup - (contextSize - 1)/2; - byte[] haplotypeBases = new byte[contextSize]; + final byte[] haplotypeBases = new byte[contextSize]; for(int i=0; i < contextSize; i++) { haplotypeBases[i] = (byte)REGEXP_WILDCARD; } - double[] baseQualities = new double[contextSize]; - Arrays.fill(baseQualities,0.0); + final double[] baseQualities = new double[contextSize]; + Arrays.fill(baseQualities, 0.0); + + byte[] readBases = read.getReadBases(); + readBases = AlignmentUtils.readToAlignmentByteArray(p.getRead().getCigar(), readBases); // Adjust the read bases based on the Cigar string + byte[] readQuals = read.getBaseQualities(); + readQuals = AlignmentUtils.readToAlignmentByteArray(p.getRead().getCigar(), readQuals); // Shift the location of the qual scores based on the Cigar string + + readOffsetFromPileup = AlignmentUtils.calcAlignmentByteArrayOffset(p.getRead().getCigar(), readOffsetFromPileup, p.getRead().getAlignmentStart(), locus); + final int baseOffsetStart = readOffsetFromPileup - (contextSize - 1)/2; - final byte[] readBases = read.getReadBases(); - final byte[] readQuals = read.getBaseQualities(); for (int i = 0; i < contextSize; i++ ) { - int baseOffset = i + baseOffsetStart; - if ( baseOffset < 0 ) + final int baseOffset = i + baseOffsetStart; + if ( baseOffset < 0 ) { continue; - if ( baseOffset >= readBases.length ) + } + if ( baseOffset >= readBases.length ) { break; - + } + if( readQuals[baseOffset] == PileupElement.DELETION_BASE) { readQuals[baseOffset] = PileupElement.DELETION_QUAL; } + if( readBases[baseOffset] == BaseUtils.N ) { readBases[baseOffset] = (byte)REGEXP_WILDCARD; readQuals[baseOffset] = (byte) 4; } // N's shouldn't be treated as distinct bases + if( ((double)readQuals[baseOffset]) < 4.0 ) { readQuals[baseOffset] = (byte) 4; } // quals less than 4 are used as codes and don't have actual probabilistic meaning behind them haplotypeBases[i] = readBases[baseOffset]; baseQualities[i] = (double)readQuals[baseOffset]; } @@ -200,19 +190,20 @@ public class HaplotypeScore implements InfoFieldAnnotation, StandardAnnotation { return new Haplotype(haplotypeBases, baseQualities); } - private Haplotype getConsensusHaplotype(Haplotype haplotypeA, Haplotype haplotypeB) { + private Haplotype getConsensusHaplotype(final Haplotype haplotypeA, final Haplotype haplotypeB) { final byte[] a = haplotypeA.getBasesAsBytes(); final byte[] b = haplotypeB.getBasesAsBytes(); - if (a.length != b.length) + if (a.length != b.length) { throw new ReviewedStingException("Haplotypes a and b must be of same length"); + } byte chA, chB; - byte wc = (byte)REGEXP_WILDCARD; + final byte wc = (byte)REGEXP_WILDCARD; final int length = a.length; - byte[] consensusChars = new byte[length]; - double[] consensusQuals = new double[length]; + final byte[] consensusChars = new byte[length]; + final double[] consensusQuals = new double[length]; final double[] qualsA = haplotypeA.getQuals(); final double[] qualsB = haplotypeB.getQuals(); @@ -243,45 +234,34 @@ public class HaplotypeScore implements InfoFieldAnnotation, StandardAnnotation { return new Haplotype(consensusChars, consensusQuals); } + // calculate the haplotype scores by walking over all reads and comparing them to the haplotypes - private double scoreReadsAgainstHaplotypes(List haplotypes, ReadBackedPileup pileup, int contextSize) { + private double scoreReadsAgainstHaplotypes(final List haplotypes, final ReadBackedPileup pileup, final int contextSize, final int locus) { // if ( DEBUG ) System.out.printf("HAP1: %s%n", haplotypes.get(0)); -// if ( DEBUG ) System.out.printf("HAP1: %s%n", haplotypes.get(1)); +// if ( DEBUG ) System.out.printf("HAP2: %s%n", haplotypes.get(1)); - double[][] haplotypeScores = new double[pileup.size()][haplotypes.size()]; - int pileupOffset = 0; - for ( PileupElement p : pileup ) { - if ( useRead(p) ) { - SAMRecord read = p.getRead(); - int readOffsetFromPileup = p.getOffset(); - - if (ReadUtils.is454Read(read)) - continue; - - if ( DEBUG ) System.out.printf("--------------------------------------------- Read %s%n", read.getReadName()); - double m = 10000000; - for ( int i = 0; i < haplotypes.size(); i++ ) { - Haplotype haplotype = haplotypes.get(i); - int start = readOffsetFromPileup - (contextSize - 1)/2; - double score = scoreReadAgainstHaplotype(read, start, contextSize, haplotype); - haplotypeScores[pileupOffset][i] = score; - if ( DEBUG ) System.out.printf(" vs. haplotype %d = %f%n", i, score); - m = Math.min(score, m); - } - if ( DEBUG ) System.out.printf(" => best score was %f%n", m); + final ArrayList haplotypeScores = new ArrayList(); + for ( final PileupElement p : pileup ) { + // Score all the reads in the pileup, even the filtered ones + final double[] scores = new double[haplotypes.size()]; + for ( int i = 0; i < haplotypes.size(); i++ ) { + final Haplotype haplotype = haplotypes.get(i); + final double score = scoreReadAgainstHaplotype(p, contextSize, haplotype, locus); + scores[i] = score; + if ( DEBUG ) { System.out.printf(" vs. haplotype %d = %f%n", i, score); } } - pileupOffset++; // todo -- remove me + haplotypeScores.add(scores); } double overallScore = 0.0; - for ( double[] readHaplotypeScores : haplotypeScores ) { + for ( final double[] readHaplotypeScores : haplotypeScores ) { overallScore += MathUtils.arrayMin(readHaplotypeScores); } return overallScore; } - private double scoreReadAgainstHaplotype(SAMRecord read, int baseOffsetStart, int contextSize, Haplotype haplotype ) { + private double scoreReadAgainstHaplotype(final PileupElement p, final int contextSize, final Haplotype haplotype, final int locus ) { double expected = 0.0; double mismatches = 0.0; @@ -296,41 +276,49 @@ public class HaplotypeScore implements InfoFieldAnnotation, StandardAnnotation { // the chance that it is actually a mismatch is 1 - e, since any of the other 3 options would be a mismatch. // so the probability-weighted mismatch rate is sum_i ( matched ? e_i / 3 : 1 - e_i ) for i = 1 ... n final byte[] haplotypeBases = haplotype.getBasesAsBytes(); - final byte[] readBases = read.getReadBases(); - final byte[] readQuals = read.getBaseQualities(); + final SAMRecord read = p.getRead(); + byte[] readBases = read.getReadBases(); + + readBases = AlignmentUtils.readToAlignmentByteArray(p.getRead().getCigar(), readBases); // Adjust the read bases based on the Cigar string + byte[] readQuals = read.getBaseQualities(); + readQuals = AlignmentUtils.readToAlignmentByteArray(p.getRead().getCigar(), readQuals); // Shift the location of the qual scores based on the Cigar string + int readOffsetFromPileup = p.getOffset(); + readOffsetFromPileup = AlignmentUtils.calcAlignmentByteArrayOffset(p.getRead().getCigar(), readOffsetFromPileup, p.getRead().getAlignmentStart(), locus); + final int baseOffsetStart = readOffsetFromPileup - (contextSize - 1)/2; + for ( int i = 0; i < contextSize; i++ ) { - int baseOffset = i + baseOffsetStart; - if ( baseOffset < 0 ) + final int baseOffset = i + baseOffsetStart; + if ( baseOffset < 0 ) { continue; - if ( baseOffset >= readBases.length ) + } + if ( baseOffset >= readBases.length ) { break; + } - byte haplotypeBase = haplotypeBases[i]; - byte readBase = readBases[baseOffset]; + final byte haplotypeBase = haplotypeBases[i]; + final byte readBase = readBases[baseOffset]; - boolean matched = readBase == haplotypeBase; - double e = QualityUtils.qualToErrorProb(readQuals[baseOffset]); + + final boolean matched = (readBase == haplotypeBase); // Purposefully counting wildcards in the chosen haplotype as a mismatch + byte qual = readQuals[baseOffset]; + if( qual == PileupElement.DELETION_BASE ) { qual = PileupElement.DELETION_QUAL; } + if( ((double) qual) < 4.0 ) { qual = (byte) 4; } // quals less than 4 are used as codes and don't have actual probabilistic meaning behind them + final double e = QualityUtils.qualToErrorProb(qual); expected += e; - mismatches += matched ? e : 1 - e / 3; + mismatches += matched ? e : 1.0 - e / 3.0; // a more sophisticated calculation would include the reference quality, but it's nice to actually penalize // the mismatching of poorly determined regions of the consensus - if ( DEBUG ) System.out.printf("Read %s: scoring %c vs. %c => e = %f Q%d esum %f vs. msum %f%n", - read.getReadName(), (char)haplotypeBase, (char)readBase, e, readQuals[baseOffset], expected, mismatches); +// if ( DEBUG ) { +// System.out.printf("Read %s: scoring %c vs. %c => e = %f Q%d esum %f vs. msum %f%n", +// read.getReadName(), (char)haplotypeBase, (char)readBase, e, readQuals[baseOffset], expected, mismatches); +// } } return mismatches - expected; } - - private static final double[] FLAT_BASE_PRIORS = new double[BaseUtils.Base.values().length]; - static { - for ( int i = 0; i < BaseUtils.Base.values().length; i++ ) - FLAT_BASE_PRIORS[i] = Math.log10(1.0 / BaseUtils.Base.values().length); - } - - public List getKeyNames() { return Arrays.asList("HaplotypeScore"); } public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("HaplotypeScore", 1, VCFHeaderLineType.Float, "Consistency of the site with two (and only two) segregating haplotypes")); } } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java index 1c9c9312e..7036d15c6 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java @@ -9,7 +9,7 @@ import java.util.List; import java.util.Arrays; -public class MappingQualityRankSumTest /*extends RankSumTest*/ { +public class MappingQualityRankSumTest extends RankSumTest { public List getKeyNames() { return Arrays.asList("MQRankSum"); } @@ -21,10 +21,11 @@ public class MappingQualityRankSumTest /*extends RankSumTest*/ { if ( p.isDeletion() ) continue; - if ( p.getBase() == ref ) + if ( p.getBase() == ref ) { refQuals.add(p.getMappingQual()); - else if ( (char)p.getBase() == alt ) + } else if ( (char)p.getBase() == alt ) { altQuals.add(p.getMappingQual()); + } } } } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java index 68696fe04..a80ebc3f8 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java @@ -15,9 +15,9 @@ import java.util.Map; import java.util.HashMap; -public abstract class RankSumTest implements InfoFieldAnnotation, WorkInProgressAnnotation { +public abstract class RankSumTest implements InfoFieldAnnotation, ExperimentalAnnotation { private final static boolean DEBUG = false; - private static final double minPValue = 1e-10; + private static final double minPValue = 1e-20; public Map annotate(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { if ( stratifiedContexts.size() == 0 ) @@ -30,45 +30,49 @@ public abstract class RankSumTest implements InfoFieldAnnotation, WorkInProgress if ( genotypes == null || genotypes.size() == 0 ) return null; - ArrayList refQuals = new ArrayList(); - ArrayList altQuals = new ArrayList(); + final ArrayList refQuals = new ArrayList(); + final ArrayList altQuals = new ArrayList(); - for ( Map.Entry genotype : genotypes.entrySet() ) { - // we care only about het calls - if ( genotype.getValue().isHet() ) { - StratifiedAlignmentContext context = stratifiedContexts.get(genotype.getKey()); + for ( final Map.Entry genotype : genotypes.entrySet() ) { + if ( !genotype.getValue().isHomRef() ) { + final StratifiedAlignmentContext context = stratifiedContexts.get(genotype.getKey()); if ( context == null ) continue; fillQualsFromPileup(ref.getBase(), vc.getAlternateAllele(0).toString().charAt(0), context.getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).getBasePileup(), refQuals, altQuals); } } - - WilcoxonRankSum wilcoxon = new WilcoxonRankSum(); - for ( Integer qual : altQuals ) + final WilcoxonRankSum wilcoxon = new WilcoxonRankSum(); + for ( final Integer qual : altQuals ) { wilcoxon.addObservation((double)qual, WilcoxonRankSum.WILCOXON_SET.SET1); - for ( Integer qual : refQuals ) + } + for ( final Integer qual : refQuals ) { wilcoxon.addObservation((double)qual, WilcoxonRankSum.WILCOXON_SET.SET2); - - // for R debugging - if ( DEBUG ) { - wilcoxon.DEBUG = DEBUG; - System.out.printf("%s%n", ref.getLocus()); - System.out.printf("alt <- c(%s)%n", Utils.join(",", altQuals)); - System.out.printf("ref <- c(%s)%n", Utils.join(",", refQuals)); } + // for R debugging + //if ( DEBUG ) { + // wilcoxon.DEBUG = DEBUG; + // System.out.printf("%s%n", ref.getLocus()); + // System.out.printf("alt <- c(%s)%n", Utils.join(",", altQuals)); + // System.out.printf("ref <- c(%s)%n", Utils.join(",", refQuals)); + //} + // we are testing these set1 (the alt bases) have lower quality scores than set2 (the ref bases) - double pvalue = wilcoxon.getPValue(WilcoxonRankSum.WILCOXON_H0.SMALLER_SET_LT); - if ( MathUtils.compareDoubles(pvalue, -1.0) == 0 ) - return null; + double pvalue = wilcoxon.getPValue(WilcoxonRankSum.WILCOXON_H0.SET1_LT_SET2); + //System.out.println("p = " + pvalue); + //System.out.println(); + if ( MathUtils.compareDoubles(pvalue, -1.0) == 0 ) { + pvalue = 1.0; + } // deal with precision issues - if ( pvalue < minPValue ) + if ( pvalue < minPValue ) { pvalue = minPValue; + } - Map map = new HashMap(); - map.put(getKeyNames().get(0), String.format("%.1f", QualityUtils.phredScaleErrorRate(pvalue))); + final Map map = new HashMap(); + map.put(getKeyNames().get(0), String.format("%.3f", QualityUtils.phredScaleErrorRate(pvalue))); return map; } diff --git a/java/src/org/broadinstitute/sting/utils/MathUtils.java b/java/src/org/broadinstitute/sting/utils/MathUtils.java index 8cfbf6dd4..79d457c82 100755 --- a/java/src/org/broadinstitute/sting/utils/MathUtils.java +++ b/java/src/org/broadinstitute/sting/utils/MathUtils.java @@ -325,6 +325,22 @@ public class MathUtils { return Math.sqrt(rms); } + /** + * calculate the Root Mean Square of an array of doubles + * @param x a double[] of numbers + * @return the RMS of the specified numbers. + */ + public static double rms(Double[] x) { + if ( x.length == 0 ) + return 0.0; + + double rms = 0.0; + for (Double i : x) + rms += i * i; + rms /= x.length; + return Math.sqrt(rms); + } + /** * normalizes the log10-based array * diff --git a/java/src/org/broadinstitute/sting/utils/WilcoxonRankSum.java b/java/src/org/broadinstitute/sting/utils/WilcoxonRankSum.java index b9ea9d0bc..bb925104c 100755 --- a/java/src/org/broadinstitute/sting/utils/WilcoxonRankSum.java +++ b/java/src/org/broadinstitute/sting/utils/WilcoxonRankSum.java @@ -92,8 +92,8 @@ public class WilcoxonRankSum { // just used to convert a z-score into a cumulative probability public boolean DEBUG = false; - private static final double NORMAL_MEAN = 0; - private static final double NORMAL_SD = 1; + private static final double NORMAL_MEAN = 0.0; + private static final double NORMAL_SD = 3.0; // Need to spread out the standard normal in order to provide better resolution private static final Normal NORMAL = new Normal(NORMAL_MEAN, NORMAL_SD, null); // The minimum length for both data series (individually) in order to use a normal distribution @@ -151,11 +151,11 @@ public class WilcoxonRankSum { double pvalue; // if we don't have enough data points, quit - if ( n1 < minimumNormalN || n2 < minimumNormalN ) { - pvalue = exactCalculation(h0, n1, n2, U1, U2); - } else { +// if ( n1 < minimumNormalN || n2 < minimumNormalN ) { +// pvalue = exactCalculation(h0, n1, n2, U1, U2); +// } else { pvalue = normalApproximation(h0, n1, n2, U1, U2); - } +// } if ( DEBUG && (n1 < minimumNormalN || n2 < minimumNormalN) ) { //for (int i = 0; i < observations.size(); i++) diff --git a/java/src/org/broadinstitute/sting/utils/genotype/Haplotype.java b/java/src/org/broadinstitute/sting/utils/genotype/Haplotype.java index 675cef5ff..6d8bf11de 100755 --- a/java/src/org/broadinstitute/sting/utils/genotype/Haplotype.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/Haplotype.java @@ -81,6 +81,14 @@ public class Haplotype { return s; } + public String toString() { + String returnString = ""; + for(int iii = 0; iii < bases.length; iii++) { + returnString += (char) bases[iii]; + } + return returnString; + } + public double[] getQuals() { return quals; } diff --git a/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java b/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java index 5d563c84c..f8b1d9dfa 100755 --- a/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java +++ b/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java @@ -18,7 +18,11 @@ import java.util.Arrays; */ public class PileupElement { public static final byte DELETION_BASE = BaseUtils.D; - public static final byte DELETION_QUAL = 0; + public static final byte INSERTION_BASE_A = 87; + public static final byte INSERTION_BASE_C = 88; + public static final byte INSERTION_BASE_T = 89; + public static final byte INSERTION_BASE_G = 90; + public static final byte DELETION_QUAL = 18; protected SAMRecord read; protected int offset; diff --git a/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java b/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java index 92c59ca2f..63ee5882e 100644 --- a/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java +++ b/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java @@ -422,7 +422,7 @@ public class AlignmentUtils { switch( ce.getOperator() ) { case I: case S: - for ( int jjj = 0 ; jjj < elementLength; jjj++ ) { + for ( int jjj = 0; jjj < elementLength; jjj++ ) { alignment[alignPos++] = '+'; } break; @@ -431,7 +431,7 @@ public class AlignmentUtils { refPos++; break; case M: - for ( int jjj = 0 ; jjj < elementLength; jjj++ ) { + for ( int jjj = 0; jjj < elementLength; jjj++ ) { alignment[alignPos] = ref[refPos]; alignPos++; refPos++; @@ -447,6 +447,147 @@ public class AlignmentUtils { return alignment; } + public static int calcAlignmentByteArrayOffset( final Cigar cigar, int pileupOffset, final int alignmentStart, final int refLocus ) { + + boolean atDeletion = false; + if(pileupOffset == -1) { + atDeletion = true; + pileupOffset = refLocus - alignmentStart; + final CigarElement ce = cigar.getCigarElement(0); + if( ce.getOperator() == CigarOperator.S ) { + pileupOffset += ce.getLength(); + } + } + int pos = 0; + int alignmentPos = 0; + for ( int iii = 0 ; iii < cigar.numCigarElements() ; iii++ ) { + + final CigarElement ce = cigar.getCigarElement(iii); + final int elementLength = ce.getLength(); + + switch( ce.getOperator() ) { + case I: + case S: + for ( int jjj = 0; jjj < elementLength; jjj++ ) { + if( pos == pileupOffset ) { + return alignmentPos; + } + pos++; + } + break; + case D: + case N: + if(!atDeletion) { + for ( int jjj = 0; jjj < elementLength; jjj++ ) { + alignmentPos++; + } + } else { + for ( int jjj = 0; jjj < elementLength; jjj++ ) { + if( pos == pileupOffset ) { + return alignmentPos; + } + pos++; + alignmentPos++; + } + + } + break; + case M: + for ( int jjj = 0; jjj < elementLength; jjj++ ) { + if( pos == pileupOffset ) { + return alignmentPos; + } + pos++; + alignmentPos++; + } + break; + case H: + case P: + break; + default: + throw new ReviewedStingException( "Unsupported cigar operator: " + ce.getOperator() ); + } + } + return alignmentPos; + } + + public static byte[] readToAlignmentByteArray( final Cigar cigar, final byte[] read ) { + + int alignmentLength = 0; + for ( int iii = 0 ; iii < cigar.numCigarElements() ; iii++ ) { + + final CigarElement ce = cigar.getCigarElement(iii); + final int elementLength = ce.getLength(); + + switch( ce.getOperator() ) { + case I: + case S: + break; + case D: + case N: + alignmentLength += elementLength; + break; + case M: + alignmentLength += elementLength; + break; + case H: + case P: + break; + default: + throw new ReviewedStingException( "Unsupported cigar operator: " + ce.getOperator() ); + } + } + + final byte[] alignment = new byte[alignmentLength]; + int alignPos = 0; + int readPos = 0; + for ( int iii = 0 ; iii < cigar.numCigarElements() ; iii++ ) { + + final CigarElement ce = cigar.getCigarElement(iii); + final int elementLength = ce.getLength(); + + switch( ce.getOperator() ) { + case I: + if( alignPos > 0 ) { + if(alignment[alignPos-1] == (byte) 'A') { + alignment[alignPos-1] = PileupElement.INSERTION_BASE_A; + } else if(alignment[alignPos-1] == (byte) 'C') { + alignment[alignPos-1] = PileupElement.INSERTION_BASE_C; + } else if(alignment[alignPos-1] == (byte) 'T') { + alignment[alignPos-1] = PileupElement.INSERTION_BASE_T; + } else if(alignment[alignPos-1] == (byte) 'G') { + alignment[alignPos-1] = PileupElement.INSERTION_BASE_G; + } + } + case S: + for ( int jjj = 0; jjj < elementLength; jjj++ ) { + readPos++; + } + break; + case D: + case N: + for ( int jjj = 0; jjj < elementLength; jjj++ ) { + alignment[alignPos] = PileupElement.DELETION_BASE; + alignPos++; + } + break; + case M: + for ( int jjj = 0; jjj < elementLength; jjj++ ) { + alignment[alignPos] = read[readPos]; + alignPos++; + readPos++; + } + break; + case H: + case P: + break; + default: + throw new ReviewedStingException( "Unsupported cigar operator: " + ce.getOperator() ); + } + } + return alignment; + } + /** * Due to (unfortunate) multiple ways to indicate that read is unmapped allowed by SAM format * specification, one may need this convenience shortcut. Checks both 'read unmapped' flag and diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java index 72cb124ea..6545128e0 100755 --- a/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java @@ -31,7 +31,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testHasAnnotsAsking1() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G \"Standard\" -B:variant,VCF " + validationDataLocation + "vcfexample2.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, - Arrays.asList("7ffa7d0ae9e4265eced1f793be9eba11")); + Arrays.asList("98861116ac1cdaf152adead3183664d8")); executeTest("test file has annotations, asking for annotations, #1", spec); } @@ -39,7 +39,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testHasAnnotsAsking2() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G \"Standard\" -B:variant,VCF " + validationDataLocation + "vcfexample3.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1, - Arrays.asList("063235b9a98137902a77b6efbaffc618")); + Arrays.asList("53d083505fc82fb388566d3856ed20ba")); executeTest("test file has annotations, asking for annotations, #2", spec); } @@ -63,7 +63,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testNoAnnotsAsking1() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G \"Standard\" -B:variant,VCF " + validationDataLocation + "vcfexample2empty.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, - Arrays.asList("a20994036d2deeeff377a89d9b93a0c6")); + Arrays.asList("c5807c5794c3a847d78e3800553d989a")); executeTest("test file doesn't have annotations, asking for annotations, #1", spec); } @@ -71,7 +71,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testNoAnnotsAsking2() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G \"Standard\" -B:variant,VCF " + validationDataLocation + "vcfexample3empty.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1, - Arrays.asList("3677fd14ef2975afdca559a871f57a33")); + Arrays.asList("a9b2a7adee7ba8b0f5d7ff8d92e6dfbd")); executeTest("test file doesn't have annotations, asking for annotations, #2", spec); } @@ -79,7 +79,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testOverwritingHeader() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G \"Standard\" -B:variant,VCF " + validationDataLocation + "vcfexample4.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,001,292", 1, - Arrays.asList("d4d33310e0280a9329efb8c6b39f7661")); + Arrays.asList("e3c839910aa82061742e33196b112cb0")); executeTest("test overwriting header", spec); } diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 9380bb06f..dba3eb8fe 100755 --- a/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -25,7 +25,7 @@ public class public void testMultiSamplePilot1() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1, - Arrays.asList("e90af2265bdbfc1c336c7e1484b86a4a")); + Arrays.asList("ae901e034b00aef16d36295821b3ea63")); executeTest("testMultiSamplePilot1", spec); } @@ -33,7 +33,7 @@ public class public void testMultiSamplePilot2() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,050,000", 1, - Arrays.asList("9ef1405f3ddf4a9894d12718cc6041a1")); + Arrays.asList("2ad026dee3fe592c124eb8724a843a5e")); executeTest("testMultiSamplePilot2", spec); } @@ -41,7 +41,7 @@ public class public void testSingleSamplePilot2() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000", 1, - Arrays.asList("88a095d59e3210955dd066e54cfff6cd")); + Arrays.asList("bc573090407ae9ec4401757eaa03de20")); executeTest("testSingleSamplePilot2", spec); } @@ -51,7 +51,7 @@ public class // // -------------------------------------------------------------------------------------------------------------- - private final static String COMPRESSED_OUTPUT_MD5 = "f213174bc3f6890dbe72628763be75d0"; + private final static String COMPRESSED_OUTPUT_MD5 = "1889ecc5aa1b23b2e77b1bd192577f1a"; @Test public void testCompressedOutput() { @@ -78,7 +78,7 @@ public class @Test public void testParallelization() { - String md5 = "4c88572ec014cd0b256b76cb5fac41df"; + String md5 = "d19745ab31f903de8d5a8e853b4e52dd"; WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,075,000", 1, @@ -105,12 +105,12 @@ public class @Test public void testParameter() { HashMap e = new HashMap(); - e.put( "-genotype", "9d24c57250ec66905a157975c27f7094" ); - e.put( "-all_bases", "6bd860e4de6a4f013693a49556ccfd02" ); - e.put( "--min_base_quality_score 26", "94de36ab7021e767f14903b7fd0cf80e" ); - e.put( "--min_mapping_quality_score 26", "a86e9cdc629f0957658f8d570014f45b" ); - e.put( "--max_mismatches_in_40bp_window 5", "4cf60eeff7f25d8e778c72deb7e14cc2" ); - e.put( "--p_nonref_model GRID_SEARCH", "eda1afbdb42c9c5d6fc07a321020071a" ); + e.put( "-genotype", "37d3954e19309a24c386758afad93252" ); + e.put( "-all_bases", "04568093c5dc70fa7965b4ab15fd0f7e" ); + e.put( "--min_base_quality_score 26", "5d1886a9637183707645bc2dc6bf8282" ); + e.put( "--min_mapping_quality_score 26", "78423524cf56cce1d0847847d542459f" ); + e.put( "--max_mismatches_in_40bp_window 5", "2963c771aafe84b62082f475d20bad5e" ); + e.put( "--p_nonref_model GRID_SEARCH", "c254a4e593b4ffb112299be874503ce0" ); for ( Map.Entry entry : e.entrySet() ) { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( @@ -124,12 +124,12 @@ public class public void testConfidence() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 ", 1, - Arrays.asList("eda1afbdb42c9c5d6fc07a321020071a")); + Arrays.asList("c254a4e593b4ffb112299be874503ce0")); executeTest("testConfidence1", spec1); WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_emit_conf 10 ", 1, - Arrays.asList("8daa14278976555e64c582c4e44b9b8e")); + Arrays.asList("d2323a0234d27257393f4931fca70dbc")); executeTest("testConfidence2", spec2); } @@ -141,8 +141,8 @@ public class @Test public void testHeterozyosity() { HashMap e = new HashMap(); - e.put( 0.01, "3679786112b414546a464c94c900174e" ); - e.put( 1.0 / 1850, "efa1cb09fa72dd4bd6dbdf6c0fa0f038" ); + e.put( 0.01, "96e85b26cf5f0a523b4b0886dbb902b1" ); + e.put( 1.0 / 1850, "5ffecb68b52169ded04312aa5dcdc137" ); for ( Map.Entry entry : e.entrySet() ) { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( @@ -165,7 +165,7 @@ public class " -o %s" + " -L 1:10,000,000-10,100,000", 1, - Arrays.asList("037e7c0d56e88b4d85f326bf27ad9f1c")); + Arrays.asList("e65c95a9d2a0995078d3e6835cf4ee61")); executeTest(String.format("testMultiTechnologies"), spec); }