diff --git a/java/src/org/broadinstitute/sting/gatk/contexts/StratifiedAlignmentContext.java b/java/src/org/broadinstitute/sting/gatk/contexts/StratifiedAlignmentContext.java index e8e405afc..afa03c600 100755 --- a/java/src/org/broadinstitute/sting/gatk/contexts/StratifiedAlignmentContext.java +++ b/java/src/org/broadinstitute/sting/gatk/contexts/StratifiedAlignmentContext.java @@ -36,6 +36,7 @@ import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup; import java.util.ArrayList; import java.util.HashMap; import java.util.Map; +import java.util.Collection; /** * Useful class for storing different AlignmentContexts @@ -53,9 +54,17 @@ public class StratifiedAlignmentContext { private GenomeLoc loc; private AlignmentContext[] contexts = new AlignmentContext[StratifiedContextType.values().length]; + + // todo -- why are you storing reads separately each time? There's a ReadBackedPileup object that's supposed to handle this private ArrayList[] reads = new ArrayList[StratifiedContextType.values().length]; private ArrayList[] offsets = new ArrayList[StratifiedContextType.values().length]; + // + // accessors + // + public GenomeLoc getLocation() { return loc; } + public ArrayList getReads(StratifiedContextType type) { return reads[type.ordinal()]; } + public ArrayList getOffsets(StratifiedContextType type) { return offsets[type.ordinal()]; } public StratifiedAlignmentContext(GenomeLoc loc) { this.loc = loc; @@ -65,10 +74,10 @@ public class StratifiedAlignmentContext { } } - public AlignmentContext getContext(StratifiedContextType context) { - int index = context.ordinal(); + public AlignmentContext getContext(StratifiedContextType type) { + int index = type.ordinal(); if ( contexts[index] == null ) - contexts[index] = new AlignmentContext(loc, new ReadBackedPileup(loc, reads[index], offsets[index])); + contexts[index] = new AlignmentContext(loc, new ReadBackedPileup(loc, getReads(type), getOffsets(type))); return contexts[index]; } @@ -214,4 +223,20 @@ public class StratifiedAlignmentContext { return contexts; } + + public static AlignmentContext joinContexts(Collection contexts, StratifiedContextType type) { + ArrayList reads = new ArrayList(); + ArrayList offsets = new ArrayList(); + + GenomeLoc loc = null; + for ( StratifiedAlignmentContext context : contexts ) { + loc = context.getLocation(); + reads.addAll(context.getReads(type)); + offsets.addAll(context.getOffsets(type)); + } + + if ( loc == null ) + throw new StingException("BUG: joinContexts requires at least one context to join"); + return new AlignmentContext(loc, new ReadBackedPileup(loc, reads, offsets)); + } } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java new file mode 100755 index 000000000..aaee42402 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java @@ -0,0 +1,219 @@ +package org.broadinstitute.sting.gatk.walkers.annotator; + +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.variantcontext.*; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.*; +import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMReaderID; +import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.pileup.ExtendedPileupElement; +import org.broadinstitute.sting.utils.genotype.vcf.VCFInfoHeaderLine; + +import java.util.*; + +import net.sf.samtools.SAMRecord; + + +public class HaplotypeScore implements InfoFieldAnnotation, WorkInProgressAnnotation { + + public Map annotate(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + if ( !vc.isBiallelic() || !vc.isSNP() ) + return null; + + AlignmentContext context = StratifiedAlignmentContext.joinContexts(stratifiedContexts.values(), StratifiedAlignmentContext.StratifiedContextType.COMPLETE); + + int contextWingSize = Math.min(((int)ref.getWindow().size() - 1)/2, 10); + int contextSize = contextWingSize * 2 + 1; + + int refMiddle = (int)(ref.getWindow().size() - 1) / 2; + int refStart = refMiddle - contextWingSize; + int refStop = refMiddle + contextWingSize + 1; + String refString = new String(ref.getBases()).substring(refStart, refStop); + Consensus refConsensus = new Consensus(refString.getBytes(), 60); + + ReadBackedPileup altPileup = getPileupOfAllele(vc.getAlternateAllele(0), context.getBasePileup()); + Consensus altConsensus = new Consensus(altPileup, contextSize); + + //System.exit(1); + // calculate the haplotype scores by walking over all reads and comparing them to the haplotypes + double score = scoreReadsAgainstHaplotypes(Arrays.asList(refConsensus, altConsensus), context.getBasePileup(), contextSize); + + // return the score + Map map = new HashMap(); + map.put(getKeyName(), String.format("%.2f", score)); + return map; + } + + // calculate the haplotype scores by walking over all reads and comparing them to the haplotypes + private double scoreReadsAgainstHaplotypes(List haplotypes, ReadBackedPileup pileup, int contextSize) { + System.out.printf("REF: %s%n", haplotypes.get(0)); + System.out.printf("ALT: %s%n", haplotypes.get(1)); + + double[][] haplotypeScores = new double[pileup.size()][haplotypes.size()]; + for ( ExtendedPileupElement p : pileup.extendedForeachIterator() ) { + SAMRecord read = p.getRead(); + int readOffsetFromPileup = p.getOffset(); + + System.out.printf("--------------------------------------------- Read %s%n", read.getReadName()); + double m = 10000000; + for ( int i = 0; i < haplotypes.size(); i++ ) { + Consensus haplotype = haplotypes.get(i); + int start = readOffsetFromPileup - (contextSize - 1)/2; + double score = scoreReadAgainstHaplotype(read, start, contextSize, haplotype); + haplotypeScores[p.getPileupOffset()][i] = score; + System.out.printf(" vs. haplotype %d = %f%n", i, score); + m = Math.min(score, m); + } + System.out.printf(" => best score was %f%n", m); + } + + double overallScore = 0.0; + for ( double[] readHaplotypeScores : haplotypeScores ) { + overallScore += MathUtils.arrayMin(readHaplotypeScores); + } + + return overallScore; + } + + private double scoreReadAgainstHaplotype(SAMRecord read, int baseOffsetStart, int contextSize, Consensus haplotype ) { + double log10LExpected = 0.0; + double log10LMismatches = 0.0; + + for ( int i = 0; i < contextSize; i++ ) { + int baseOffset = i + baseOffsetStart; + if ( baseOffset < 0 ) + continue; + if ( baseOffset >= read.getReadLength() ) + break; + + byte haplotypeBase = haplotype.bases[i]; + byte readBase = read.getReadBases()[baseOffset]; + if ( ! BaseUtils.basesAreEqual(readBase, haplotypeBase ) ) { + log10LMismatches += read.getBaseQualities()[baseOffset]; + } + + //System.out.printf("Read %s: scoring %c vs. %c => %f%n", read.getReadName(), (char)haplotypeBase, (char)readBase, log10LMismatches); + } + + return log10LMismatches; // - log10LExpected; + } + + + private static final double[] FLAT_BASE_PRIORS = new double[BaseUtils.Base.values().length]; + static { + for ( int i = 0; i < BaseUtils.Base.values().length; i++ ) + FLAT_BASE_PRIORS[i] = Math.log10(1.0 / BaseUtils.Base.values().length); + } + + private class Consensus { + byte[] bases = null; + byte[] quals = null; + + /** + * Create a simple consensus sequence with provided bases and a uniform quality over all bases of qual + * + * @param bases + * @param qual + */ + Consensus(byte[] bases, int qual) { + this.bases = bases; + quals = new byte[bases.length]; + Arrays.fill(quals, (byte)qual); + } + + Consensus(ReadBackedPileup pileup, int contextSize ) { + this.bases = new byte[contextSize]; + this.quals = new byte[contextSize]; + calculateConsensusOverWindow(pileup, contextSize, (contextSize - 1) / 2); + } + + private void calculateConsensusOverWindow(ReadBackedPileup pileup, int contextSize, int pileupOffset) { + // for each context position + for ( int i = 0; i < contextSize; i++ ) { + int offsetFromPileup = i - pileupOffset; + ReadBackedPileup offsetPileup = pileupAtOffset(pileup, offsetFromPileup); + System.out.printf("pileup is %s%n", offsetPileup); + BaseQual bq = calcConsensusAtLocus(offsetPileup, FLAT_BASE_PRIORS); + this.bases[i] = bq.getBase(); + this.quals[i] = bq.getQual(); + System.out.printf(" At %d: offset %d bq = %c / %d%n", i, offsetFromPileup, (char)bq.getBase(), bq.getQual()); + } + } + + private BaseQual calcConsensusAtLocus( ReadBackedPileup pileup, double[] log10priors ) { + double[] log10BaseLikelihoods = new double[BaseUtils.Base.values().length]; + + // loop over a, c, g, t and determine the most likely hypothesis + for ( BaseUtils.Base base : BaseUtils.Base.values() ) { + double log10L = log10priors[base.getIndex()]; + + for ( PileupElement p : pileup ) { + double baseP = QualityUtils.qualToProb(p.getQual()); + double L = base.sameBase(p.getBase()) ? baseP : 1 - baseP; + log10L += Math.log10(L); + } + + log10BaseLikelihoods[base.getIndex()] = log10L; + } + + double[] posteriors = MathUtils.normalizeFromLog10(log10BaseLikelihoods, false); + int mostLikelyIndex = MathUtils.maxElementIndex(posteriors); + byte mostLikelyBase = BaseUtils.Base.values()[mostLikelyIndex].getBase(); // get the most likely option + double MAX_CONSENSUS_QUALITY = 0.000001; + byte qual = QualityUtils.probToQual(posteriors[mostLikelyIndex],MAX_CONSENSUS_QUALITY); // call posterior calculator here over L over bases + return new BaseQual(mostLikelyBase, qual); + } + + public String toString() { return new String(this.bases); } + } + + private class BaseQual extends Pair { + public BaseQual(byte base, byte qual) { + super(base, qual); + } + + public byte getBase() { return getFirst(); } + public byte getQual() { return getSecond(); } + } + + private static ReadBackedPileup pileupAtOffset(ReadBackedPileup pileup, int offsetFromPileup) { + ArrayList reads = new ArrayList(); + ArrayList offsets = new ArrayList(); + + // go through the pileup, read by read, collecting up the context at offset + for ( PileupElement p : pileup ) { + // not safe -- doesn't work for indel-containing reads! + + // whole algorithm should be restructured to handle other reads in the window or use LocusIteratorByState + SAMRecord read = p.getRead(); + int readOffsetInPileup = p.getOffset(); + int neededReadOffset = readOffsetInPileup + offsetFromPileup; + if ( neededReadOffset >= 0 && neededReadOffset < read.getReadLength() ) { + reads.add(p.getRead()); + offsets.add(neededReadOffset); + } + } + + return new ReadBackedPileup(pileup.getLocation(), reads, offsets); + } + + private static ReadBackedPileup getPileupOfAllele( Allele allele, ReadBackedPileup pileup ) { + ArrayList filteredPileup = new ArrayList(); + byte alleleBase = allele.getBases()[0]; // assumes SNP + + for ( PileupElement p : pileup ) { + if ( p.getBase() == alleleBase ) { + filteredPileup.add(p); + } + } + + return new ReadBackedPileup(pileup.getLocation(), filteredPileup); + } + + public String getKeyName() { return "HaplotypeScore"; } + public VCFInfoHeaderLine getDescription() { return new VCFInfoHeaderLine("HaplotypeScore", 1, VCFInfoHeaderLine.INFO_TYPE.Float, "Consistency of the site with two (and only two) segregating haplotypes"); } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java index 9b0169a27..1645ed20a 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java @@ -51,26 +51,26 @@ public class QualByDepth implements InfoFieldAnnotation, StandardAnnotation { return depth; } - private double genotypeQualByDepth(final Map genotypes, Map stratifiedContexts) { - ArrayList qualsByDepth = new ArrayList(); - for ( Map.Entry genotype : genotypes.entrySet() ) { - - // we care only about variant calls - if ( genotype.getValue().isHomRef() ) - continue; - - StratifiedAlignmentContext context = stratifiedContexts.get(genotype.getKey()); - if ( context == null ) - continue; - - qualsByDepth.add(genotype.getValue().getNegLog10PError() / context.getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).size()); - } - - double mean = 0.0; - for ( Double value : qualsByDepth ) - mean += value; - mean /= qualsByDepth.size(); - - return mean; - } +// private double genotypeQualByDepth(final Map genotypes, Map stratifiedContexts) { +// ArrayList qualsByDepth = new ArrayList(); +// for ( Map.Entry genotype : genotypes.entrySet() ) { +// +// // we care only about variant calls +// if ( genotype.getValue().isHomRef() ) +// continue; +// +// StratifiedAlignmentContext context = stratifiedContexts.get(genotype.getKey()); +// if ( context == null ) +// continue; +// +// qualsByDepth.add(genotype.getValue().getNegLog10PError() / context.getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).size()); +// } +// +// double mean = 0.0; +// for ( Double value : qualsByDepth ) +// mean += value; +// mean /= qualsByDepth.size(); +// +// return mean; +// } } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/utils/BaseUtils.java b/java/src/org/broadinstitute/sting/utils/BaseUtils.java index 6b9ff4284..e106859b5 100644 --- a/java/src/org/broadinstitute/sting/utils/BaseUtils.java +++ b/java/src/org/broadinstitute/sting/utils/BaseUtils.java @@ -10,6 +10,30 @@ import java.util.Random; public class BaseUtils { public final static char[] BASES = { 'A', 'C', 'G', 'T' }; public final static char[] EXTENDED_BASES = { 'A', 'C', 'G', 'T', 'N', 'D' }; + + public enum Base { + A ( 'A', 0 ), + C ( 'C', 1 ), + G ( 'G', 2 ), + T ( 'T', 3 ); + + byte b; + int index; + private Base(char base, int index) { + this.b = (byte)base; + this.index = index; + } + + public byte getBase() { return b; } + public char getBaseAsChar() { return (char)b; } + public int getIndex() { return index; } + + public boolean sameBase(byte o) { return b == o; } + public boolean sameBase(char o) { return b == (byte)o; } + public boolean sameBase(int i) { return index == i; } + } + + // todo -- fix me (enums?) public static final byte DELETION_INDEX = 4; public static final byte NO_CALL_INDEX = 5; // (this is 'N') diff --git a/java/src/org/broadinstitute/sting/utils/MathUtils.java b/java/src/org/broadinstitute/sting/utils/MathUtils.java index d93ec1137..9c3b70daf 100755 --- a/java/src/org/broadinstitute/sting/utils/MathUtils.java +++ b/java/src/org/broadinstitute/sting/utils/MathUtils.java @@ -308,4 +308,36 @@ public class MathUtils { public static double[] normalizeFromLog10(double[] array) { return normalizeFromLog10(array, false); } + + public static int maxElementIndex(double[] array) { + if ( array == null ) throw new IllegalArgumentException("Array cannot be null!"); + + int maxI = -1; + for ( int i = 0; i < array.length; i++ ) { + if ( maxI == -1 || array[i] > array[maxI] ) + maxI = i; + } + + return maxI; + } + + public static double arrayMax(double[] array) { + return array[maxElementIndex(array)]; + } + + public static double arrayMin(double[] array) { + return array[minElementIndex(array)]; + } + + public static int minElementIndex(double[] array) { + if ( array == null ) throw new IllegalArgumentException("Array cannot be null!"); + + int minI = -1; + for ( int i = 0; i < array.length; i++ ) { + if ( minI == -1 || array[i] < array[minI] ) + minI = i; + } + + return minI; + } } diff --git a/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java b/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java index 6b99ceb3f..627727389 100755 --- a/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java +++ b/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java @@ -507,4 +507,16 @@ public class ReadBackedPileup implements Iterable { private String getQualsString() { return quals2String(getQuals()); } + + public String toString() { + StringBuilder s = new StringBuilder(); + s.append(getLocation()); + s.append(": "); + + for ( PileupElement p : this ) { + s.append((char)p.getBase()); + } + + return s.toString(); + } }