Minor useful changes to BaseUtils and MathUtils to support a new haplotype score annotation that determines the two most likely haplotypes over an interval and scores variants by their consistency with a diploid model. Appears to be useful.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3085 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2010-03-28 21:45:22 +00:00
parent e9e53f68ab
commit b8ab74a6dc
6 changed files with 337 additions and 25 deletions

View File

@ -36,6 +36,7 @@ import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
import java.util.Collection;
/**
* Useful class for storing different AlignmentContexts
@ -53,9 +54,17 @@ public class StratifiedAlignmentContext {
private GenomeLoc loc;
private AlignmentContext[] contexts = new AlignmentContext[StratifiedContextType.values().length];
// todo -- why are you storing reads separately each time? There's a ReadBackedPileup object that's supposed to handle this
private ArrayList<SAMRecord>[] reads = new ArrayList[StratifiedContextType.values().length];
private ArrayList<Integer>[] offsets = new ArrayList[StratifiedContextType.values().length];
//
// accessors
//
/** Returns the genomic location shared by all of the stratified contexts. */
public GenomeLoc getLocation() { return loc; }
/** Returns the reads collected for the given stratified context type. */
public ArrayList<SAMRecord> getReads(StratifiedContextType type) { return reads[type.ordinal()]; }
/** Returns the per-read base offsets for the given stratified context type (parallel to getReads). */
public ArrayList<Integer> getOffsets(StratifiedContextType type) { return offsets[type.ordinal()]; }
public StratifiedAlignmentContext(GenomeLoc loc) {
this.loc = loc;
@ -65,10 +74,10 @@ public class StratifiedAlignmentContext {
}
}
public AlignmentContext getContext(StratifiedContextType context) {
int index = context.ordinal();
public AlignmentContext getContext(StratifiedContextType type) {
int index = type.ordinal();
if ( contexts[index] == null )
contexts[index] = new AlignmentContext(loc, new ReadBackedPileup(loc, reads[index], offsets[index]));
contexts[index] = new AlignmentContext(loc, new ReadBackedPileup(loc, getReads(type), getOffsets(type)));
return contexts[index];
}
@ -214,4 +223,20 @@ public class StratifiedAlignmentContext {
return contexts;
}
/**
 * Merges the reads and offsets of the given stratified contexts (for a single
 * context type) into one combined AlignmentContext.
 *
 * NOTE(review): the location used is whichever context is iterated last; all
 * inputs are presumably at the same locus — confirm with callers.
 *
 * @param contexts the per-sample contexts to merge; must be non-empty
 * @param type     which stratified context type to pull reads from
 * @return an AlignmentContext over the union of all reads/offsets
 * @throws StingException if contexts is empty
 */
public static AlignmentContext joinContexts(Collection<StratifiedAlignmentContext> contexts, StratifiedContextType type) {
    ArrayList<SAMRecord> mergedReads = new ArrayList<SAMRecord>();
    ArrayList<Integer> mergedOffsets = new ArrayList<Integer>();
    GenomeLoc mergedLoc = null;

    for ( StratifiedAlignmentContext c : contexts ) {
        mergedLoc = c.getLocation();
        mergedReads.addAll(c.getReads(type));
        mergedOffsets.addAll(c.getOffsets(type));
    }

    if ( mergedLoc == null )
        throw new StingException("BUG: joinContexts requires at least one context to join");

    return new AlignmentContext(mergedLoc, new ReadBackedPileup(mergedLoc, mergedReads, mergedOffsets));
}
}

View File

@ -0,0 +1,219 @@
package org.broadinstitute.sting.gatk.walkers.annotator;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.*;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.*;
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMReaderID;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.pileup.ExtendedPileupElement;
import org.broadinstitute.sting.utils.genotype.vcf.VCFInfoHeaderLine;
import java.util.*;
import net.sf.samtools.SAMRecord;
/**
 * Annotates a variant site with a haplotype-consistency score.
 *
 * Builds two candidate haplotypes over a small window centered on the site --
 * the reference sequence, and a consensus computed from the reads carrying the
 * alternate allele -- then scores every read in the pileup against both
 * haplotypes and sums, per read, the score of the better-matching one.
 * A score of 0 for a read means it matches one haplotype perfectly; lower
 * totals indicate the data are more consistent with exactly two segregating
 * haplotypes (a diploid model).
 */
public class HaplotypeScore implements InfoFieldAnnotation, WorkInProgressAnnotation {

    /**
     * Computes the HaplotypeScore annotation for a biallelic SNP.
     *
     * @param tracker            reference metadata tracker (unused here)
     * @param ref                reference context; its window supplies the reference haplotype bases
     * @param stratifiedContexts per-sample alignment contexts, joined into a single pileup
     * @param vc                 the variant being annotated
     * @return a single-entry map from the annotation key to the formatted score,
     *         or null when the site is not a biallelic SNP
     */
    public Map<String, Object> annotate(RefMetaDataTracker tracker, ReferenceContext ref, Map<String, StratifiedAlignmentContext> stratifiedContexts, VariantContext vc) {
        // this annotation is only defined for biallelic SNPs
        if ( !vc.isBiallelic() || !vc.isSNP() )
            return null;

        AlignmentContext context = StratifiedAlignmentContext.joinContexts(stratifiedContexts.values(), StratifiedAlignmentContext.StratifiedContextType.COMPLETE);

        // use up to 10 bases of context on each side of the variant, limited by the available reference window
        int contextWingSize = Math.min(((int)ref.getWindow().size() - 1)/2, 10);
        int contextSize = contextWingSize * 2 + 1;

        // extract the reference haplotype, centered on the variant position
        int refMiddle = (int)(ref.getWindow().size() - 1) / 2;
        int refStart = refMiddle - contextWingSize;
        int refStop = refMiddle + contextWingSize + 1;
        String refString = new String(ref.getBases()).substring(refStart, refStop);
        Consensus refConsensus = new Consensus(refString.getBytes(), 60); // reference bases get a uniform Q60

        // build the alternate haplotype as a consensus of the reads carrying the alt allele
        ReadBackedPileup altPileup = getPileupOfAllele(vc.getAlternateAllele(0), context.getBasePileup());
        Consensus altConsensus = new Consensus(altPileup, contextSize);

        // calculate the haplotype score by walking over all reads and comparing them to the haplotypes
        double score = scoreReadsAgainstHaplotypes(Arrays.asList(refConsensus, altConsensus), context.getBasePileup(), contextSize);

        // return the score
        Map<String, Object> map = new HashMap<String, Object>();
        map.put(getKeyName(), String.format("%.2f", score));
        return map;
    }

    /**
     * Scores every read in the pileup against each haplotype and sums each
     * read's best (minimum) score.
     *
     * @param haplotypes  the candidate haplotypes (reference and alternate)
     * @param pileup      all reads covering the site
     * @param contextSize the haplotype window length in bases
     * @return the sum over reads of the minimum per-haplotype score
     */
    private double scoreReadsAgainstHaplotypes(List<Consensus> haplotypes, ReadBackedPileup pileup, int contextSize) {
        double[][] haplotypeScores = new double[pileup.size()][haplotypes.size()];

        for ( ExtendedPileupElement p : pileup.extendedForeachIterator() ) {
            SAMRecord read = p.getRead();
            int readOffsetFromPileup = p.getOffset();

            for ( int i = 0; i < haplotypes.size(); i++ ) {
                Consensus haplotype = haplotypes.get(i);
                // position the haplotype window so its center lands on the pileup base
                int start = readOffsetFromPileup - (contextSize - 1)/2;
                haplotypeScores[p.getPileupOffset()][i] = scoreReadAgainstHaplotype(read, start, contextSize, haplotype);
            }
        }

        // each read contributes only its best (lowest) haplotype score
        double overallScore = 0.0;
        for ( double[] readHaplotypeScores : haplotypeScores )
            overallScore += MathUtils.arrayMin(readHaplotypeScores);

        return overallScore;
    }

    /**
     * Scores a single read against a haplotype: the sum of the read's base
     * qualities at positions where the read base disagrees with the haplotype
     * base (0 means a perfect match over the window).
     *
     * @param read            the read to score
     * @param baseOffsetStart read offset corresponding to haplotype position 0 (may be negative)
     * @param contextSize     haplotype window length
     * @param haplotype       the haplotype to compare against
     * @return the accumulated mismatch score
     */
    private double scoreReadAgainstHaplotype(SAMRecord read, int baseOffsetStart, int contextSize, Consensus haplotype ) {
        double mismatchScore = 0.0;

        for ( int i = 0; i < contextSize; i++ ) {
            int baseOffset = i + baseOffsetStart;
            if ( baseOffset < 0 )                       // haplotype window starts before the read begins
                continue;
            if ( baseOffset >= read.getReadLength() )   // ran off the end of the read; nothing more to score
                break;

            byte haplotypeBase = haplotype.bases[i];
            byte readBase = read.getReadBases()[baseOffset];
            if ( ! BaseUtils.basesAreEqual(readBase, haplotypeBase ) ) {
                // penalize mismatches by the read's phred-scaled base quality at that position
                mismatchScore += read.getBaseQualities()[baseOffset];
            }
        }

        return mismatchScore;
    }

    // uniform log10 prior over the four bases, used when calling the consensus
    private static final double[] FLAT_BASE_PRIORS = new double[BaseUtils.Base.values().length];
    static {
        for ( int i = 0; i < BaseUtils.Base.values().length; i++ )
            FLAT_BASE_PRIORS[i] = Math.log10(1.0 / BaseUtils.Base.values().length);
    }

    /**
     * A simple haplotype: a window of bases with per-base qualities, either
     * given directly or called as a consensus from a pileup.
     */
    private static class Consensus {
        byte[] bases = null;
        byte[] quals = null;

        /**
         * Create a simple consensus sequence with provided bases and a uniform
         * quality of qual over all bases.
         *
         * @param bases the haplotype bases
         * @param qual  the uniform per-base quality
         */
        Consensus(byte[] bases, int qual) {
            this.bases = bases;
            quals = new byte[bases.length];
            Arrays.fill(quals, (byte)qual);
        }

        /**
         * Call a consensus haplotype of length contextSize from the pileup,
         * centered on the pileup's locus.
         */
        Consensus(ReadBackedPileup pileup, int contextSize ) {
            this.bases = new byte[contextSize];
            this.quals = new byte[contextSize];
            calculateConsensusOverWindow(pileup, contextSize, (contextSize - 1) / 2);
        }

        // Calls the consensus base and quality at each position of the window.
        private void calculateConsensusOverWindow(ReadBackedPileup pileup, int contextSize, int pileupOffset) {
            // for each context position
            for ( int i = 0; i < contextSize; i++ ) {
                int offsetFromPileup = i - pileupOffset;
                ReadBackedPileup offsetPileup = pileupAtOffset(pileup, offsetFromPileup);
                BaseQual bq = calcConsensusAtLocus(offsetPileup, FLAT_BASE_PRIORS);
                this.bases[i] = bq.getBase();
                this.quals[i] = bq.getQual();
            }
        }

        /**
         * Calls the most likely base at a locus under a simple per-base error
         * model: each pileup base supports its own identity with probability
         * 1 - P(error) and every other base with P(error).
         */
        private BaseQual calcConsensusAtLocus( ReadBackedPileup pileup, double[] log10priors ) {
            double[] log10BaseLikelihoods = new double[BaseUtils.Base.values().length];

            // loop over a, c, g, t and determine the most likely hypothesis
            for ( BaseUtils.Base base : BaseUtils.Base.values() ) {
                double log10L = log10priors[base.getIndex()];
                for ( PileupElement p : pileup ) {
                    double baseP = QualityUtils.qualToProb(p.getQual());
                    double L = base.sameBase(p.getBase()) ? baseP : 1 - baseP;
                    log10L += Math.log10(L);
                }
                log10BaseLikelihoods[base.getIndex()] = log10L;
            }

            double[] posteriors = MathUtils.normalizeFromLog10(log10BaseLikelihoods, false);
            int mostLikelyIndex = MathUtils.maxElementIndex(posteriors);
            byte mostLikelyBase = BaseUtils.Base.values()[mostLikelyIndex].getBase(); // get the most likely option
            // NOTE(review): passed as probToQual's second argument; appears to floor the
            // error probability, capping the consensus quality -- confirm probToQual semantics
            final double MAX_CONSENSUS_QUALITY = 0.000001;
            byte qual = QualityUtils.probToQual(posteriors[mostLikelyIndex],MAX_CONSENSUS_QUALITY); // call posterior calculator here over L over bases
            return new BaseQual(mostLikelyBase, qual);
        }

        public String toString() { return new String(this.bases); }
    }

    /** A (base, quality) pair. */
    private static class BaseQual extends Pair<Byte, Byte> {
        public BaseQual(byte base, byte qual) {
            super(base, qual);
        }
        public byte getBase() { return getFirst(); }
        public byte getQual() { return getSecond(); }
    }

    /**
     * Builds the pileup of bases found at a fixed offset from the current
     * pileup position, read by read. Reads too short to reach the offset are
     * dropped.
     */
    private static ReadBackedPileup pileupAtOffset(ReadBackedPileup pileup, int offsetFromPileup) {
        ArrayList<SAMRecord> reads = new ArrayList<SAMRecord>();
        ArrayList<Integer> offsets = new ArrayList<Integer>();

        // go through the pileup, read by read, collecting up the context at offset
        for ( PileupElement p : pileup ) {
            // not safe -- doesn't work for indel-containing reads!
            // whole algorithm should be restructured to handle other reads in the window or use LocusIteratorByState
            SAMRecord read = p.getRead();
            int readOffsetInPileup = p.getOffset();
            int neededReadOffset = readOffsetInPileup + offsetFromPileup;
            if ( neededReadOffset >= 0 && neededReadOffset < read.getReadLength() ) {
                reads.add(p.getRead());
                offsets.add(neededReadOffset);
            }
        }

        return new ReadBackedPileup(pileup.getLocation(), reads, offsets);
    }

    /**
     * Filters the pileup down to the elements whose base matches the first
     * base of the given allele (assumes a SNP allele).
     */
    private static ReadBackedPileup getPileupOfAllele( Allele allele, ReadBackedPileup pileup ) {
        ArrayList<PileupElement> filteredPileup = new ArrayList<PileupElement>();

        byte alleleBase = allele.getBases()[0]; // assumes SNP

        for ( PileupElement p : pileup ) {
            if ( p.getBase() == alleleBase ) {
                filteredPileup.add(p);
            }
        }

        return new ReadBackedPileup(pileup.getLocation(), filteredPileup);
    }

    public String getKeyName() { return "HaplotypeScore"; }

    public VCFInfoHeaderLine getDescription() { return new VCFInfoHeaderLine("HaplotypeScore", 1, VCFInfoHeaderLine.INFO_TYPE.Float, "Consistency of the site with two (and only two) segregating haplotypes"); }
}

View File

@ -51,26 +51,26 @@ public class QualByDepth implements InfoFieldAnnotation, StandardAnnotation {
return depth;
}
/**
 * Computes the mean of (genotype confidence / depth) over all non-reference
 * genotypes that have an alignment context.
 *
 * @param genotypes          per-sample genotype calls
 * @param stratifiedContexts per-sample alignment contexts, keyed by sample name
 * @return the mean qual-by-depth, or 0.0 when no variant genotype has a
 *         context (previously this divided by zero and returned NaN)
 */
private double genotypeQualByDepth(final Map<String, Genotype> genotypes, Map<String, StratifiedAlignmentContext> stratifiedContexts) {
    ArrayList<Double> qualsByDepth = new ArrayList<Double>();
    for ( Map.Entry<String, Genotype> genotype : genotypes.entrySet() ) {

        // we care only about variant calls
        if ( genotype.getValue().isHomRef() )
            continue;

        // samples without pileup data cannot contribute
        StratifiedAlignmentContext context = stratifiedContexts.get(genotype.getKey());
        if ( context == null )
            continue;

        qualsByDepth.add(genotype.getValue().getNegLog10PError() / context.getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).size());
    }

    // guard: avoid 0/0 -> NaN when nothing qualified
    if ( qualsByDepth.isEmpty() )
        return 0.0;

    double mean = 0.0;
    for ( Double value : qualsByDepth )
        mean += value;

    return mean / qualsByDepth.size();
}
// private double genotypeQualByDepth(final Map<String, Genotype> genotypes, Map<String, StratifiedAlignmentContext> stratifiedContexts) {
// ArrayList<Double> qualsByDepth = new ArrayList<Double>();
// for ( Map.Entry<String, Genotype> genotype : genotypes.entrySet() ) {
//
// // we care only about variant calls
// if ( genotype.getValue().isHomRef() )
// continue;
//
// StratifiedAlignmentContext context = stratifiedContexts.get(genotype.getKey());
// if ( context == null )
// continue;
//
// qualsByDepth.add(genotype.getValue().getNegLog10PError() / context.getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).size());
// }
//
// double mean = 0.0;
// for ( Double value : qualsByDepth )
// mean += value;
// mean /= qualsByDepth.size();
//
// return mean;
// }
}

View File

@ -10,6 +10,30 @@ import java.util.Random;
public class BaseUtils {
public final static char[] BASES = { 'A', 'C', 'G', 'T' };
public final static char[] EXTENDED_BASES = { 'A', 'C', 'G', 'T', 'N', 'D' };
/**
 * The four canonical DNA bases, each pairing its ASCII byte representation
 * with a stable array index (A=0, C=1, G=2, T=3) for use in base-indexed
 * arrays such as likelihood vectors.
 */
public enum Base {
    A ( 'A', 0 ),
    C ( 'C', 1 ),
    G ( 'G', 2 ),
    T ( 'T', 3 );

    // final: enum constants are shared singletons and must be immutable
    final byte b;
    final int index;

    private Base(char base, int index) {
        this.b = (byte)base;
        this.index = index;
    }

    /** The base as an ASCII byte, e.g. (byte)'A'. */
    public byte getBase() { return b; }

    /** The base as an uppercase character. */
    public char getBaseAsChar() { return (char)b; }

    /** The array index of this base (A=0, C=1, G=2, T=3). */
    public int getIndex() { return index; }

    /** True if o is this base's byte (exact, case-sensitive comparison). */
    public boolean sameBase(byte o) { return b == o; }

    /** True if o is this base's character (exact, case-sensitive comparison). */
    public boolean sameBase(char o) { return b == (byte)o; }

    /** True if i is this base's array index. */
    public boolean sameBase(int i) { return index == i; }
}
// todo -- fix me (enums?)
public static final byte DELETION_INDEX = 4;
public static final byte NO_CALL_INDEX = 5; // (this is 'N')

View File

@ -308,4 +308,36 @@ public class MathUtils {
public static double[] normalizeFromLog10(double[] array) {
return normalizeFromLog10(array, false);
}
/**
 * Returns the index of the largest value in the array, or -1 for an empty
 * array. Ties are resolved in favor of the earliest index.
 *
 * @param array the values to scan; must not be null
 * @return index of the maximum element, or -1 if array is empty
 * @throws IllegalArgumentException if array is null
 */
public static int maxElementIndex(double[] array) {
    if ( array == null ) throw new IllegalArgumentException("Array cannot be null!");

    int bestIndex = -1;
    double bestValue = 0.0;
    for ( int i = 0; i < array.length; i++ ) {
        if ( bestIndex == -1 || array[i] > bestValue ) {
            bestIndex = i;
            bestValue = array[i];
        }
    }

    return bestIndex;
}
/**
 * Returns the largest value in the array.
 *
 * @param array the values to scan; must not be null or empty
 * @throws IllegalArgumentException       if array is null (via maxElementIndex)
 * @throws ArrayIndexOutOfBoundsException if array is empty (maxElementIndex returns -1)
 */
public static double arrayMax(double[] array) {
    final int indexOfMax = maxElementIndex(array);
    return array[indexOfMax];
}
/**
 * Returns the smallest value in the array.
 *
 * @param array the values to scan; must not be null or empty
 * @throws IllegalArgumentException       if array is null (via minElementIndex)
 * @throws ArrayIndexOutOfBoundsException if array is empty (minElementIndex returns -1)
 */
public static double arrayMin(double[] array) {
    final int indexOfMin = minElementIndex(array);
    return array[indexOfMin];
}
/**
 * Returns the index of the smallest value in the array, or -1 for an empty
 * array. Ties are resolved in favor of the earliest index.
 *
 * @param array the values to scan; must not be null
 * @return index of the minimum element, or -1 if array is empty
 * @throws IllegalArgumentException if array is null
 */
public static int minElementIndex(double[] array) {
    if ( array == null ) throw new IllegalArgumentException("Array cannot be null!");

    int bestIndex = -1;
    double bestValue = 0.0;
    for ( int i = 0; i < array.length; i++ ) {
        if ( bestIndex == -1 || array[i] < bestValue ) {
            bestIndex = i;
            bestValue = array[i];
        }
    }

    return bestIndex;
}
}

View File

@ -507,4 +507,16 @@ public class ReadBackedPileup implements Iterable<PileupElement> {
/** Renders this pileup's base qualities as a display string. */
private String getQualsString() {
    final byte[] pileupQuals = getQuals();
    return quals2String(pileupQuals);
}
/**
 * Renders the pileup as "location: bases", with one character per pileup
 * element in iteration order.
 */
public String toString() {
    final StringBuilder out = new StringBuilder();
    out.append(getLocation()).append(": ");
    for ( PileupElement element : this )
        out.append((char)element.getBase());
    return out.toString();
}
}