Minor useful changes to BaseUtils and MathUtils to support a new haplotype score annotation that determines the two most likely haplotypes over an interval and scores variants by their consistency with a diploid model. Appears to be useful.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3085 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
e9e53f68ab
commit
b8ab74a6dc
|
|
@ -36,6 +36,7 @@ import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup;
|
|||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.Collection;
|
||||
|
||||
/**
|
||||
* Useful class for storing different AlignmentContexts
|
||||
|
|
@ -53,9 +54,17 @@ public class StratifiedAlignmentContext {
|
|||
|
||||
// the locus that every stored context shares
private GenomeLoc loc;
// lazily-built AlignmentContext for each stratification, indexed by StratifiedContextType.ordinal()
private AlignmentContext[] contexts = new AlignmentContext[StratifiedContextType.values().length];

// todo -- why are you storing reads separately each time? There's a ReadBackedPileup object that's supposed to handle this
// per-stratification reads and their in-read offsets, parallel arrays indexed by StratifiedContextType.ordinal()
private ArrayList<SAMRecord>[] reads = new ArrayList[StratifiedContextType.values().length];
private ArrayList<Integer>[] offsets = new ArrayList[StratifiedContextType.values().length];

//
// accessors
//
public GenomeLoc getLocation() { return loc; }
public ArrayList<SAMRecord> getReads(StratifiedContextType type) { return reads[type.ordinal()]; }
public ArrayList<Integer> getOffsets(StratifiedContextType type) { return offsets[type.ordinal()]; }
|
||||
|
||||
public StratifiedAlignmentContext(GenomeLoc loc) {
|
||||
this.loc = loc;
|
||||
|
|
@ -65,10 +74,10 @@ public class StratifiedAlignmentContext {
|
|||
}
|
||||
}
|
||||
|
||||
public AlignmentContext getContext(StratifiedContextType context) {
|
||||
int index = context.ordinal();
|
||||
public AlignmentContext getContext(StratifiedContextType type) {
|
||||
int index = type.ordinal();
|
||||
if ( contexts[index] == null )
|
||||
contexts[index] = new AlignmentContext(loc, new ReadBackedPileup(loc, reads[index], offsets[index]));
|
||||
contexts[index] = new AlignmentContext(loc, new ReadBackedPileup(loc, getReads(type), getOffsets(type)));
|
||||
return contexts[index];
|
||||
}
|
||||
|
||||
|
|
@ -214,4 +223,20 @@ public class StratifiedAlignmentContext {
|
|||
|
||||
return contexts;
|
||||
}
|
||||
|
||||
/**
 * Merges the reads and offsets of the given stratified contexts (for one stratification
 * type) into a single combined AlignmentContext.
 *
 * NOTE(review): the location used is that of the LAST context iterated; presumably all
 * contexts passed in share the same locus -- confirm with callers.
 *
 * @param contexts the stratified contexts to join; must contain at least one element
 * @param type     which stratification (e.g. COMPLETE) to pull reads/offsets from
 * @return a new AlignmentContext backed by the combined pileup
 * @throws StingException if contexts is empty (no location can be determined)
 */
public static AlignmentContext joinContexts(Collection<StratifiedAlignmentContext> contexts, StratifiedContextType type) {
    ArrayList<SAMRecord> reads = new ArrayList<SAMRecord>();
    ArrayList<Integer> offsets = new ArrayList<Integer>();

    GenomeLoc loc = null;
    for ( StratifiedAlignmentContext context : contexts ) {
        loc = context.getLocation(); // last one wins; assumes all contexts sit at the same locus
        reads.addAll(context.getReads(type));
        offsets.addAll(context.getOffsets(type));
    }

    if ( loc == null )
        throw new StingException("BUG: joinContexts requires at least one context to join");
    return new AlignmentContext(loc, new ReadBackedPileup(loc, reads, offsets));
}
|
||||
}
|
||||
|
|
@ -0,0 +1,219 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.annotator;
|
||||
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.*;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.*;
|
||||
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMReaderID;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.pileup.ExtendedPileupElement;
|
||||
import org.broadinstitute.sting.utils.genotype.vcf.VCFInfoHeaderLine;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
|
||||
|
||||
/**
 * A work-in-progress INFO-field annotation that scores how consistent the reads at a
 * biallelic SNP site are with a simple two-haplotype model: the local reference
 * sequence versus a consensus built from the reads carrying the alternate allele.
 *
 * NOTE(review): contains live System.out debugging output and a commented-out
 * System.exit(1); presumably intentional while under development -- confirm before
 * enabling this annotation in production runs.
 */
public class HaplotypeScore implements InfoFieldAnnotation, WorkInProgressAnnotation {

    /**
     * Computes the haplotype score for this site.
     *
     * @param tracker            reference metadata tracker (unused here)
     * @param ref                reference context; its window supplies the local sequence
     * @param stratifiedContexts per-sample stratified alignment contexts, joined into one pileup
     * @param vc                 the variant being annotated
     * @return a singleton map of {getKeyName() -> formatted score}, or null for
     *         non-biallelic or non-SNP sites
     */
    public Map<String, Object> annotate(RefMetaDataTracker tracker, ReferenceContext ref, Map<String, StratifiedAlignmentContext> stratifiedContexts, VariantContext vc) {
        // only defined for biallelic SNPs
        if ( !vc.isBiallelic() || !vc.isSNP() )
            return null;

        // merge all samples' reads into a single pileup over this site
        AlignmentContext context = StratifiedAlignmentContext.joinContexts(stratifiedContexts.values(), StratifiedAlignmentContext.StratifiedContextType.COMPLETE);

        // use up to 10 bases of flanking context on each side, limited by the reference window
        int contextWingSize = Math.min(((int)ref.getWindow().size() - 1)/2, 10);
        int contextSize = contextWingSize * 2 + 1;

        // carve the reference haplotype out of the window, centered on the variant position
        int refMiddle = (int)(ref.getWindow().size() - 1) / 2;
        int refStart = refMiddle - contextWingSize;
        int refStop = refMiddle + contextWingSize + 1;
        String refString = new String(ref.getBases()).substring(refStart, refStop);
        Consensus refConsensus = new Consensus(refString.getBytes(), 60); // fixed Q60 for reference bases

        // build the alternate haplotype as a consensus over reads carrying the alt base
        ReadBackedPileup altPileup = getPileupOfAllele(vc.getAlternateAllele(0), context.getBasePileup());
        Consensus altConsensus = new Consensus(altPileup, contextSize);

        //System.exit(1);
        // calculate the haplotype scores by walking over all reads and comparing them to the haplotypes
        double score = scoreReadsAgainstHaplotypes(Arrays.asList(refConsensus, altConsensus), context.getBasePileup(), contextSize);

        // return the score
        Map<String, Object> map = new HashMap<String, Object>();
        map.put(getKeyName(), String.format("%.2f", score));
        return map;
    }

    // calculate the haplotype scores by walking over all reads and comparing them to the haplotypes;
    // each read contributes the score of whichever haplotype it matches best (lowest score)
    private double scoreReadsAgainstHaplotypes(List<Consensus> haplotypes, ReadBackedPileup pileup, int contextSize) {
        System.out.printf("REF: %s%n", haplotypes.get(0)); // debug output
        System.out.printf("ALT: %s%n", haplotypes.get(1)); // debug output

        // per-read, per-haplotype mismatch scores
        double[][] haplotypeScores = new double[pileup.size()][haplotypes.size()];
        for ( ExtendedPileupElement p : pileup.extendedForeachIterator() ) {
            SAMRecord read = p.getRead();
            int readOffsetFromPileup = p.getOffset();

            System.out.printf("--------------------------------------------- Read %s%n", read.getReadName());
            double m = 10000000; // sentinel larger than any realistic score
            for ( int i = 0; i < haplotypes.size(); i++ ) {
                Consensus haplotype = haplotypes.get(i);
                // align the haplotype window so its center falls on this read's pileup base
                int start = readOffsetFromPileup - (contextSize - 1)/2;
                double score = scoreReadAgainstHaplotype(read, start, contextSize, haplotype);
                haplotypeScores[p.getPileupOffset()][i] = score;
                System.out.printf(" vs. haplotype %d = %f%n", i, score);
                m = Math.min(score, m);
            }
            System.out.printf(" => best score was %f%n", m);
        }

        // sum, over reads, the best (lowest) per-haplotype score
        double overallScore = 0.0;
        for ( double[] readHaplotypeScores : haplotypeScores ) {
            overallScore += MathUtils.arrayMin(readHaplotypeScores);
        }

        return overallScore;
    }

    /**
     * Scores one read against one haplotype over the context window.  Lower is better:
     * each mismatching base adds its raw base quality to the score.
     *
     * NOTE(review): the quality is summed directly (not converted to log10 space), so
     * this is a Phred-scaled mismatch penalty rather than a true log10 likelihood, and
     * log10LExpected is currently unused -- presumably placeholders while WIP; confirm.
     *
     * @param read            the read to score
     * @param baseOffsetStart read offset corresponding to the first haplotype base (may be negative)
     * @param contextSize     number of haplotype bases to compare
     * @param haplotype       the haplotype to compare against
     * @return the accumulated mismatch penalty
     */
    private double scoreReadAgainstHaplotype(SAMRecord read, int baseOffsetStart, int contextSize, Consensus haplotype ) {
        double log10LExpected = 0.0;
        double log10LMismatches = 0.0;

        for ( int i = 0; i < contextSize; i++ ) {
            int baseOffset = i + baseOffsetStart;
            if ( baseOffset < 0 )   // haplotype window starts before the read; skip this position
                continue;
            if ( baseOffset >= read.getReadLength() )   // ran off the end of the read; done
                break;

            byte haplotypeBase = haplotype.bases[i];
            byte readBase = read.getReadBases()[baseOffset];
            if ( ! BaseUtils.basesAreEqual(readBase, haplotypeBase ) ) {
                log10LMismatches += read.getBaseQualities()[baseOffset];
            }

            //System.out.printf("Read %s: scoring %c vs. %c => %f%n", read.getReadName(), (char)haplotypeBase, (char)readBase, log10LMismatches);
        }

        return log10LMismatches; // - log10LExpected;
    }


    // uniform log10 prior over the four bases, used when calling the consensus
    private static final double[] FLAT_BASE_PRIORS = new double[BaseUtils.Base.values().length];
    static {
        for ( int i = 0; i < BaseUtils.Base.values().length; i++ )
            FLAT_BASE_PRIORS[i] = Math.log10(1.0 / BaseUtils.Base.values().length);
    }

    /**
     * A simple haplotype: a base sequence plus per-base qualities, either supplied
     * directly or called as a consensus from a pileup.
     */
    private class Consensus {
        byte[] bases = null;
        byte[] quals = null;

        /**
         * Create a simple consensus sequence with provided bases and a uniform quality over all bases of qual
         *
         * @param bases the haplotype bases
         * @param qual  the quality assigned to every base
         */
        Consensus(byte[] bases, int qual) {
            this.bases = bases;
            quals = new byte[bases.length];
            Arrays.fill(quals, (byte)qual);
        }

        /**
         * Call a consensus of length contextSize centered on the given pileup's locus.
         *
         * @param pileup      the reads to build the consensus from
         * @param contextSize total window length (center base plus wings)
         */
        Consensus(ReadBackedPileup pileup, int contextSize ) {
            this.bases = new byte[contextSize];
            this.quals = new byte[contextSize];
            calculateConsensusOverWindow(pileup, contextSize, (contextSize - 1) / 2);
        }

        // Calls the consensus base/quality at each window position by shifting the pileup
        // left/right of its locus; pileupOffset is the window position of the pileup's own locus.
        private void calculateConsensusOverWindow(ReadBackedPileup pileup, int contextSize, int pileupOffset) {
            // for each context position
            for ( int i = 0; i < contextSize; i++ ) {
                int offsetFromPileup = i - pileupOffset;
                ReadBackedPileup offsetPileup = pileupAtOffset(pileup, offsetFromPileup);
                System.out.printf("pileup is %s%n", offsetPileup); // debug output
                BaseQual bq = calcConsensusAtLocus(offsetPileup, FLAT_BASE_PRIORS);
                this.bases[i] = bq.getBase();
                this.quals[i] = bq.getQual();
                System.out.printf(" At %d: offset %d bq = %c / %d%n", i, offsetFromPileup, (char)bq.getBase(), bq.getQual()); // debug output
            }
        }

        /**
         * Picks the most likely base at a locus under a simple independent-errors model:
         * each pileup observation supports its own base with probability qualToProb(q)
         * and every other base with probability 1 - qualToProb(q).
         *
         * @param pileup      the observed bases/quals at this locus
         * @param log10priors log10 prior per base, indexed by BaseUtils.Base index
         * @return the most likely base paired with its posterior-derived quality
         */
        private BaseQual calcConsensusAtLocus( ReadBackedPileup pileup, double[] log10priors ) {
            double[] log10BaseLikelihoods = new double[BaseUtils.Base.values().length];

            // loop over a, c, g, t and determine the most likely hypothesis
            for ( BaseUtils.Base base : BaseUtils.Base.values() ) {
                double log10L = log10priors[base.getIndex()];

                for ( PileupElement p : pileup ) {
                    double baseP = QualityUtils.qualToProb(p.getQual());
                    // probability of this observation if the true base were 'base'
                    double L = base.sameBase(p.getBase()) ? baseP : 1 - baseP;
                    log10L += Math.log10(L);
                }

                log10BaseLikelihoods[base.getIndex()] = log10L;
            }

            double[] posteriors = MathUtils.normalizeFromLog10(log10BaseLikelihoods, false);
            int mostLikelyIndex = MathUtils.maxElementIndex(posteriors);
            byte mostLikelyBase = BaseUtils.Base.values()[mostLikelyIndex].getBase(); // get the most likely option
            // floor on the error probability passed to probToQual; caps the reported quality
            double MAX_CONSENSUS_QUALITY = 0.000001;
            byte qual = QualityUtils.probToQual(posteriors[mostLikelyIndex],MAX_CONSENSUS_QUALITY); // call posterior calculator here over L over bases
            return new BaseQual(mostLikelyBase, qual);
        }

        // renders just the base sequence, e.g. for the debug printfs above
        public String toString() { return new String(this.bases); }
    }

    // a (base, quality) pair with readable accessors
    private class BaseQual extends Pair<Byte, Byte> {
        public BaseQual(byte base, byte qual) {
            super(base, qual);
        }

        public byte getBase() { return getFirst(); }
        public byte getQual() { return getSecond(); }
    }

    /**
     * Shifts a pileup horizontally: for each read, keeps the base at
     * (its pileup offset + offsetFromPileup), dropping reads where that
     * offset falls outside the read.
     */
    private static ReadBackedPileup pileupAtOffset(ReadBackedPileup pileup, int offsetFromPileup) {
        ArrayList<SAMRecord> reads = new ArrayList<SAMRecord>();
        ArrayList<Integer> offsets = new ArrayList<Integer>();

        // go through the pileup, read by read, collecting up the context at offset
        for ( PileupElement p : pileup ) {
            // not safe -- doesn't work for indel-containing reads!

            // whole algorithm should be restructured to handle other reads in the window or use LocusIteratorByState
            SAMRecord read = p.getRead();
            int readOffsetInPileup = p.getOffset();
            int neededReadOffset = readOffsetInPileup + offsetFromPileup;
            if ( neededReadOffset >= 0 && neededReadOffset < read.getReadLength() ) {
                reads.add(p.getRead());
                offsets.add(neededReadOffset);
            }
        }

        return new ReadBackedPileup(pileup.getLocation(), reads, offsets);
    }

    /**
     * Filters a pileup down to the elements whose base matches the given allele's
     * first base (assumes a single-base, i.e. SNP, allele).
     */
    private static ReadBackedPileup getPileupOfAllele( Allele allele, ReadBackedPileup pileup ) {
        ArrayList<PileupElement> filteredPileup = new ArrayList<PileupElement>();
        byte alleleBase = allele.getBases()[0]; // assumes SNP

        for ( PileupElement p : pileup ) {
            if ( p.getBase() == alleleBase ) {
                filteredPileup.add(p);
            }
        }

        return new ReadBackedPileup(pileup.getLocation(), filteredPileup);
    }

    /** @return the INFO field key, "HaplotypeScore" */
    public String getKeyName() { return "HaplotypeScore"; }
    /** @return the VCF header line describing this annotation */
    public VCFInfoHeaderLine getDescription() { return new VCFInfoHeaderLine("HaplotypeScore", 1, VCFInfoHeaderLine.INFO_TYPE.Float, "Consistency of the site with two (and only two) segregating haplotypes"); }
}
|
||||
|
|
@ -51,26 +51,26 @@ public class QualByDepth implements InfoFieldAnnotation, StandardAnnotation {
|
|||
return depth;
|
||||
}
|
||||
|
||||
/**
 * Computes the mean of (genotype confidence / pileup depth) over all variant
 * (non-hom-ref) genotypes that have a stratified context.
 *
 * NOTE(review): if no genotype qualifies, qualsByDepth is empty and the final
 * division yields NaN (0.0 / 0) -- confirm callers tolerate that.
 *
 * @param genotypes          per-sample genotype calls
 * @param stratifiedContexts per-sample alignment contexts, keyed by sample name
 * @return the mean confidence-per-depth across variant genotypes
 */
private double genotypeQualByDepth(final Map<String, Genotype> genotypes, Map<String, StratifiedAlignmentContext> stratifiedContexts) {
    ArrayList<Double> qualsByDepth = new ArrayList<Double>();
    for ( Map.Entry<String, Genotype> genotype : genotypes.entrySet() ) {

        // we care only about variant calls
        if ( genotype.getValue().isHomRef() )
            continue;

        // samples without pileup data are skipped entirely
        StratifiedAlignmentContext context = stratifiedContexts.get(genotype.getKey());
        if ( context == null )
            continue;

        qualsByDepth.add(genotype.getValue().getNegLog10PError() / context.getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).size());
    }

    // arithmetic mean of the collected per-sample values
    double mean = 0.0;
    for ( Double value : qualsByDepth )
        mean += value;
    mean /= qualsByDepth.size();

    return mean;
}
|
||||
// private double genotypeQualByDepth(final Map<String, Genotype> genotypes, Map<String, StratifiedAlignmentContext> stratifiedContexts) {
|
||||
// ArrayList<Double> qualsByDepth = new ArrayList<Double>();
|
||||
// for ( Map.Entry<String, Genotype> genotype : genotypes.entrySet() ) {
|
||||
//
|
||||
// // we care only about variant calls
|
||||
// if ( genotype.getValue().isHomRef() )
|
||||
// continue;
|
||||
//
|
||||
// StratifiedAlignmentContext context = stratifiedContexts.get(genotype.getKey());
|
||||
// if ( context == null )
|
||||
// continue;
|
||||
//
|
||||
// qualsByDepth.add(genotype.getValue().getNegLog10PError() / context.getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).size());
|
||||
// }
|
||||
//
|
||||
// double mean = 0.0;
|
||||
// for ( Double value : qualsByDepth )
|
||||
// mean += value;
|
||||
// mean /= qualsByDepth.size();
|
||||
//
|
||||
// return mean;
|
||||
// }
|
||||
}
|
||||
|
|
@ -10,6 +10,30 @@ import java.util.Random;
|
|||
public class BaseUtils {
|
||||
// the four canonical DNA bases, in index order (A=0, C=1, G=2, T=3)
public final static char[] BASES = { 'A', 'C', 'G', 'T' };
// canonical bases plus 'N' and 'D' -- NOTE(review): this ordering (N at 4, D at 5)
// disagrees with the DELETION_INDEX=4 / NO_CALL_INDEX=5 constants below; confirm
// which ordering downstream code relies on.
public final static char[] EXTENDED_BASES = { 'A', 'C', 'G', 'T', 'N', 'D' };
|
||||
|
||||
/**
 * The four canonical DNA bases, each pairing its byte representation with a stable
 * array index (A=0, C=1, G=2, T=3) for use in likelihood/prior arrays.
 */
public enum Base {
    A ( 'A', 0 ),
    C ( 'C', 1 ),
    G ( 'G', 2 ),
    T ( 'T', 3 );

    // per-constant state; final because enum constants are immutable by design
    final byte b;
    final int index;

    private Base(char base, int index) {
        this.b = (byte)base;
        this.index = index;
    }

    /** @return the base as a byte (its ASCII code) */
    public byte getBase() { return b; }
    /** @return the base as a char, e.g. 'A' */
    public char getBaseAsChar() { return (char)b; }
    /** @return the stable array index of this base (A=0 .. T=3) */
    public int getIndex() { return index; }

    /** @return true if o is the byte encoding of this base (case-sensitive) */
    public boolean sameBase(byte o) { return b == o; }
    /** @return true if o is the character of this base (case-sensitive) */
    public boolean sameBase(char o) { return b == (byte)o; }
    /** @return true if i is this base's index */
    public boolean sameBase(int i) { return index == i; }
}
|
||||
|
||||
|
||||
// todo -- fix me (enums?)
// NOTE(review): these indices disagree with EXTENDED_BASES above, where 'N' sits at
// index 4 and 'D' at index 5 -- confirm which ordering downstream code relies on.
public static final byte DELETION_INDEX = 4;
public static final byte NO_CALL_INDEX = 5; // (this is 'N')
|
||||
|
|
|
|||
|
|
@ -308,4 +308,36 @@ public class MathUtils {
|
|||
/**
 * Convenience overload: normalizes an array of log10 values by delegating to the
 * two-argument form with the boolean flag set to false.
 *
 * NOTE(review): the overload's boolean parameter is defined elsewhere and not visible
 * here; presumably it controls whether the output stays in log10 space -- confirm in
 * the two-argument definition.
 */
public static double[] normalizeFromLog10(double[] array) {
    return normalizeFromLog10(array, false);
}
|
||||
|
||||
public static int maxElementIndex(double[] array) {
|
||||
if ( array == null ) throw new IllegalArgumentException("Array cannot be null!");
|
||||
|
||||
int maxI = -1;
|
||||
for ( int i = 0; i < array.length; i++ ) {
|
||||
if ( maxI == -1 || array[i] > array[maxI] )
|
||||
maxI = i;
|
||||
}
|
||||
|
||||
return maxI;
|
||||
}
|
||||
|
||||
/**
 * Returns the largest value in array.
 *
 * NOTE(review): an empty array makes maxElementIndex return -1, so this throws
 * ArrayIndexOutOfBoundsException rather than a clearer error -- confirm callers
 * never pass an empty array.
 */
public static double arrayMax(double[] array) {
    return array[maxElementIndex(array)];
}
|
||||
|
||||
/**
 * Returns the smallest value in array.
 *
 * NOTE(review): an empty array makes minElementIndex return -1, so this throws
 * ArrayIndexOutOfBoundsException rather than a clearer error -- confirm callers
 * never pass an empty array.
 */
public static double arrayMin(double[] array) {
    return array[minElementIndex(array)];
}
|
||||
|
||||
public static int minElementIndex(double[] array) {
|
||||
if ( array == null ) throw new IllegalArgumentException("Array cannot be null!");
|
||||
|
||||
int minI = -1;
|
||||
for ( int i = 0; i < array.length; i++ ) {
|
||||
if ( minI == -1 || array[i] < array[minI] )
|
||||
minI = i;
|
||||
}
|
||||
|
||||
return minI;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -507,4 +507,16 @@ public class ReadBackedPileup implements Iterable<PileupElement> {
|
|||
// Renders this pileup's quality scores as a display string via quals2String.
private String getQualsString() {
    return quals2String(getQuals());
}
|
||||
|
||||
/**
 * Renders this pileup as its location, a ": " separator, and then the base of each
 * pileup element in iteration order -- e.g. "chr1:100: ACCTA" -- useful for debugging.
 */
public String toString() {
    StringBuilder s = new StringBuilder();
    s.append(getLocation());
    s.append(": ");

    // one character per pileup element, in iteration order
    for ( PileupElement p : this ) {
        s.append((char)p.getBase());
    }

    return s.toString();
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue