OK, bad idea to add new temporary annotation - revert to keep integration tests hapy.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3886 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
71bfb1ee35
commit
894623858d
|
|
@ -1,306 +0,0 @@
|
||||||
package org.broadinstitute.sting.gatk.walkers.annotator;
|
|
||||||
|
|
||||||
import net.sf.samtools.SAMRecord;
|
|
||||||
import org.broad.tribble.vcf.VCFHeaderLineType;
|
|
||||||
import org.broad.tribble.vcf.VCFInfoHeaderLine;
|
|
||||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
|
||||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
|
||||||
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
|
|
||||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.Allele;
|
|
||||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
|
|
||||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
|
||||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
|
|
||||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation;
|
|
||||||
import org.broadinstitute.sting.utils.BaseUtils;
|
|
||||||
import org.broadinstitute.sting.utils.MathUtils;
|
|
||||||
import org.broadinstitute.sting.utils.QualityUtils;
|
|
||||||
import org.broadinstitute.sting.utils.StingException;
|
|
||||||
import org.broadinstitute.sting.utils.collections.Pair;
|
|
||||||
import org.broadinstitute.sting.utils.pileup.ExtendedPileupElement;
|
|
||||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
|
||||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
|
||||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl;
|
|
||||||
|
|
||||||
import java.util.*;
|
|
||||||
/*
|
|
||||||
* Copyright (c) 2010 The Broad Institute
|
|
||||||
*
|
|
||||||
* Permission is hereby granted, free of charge, to any person
|
|
||||||
* obtaining a copy of this software and associated documentation
|
|
||||||
* files (the "Software"), to deal in the Software without
|
|
||||||
* restriction, including without limitation the rights to use,
|
|
||||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
||||||
* copies of the Software, and to permit persons to whom the
|
|
||||||
* Software is furnished to do so, subject to the following
|
|
||||||
* conditions:
|
|
||||||
*
|
|
||||||
* The above copyright notice and this permission notice shall be
|
|
||||||
* included in all copies or substantial portions of the Software.
|
|
||||||
*
|
|
||||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
||||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
|
||||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
||||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
|
||||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
|
||||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
||||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
|
||||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
||||||
*/
|
|
||||||
|
|
||||||
|
|
||||||
import org.broad.tribble.vcf.VCFHeaderLineType;
|
|
||||||
import org.broad.tribble.vcf.VCFInfoHeaderLine;
|
|
||||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
|
||||||
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
|
|
||||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
|
||||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.*;
|
|
||||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
|
||||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.*;
|
|
||||||
import org.broadinstitute.sting.utils.*;
|
|
||||||
import org.broadinstitute.sting.utils.collections.Pair;
|
|
||||||
import org.broadinstitute.sting.utils.pileup.*;
|
|
||||||
|
|
||||||
import java.util.*;
|
|
||||||
import java.util.regex.Pattern;
|
|
||||||
|
|
||||||
import net.sf.samtools.SAMRecord;
|
|
||||||
|
|
||||||
// todo -- rename to haplotype penalty
|
|
||||||
public class MyHaplotypeScore implements InfoFieldAnnotation, StandardAnnotation {
|
|
||||||
private final static boolean DEBUG = false;
|
|
||||||
private final static int MIN_CONTEXT_WING_SIZE = 10;
|
|
||||||
private final static String REGEXP_WILDCARD = ".";
|
|
||||||
|
|
||||||
public Map<String, Object> annotate(RefMetaDataTracker tracker, ReferenceContext ref, Map<String, StratifiedAlignmentContext> stratifiedContexts, VariantContext vc) {
|
|
||||||
if ( !vc.isBiallelic() || !vc.isSNP() || stratifiedContexts.size() == 0 ) // size 0 means that call was made by someone else and we have no data here
|
|
||||||
return null;
|
|
||||||
|
|
||||||
AlignmentContext context = StratifiedAlignmentContext.joinContexts(stratifiedContexts.values());
|
|
||||||
|
|
||||||
int contextWingSize = Math.min(((int)ref.getWindow().size() - 1)/2, MIN_CONTEXT_WING_SIZE);
|
|
||||||
int contextSize = contextWingSize * 2 + 1;
|
|
||||||
|
|
||||||
// Compute all haplotypes consistent with the current read pileup
|
|
||||||
List<HaplotypePair> haplotypePairs = computeHaplotypes(context.getBasePileup(), contextSize);
|
|
||||||
|
|
||||||
// calculate the haplotype scores by walking over all reads and comparing them to the haplotypes
|
|
||||||
double score = scoreReadsAgainstHaplotypes(haplotypePairs, context.getBasePileup(), contextSize);
|
|
||||||
|
|
||||||
// return the score
|
|
||||||
Map<String, Object> map = new HashMap<String, Object>();
|
|
||||||
map.put(getKeyNames().get(0), String.format("%.2f", score));
|
|
||||||
return map;
|
|
||||||
}
|
|
||||||
|
|
||||||
private List<HaplotypePair> computeHaplotypes(ReadBackedPileup pileup, int contextSize) {
|
|
||||||
// Compute all possible haplotypes consistent with current pileup
|
|
||||||
ArrayList<HaplotypePair> haplotypeList = new ArrayList<HaplotypePair>();
|
|
||||||
ArrayList<Integer> readsPerHaplotype = new ArrayList<Integer>();
|
|
||||||
|
|
||||||
for ( ExtendedPileupElement p : pileup.extendedForeachIterator() ) {
|
|
||||||
SAMRecord read = p.getRead();
|
|
||||||
int readOffsetFromPileup = p.getOffset();
|
|
||||||
int baseOffsetStart = readOffsetFromPileup - (contextSize - 1)/2;
|
|
||||||
|
|
||||||
byte[] haplotypeBases = new byte[contextSize];
|
|
||||||
|
|
||||||
for(int i=0; i < contextSize; i++) {
|
|
||||||
haplotypeBases[i] = REGEXP_WILDCARD.getBytes()[0];
|
|
||||||
}
|
|
||||||
|
|
||||||
int numUsedLocations = 0;
|
|
||||||
|
|
||||||
for (int i = 0; i < contextSize; i++ ) {
|
|
||||||
int baseOffset = i + baseOffsetStart;
|
|
||||||
if ( baseOffset < 0 )
|
|
||||||
continue;
|
|
||||||
if ( baseOffset >= read.getReadLength() )
|
|
||||||
break;
|
|
||||||
|
|
||||||
haplotypeBases[i] = read.getReadBases()[baseOffset];
|
|
||||||
numUsedLocations++;
|
|
||||||
}
|
|
||||||
|
|
||||||
// if (DEBUG)
|
|
||||||
// System.out.println(new String(haplotypeBases));
|
|
||||||
|
|
||||||
boolean foundHaplotypeMatch = false;
|
|
||||||
for (int pos = 0; pos < haplotypeList.size(); pos++ ) {
|
|
||||||
HaplotypePair elem = haplotypeList.get(pos);
|
|
||||||
|
|
||||||
if (patternMatches(elem.getHaplotype().toString(), new String(haplotypeBases))) {
|
|
||||||
// this haplotype is consistent with element in list: store whichever has more bases.
|
|
||||||
if (elem.getWeight() < numUsedLocations) {
|
|
||||||
// current read has more bases: remove old and keep new element
|
|
||||||
haplotypeList.set(pos,new HaplotypePair(new Haplotype(haplotypeBases,contextSize), numUsedLocations));
|
|
||||||
}
|
|
||||||
// increment by one the count of reads for current haplotype
|
|
||||||
Integer currReads = readsPerHaplotype.get(pos);
|
|
||||||
currReads++;
|
|
||||||
readsPerHaplotype.set(pos,currReads);
|
|
||||||
foundHaplotypeMatch = true;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!foundHaplotypeMatch) {
|
|
||||||
haplotypeList.add(new HaplotypePair(new Haplotype(haplotypeBases,contextSize), numUsedLocations));
|
|
||||||
readsPerHaplotype.add(1);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Now retrieve two most popular haplotypes
|
|
||||||
// TODO - quick and dirty solution, could use better data structures to do this automatically
|
|
||||||
int bestIdx=0, secondBestIdx=0, bestIdxVal=-1, secondBestIdxVal = -1;
|
|
||||||
|
|
||||||
for (int k=0; k < haplotypeList.size(); k++) {
|
|
||||||
int readCount = readsPerHaplotype.get(k);
|
|
||||||
if (readCount >= bestIdxVal) {
|
|
||||||
secondBestIdx = bestIdx;
|
|
||||||
secondBestIdxVal = bestIdxVal;
|
|
||||||
bestIdx = k;
|
|
||||||
bestIdxVal = readCount;
|
|
||||||
}
|
|
||||||
else if (readCount >= secondBestIdxVal) {
|
|
||||||
// check if current is second best
|
|
||||||
secondBestIdx = k;
|
|
||||||
secondBestIdxVal = readCount;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return Arrays.asList(haplotypeList.get(bestIdx),haplotypeList.get(secondBestIdx));
|
|
||||||
}
|
|
||||||
|
|
||||||
private boolean patternMatches(String a, String b) {
|
|
||||||
// fast poor man's version of Pattern.matches
|
|
||||||
if (a.length() != b.length())
|
|
||||||
throw new StingException("String a and b must be of same length");
|
|
||||||
|
|
||||||
char chA, chB;
|
|
||||||
char wc = REGEXP_WILDCARD.charAt(0);
|
|
||||||
|
|
||||||
for (int i=0; i < a.length(); i++) {
|
|
||||||
chA = a.charAt(i);
|
|
||||||
chB = b.charAt(i);
|
|
||||||
|
|
||||||
if (chA == wc)
|
|
||||||
continue;
|
|
||||||
|
|
||||||
if (chB == wc)
|
|
||||||
continue;
|
|
||||||
|
|
||||||
if (chA != chB)
|
|
||||||
return false;
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
// calculate the haplotype scores by walking over all reads and comparing them to the haplotypes
|
|
||||||
private double scoreReadsAgainstHaplotypes(List<HaplotypePair> haplotypePairs, ReadBackedPileup pileup, int contextSize) {
|
|
||||||
// if ( DEBUG ) System.out.printf("HAP1: %s%n", haplotypes.get(0));
|
|
||||||
// if ( DEBUG ) System.out.printf("HAP1: %s%n", haplotypes.get(1));
|
|
||||||
|
|
||||||
double[][] haplotypeScores = new double[pileup.size()][haplotypePairs.size()];
|
|
||||||
for ( ExtendedPileupElement p : pileup.extendedForeachIterator() ) {
|
|
||||||
SAMRecord read = p.getRead();
|
|
||||||
int readOffsetFromPileup = p.getOffset();
|
|
||||||
|
|
||||||
if ( DEBUG ) System.out.printf("--------------------------------------------- Read %s%n", read.getReadName());
|
|
||||||
double m = 10000000;
|
|
||||||
for ( int i = 0; i < haplotypePairs.size(); i++ ) {
|
|
||||||
Haplotype haplotype = haplotypePairs.get(i).getHaplotype();
|
|
||||||
int start = readOffsetFromPileup - (contextSize - 1)/2;
|
|
||||||
double score = scoreReadAgainstHaplotype(read, start, contextSize, haplotype);
|
|
||||||
haplotypeScores[p.getPileupOffset()][i] = score;
|
|
||||||
if ( DEBUG ) System.out.printf(" vs. haplotype %d = %f%n", i, score);
|
|
||||||
m = Math.min(score, m);
|
|
||||||
}
|
|
||||||
if ( DEBUG ) System.out.printf(" => best score was %f%n", m);
|
|
||||||
}
|
|
||||||
|
|
||||||
double overallScore = 0.0;
|
|
||||||
for ( double[] readHaplotypeScores : haplotypeScores ) {
|
|
||||||
overallScore += MathUtils.arrayMin(readHaplotypeScores);
|
|
||||||
}
|
|
||||||
|
|
||||||
return overallScore;
|
|
||||||
}
|
|
||||||
|
|
||||||
private double scoreReadAgainstHaplotype(SAMRecord read, int baseOffsetStart, int contextSize, Haplotype haplotype ) {
|
|
||||||
double expected = 0.0;
|
|
||||||
double mismatches = 0.0;
|
|
||||||
|
|
||||||
// What's the expected mismatch rate under the model that this read is actually sampled from
|
|
||||||
// this haplotype? Let's assume the consensus base c is a random choice one of A, C, G, or T, and that
|
|
||||||
// the observed base is actually from a c with an error rate e. Since e is the rate at which we'd
|
|
||||||
// see a miscalled c, the expected mismatch rate is really e. So the expected number of mismatches
|
|
||||||
// is just sum_i e_i for i from 1..n for n sites
|
|
||||||
//
|
|
||||||
// Now, what's the probabilistic sum of mismatches? Suppose that the base b is equal to c. Well, it could
|
|
||||||
// actually be a miscall in a matching direction, which would happen at a e / 3 rate. If b != c, then
|
|
||||||
// the chance that it is actually a mismatch is 1 - e, since any of the other 3 options would be a mismatch.
|
|
||||||
// so the probability-weighted mismatch rate is sum_i ( matched ? e_i / 3 : 1 - e_i ) for i = 1 ... n
|
|
||||||
for ( int i = 0; i < contextSize; i++ ) {
|
|
||||||
int baseOffset = i + baseOffsetStart;
|
|
||||||
if ( baseOffset < 0 )
|
|
||||||
continue;
|
|
||||||
if ( baseOffset >= read.getReadLength() )
|
|
||||||
break;
|
|
||||||
|
|
||||||
byte haplotypeBase = haplotype.bases[i];
|
|
||||||
byte readBase = read.getReadBases()[baseOffset];
|
|
||||||
|
|
||||||
boolean matched = BaseUtils.basesAreEqual(readBase, haplotypeBase );
|
|
||||||
double e = QualityUtils.qualToErrorProb(read.getBaseQualities()[baseOffset]);
|
|
||||||
expected += e;
|
|
||||||
mismatches += matched ? e : 1 - e / 3;
|
|
||||||
|
|
||||||
// a more sophisticated calculation would include the reference quality, but it's nice to actually penalize
|
|
||||||
// the mismatching of poorly determined regions of the consensus
|
|
||||||
|
|
||||||
if ( DEBUG ) System.out.printf("Read %s: scoring %c vs. %c => e = %f Q%d esum %f vs. msum %f%n",
|
|
||||||
read.getReadName(), (char)haplotypeBase, (char)readBase, e, read.getBaseQualities()[baseOffset], expected, mismatches);
|
|
||||||
}
|
|
||||||
|
|
||||||
return mismatches - expected;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private static final double[] FLAT_BASE_PRIORS = new double[BaseUtils.Base.values().length];
|
|
||||||
static {
|
|
||||||
for ( int i = 0; i < BaseUtils.Base.values().length; i++ )
|
|
||||||
FLAT_BASE_PRIORS[i] = Math.log10(1.0 / BaseUtils.Base.values().length);
|
|
||||||
}
|
|
||||||
|
|
||||||
private class Haplotype {
|
|
||||||
byte[] bases = null;
|
|
||||||
byte[] quals = null;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Create a simple consensus sequence with provided bases and a uniform quality over all bases of qual
|
|
||||||
*
|
|
||||||
* @param bases
|
|
||||||
* @param qual
|
|
||||||
*/
|
|
||||||
Haplotype(byte[] bases, int qual) {
|
|
||||||
this.bases = bases;
|
|
||||||
quals = new byte[bases.length];
|
|
||||||
Arrays.fill(quals, (byte)qual);
|
|
||||||
}
|
|
||||||
|
|
||||||
public String toString() { return new String(this.bases); }
|
|
||||||
}
|
|
||||||
|
|
||||||
private static class HaplotypePair extends Pair<Haplotype,Integer> {
|
|
||||||
public HaplotypePair(Haplotype h, Integer i) {
|
|
||||||
super(h, i);
|
|
||||||
}
|
|
||||||
|
|
||||||
public Haplotype getHaplotype() { return getFirst(); }
|
|
||||||
public Integer getWeight() { return getSecond(); }
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
public List<String> getKeyNames() { return Arrays.asList("MyHaplotypeScore"); }
|
|
||||||
public List<VCFInfoHeaderLine> getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("MyHaplotypeScore", 1, VCFHeaderLineType.Float, "Consistency of the site with two (and only two) segregating haplotypes")); }
|
|
||||||
}
|
|
||||||
Loading…
Reference in New Issue