diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DindelGenotypeLikelihoodsCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DindelGenotypeLikelihoodsCalculationModel.java index e63b7c6ec..ba74f401a 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DindelGenotypeLikelihoodsCalculationModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DindelGenotypeLikelihoodsCalculationModel.java @@ -25,12 +25,19 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; +import net.sf.samtools.Cigar; +import net.sf.samtools.CigarElement; +import net.sf.samtools.SAMRecord; import org.apache.log4j.Logger; import org.broad.tribble.util.variantcontext.VariantContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.walkers.indels.HaplotypeIndelErrorModel; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.StingException; import org.broadinstitute.sting.utils.genotype.Haplotype; +import org.broadinstitute.sting.utils.pileup.ExtendedEventPileupElement; +import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; @@ -46,71 +53,251 @@ public class DindelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoo private final double alphaDeletionProbability = 1e-3; private final int HAPLOTYPE_SIZE = 80; + private final int minIndelCountForGenotyping; + private final boolean getAlleleListFromVCF; // todo - the following need to be exposed for command line argument control private final double indelHeterozygosity = 1.0/8000; boolean useFlatPriors = true; - boolean DEBUGOUT = false; - + private static final boolean DEBUGOUT = false; + private static final boolean DEBUG = false; private HaplotypeIndelErrorModel model; - private ArrayList sitesVisited; - + private GenomeLoc lastSiteVisited; + private List alleleList; protected DindelGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) { super(UAC, logger); model = new HaplotypeIndelErrorModel(maxReadDeletionLength, insertionStartProbability, insertionEndProbability, alphaDeletionProbability, HAPLOTYPE_SIZE, false, DEBUGOUT); - sitesVisited = new ArrayList(); + alleleList = new ArrayList(); + getAlleleListFromVCF = UAC.GET_ALLELES_FROM_VCF; + minIndelCountForGenotyping = UAC.MIN_INDEL_COUNT_FOR_GENOTYPING; } + + private ArrayList computeConsensusAlleles(ReferenceContext ref, + Map contexts, + StratifiedAlignmentContext.StratifiedContextType contextType) { + Allele refAllele, altAllele; + GenomeLoc loc = ref.getLocus(); + ArrayList aList = new ArrayList(); + + if (DEBUG) { + System.out.println("'''''''''''''''''''''"); + System.out.println("Loc:"+loc.toString()); + } + HashMap consensusIndelStrings = new HashMap(); + + int insCount = 0, delCount = 0; + // quick check of total number of indels in pileup + for ( Map.Entry sample : contexts.entrySet() ) { + AlignmentContext context = sample.getValue().getContext(contextType); + + final ReadBackedExtendedEventPileup indelPileup = context.getExtendedEventPileup(); + insCount += indelPileup.getNumberOfInsertions(); + delCount += indelPileup.getNumberOfDeletions(); + } + + if (insCount < minIndelCountForGenotyping && delCount < minIndelCountForGenotyping) + return aList; + + for ( Map.Entry sample : contexts.entrySet() ) { + AlignmentContext context = sample.getValue().getContext(contextType); + + final ReadBackedExtendedEventPileup indelPileup = context.getExtendedEventPileup(); + + + + + for ( ExtendedEventPileupElement p : indelPileup.toExtendedIterable() ) { + SAMRecord read = p.getRead(); + + if (DEBUG && p.isIndel()) { + System.out.format("Read: %s, cigar: %s, aln start: %d, aln end: %d, p.len:%d, Type:%s, EventBases:%s\n", + read.getReadName(),read.getCigar().toString(),read.getAlignmentStart(),read.getAlignmentEnd(), + p.getEventLength(),p.getType().toString(), p.getEventBases()); + } + + + String indelString = p.getEventBases(); + if (p.isInsertion()) { + boolean foundKey = false; + if (read.getAlignmentEnd() == loc.getStart()) { + // first corner condition: a read has an insertion at the end, and we're right at the insertion. + // In this case, the read could have any of the inserted bases and we need to build a consensus + for (String s : consensusIndelStrings.keySet()) { + int cnt = consensusIndelStrings.get(s); + if (s.startsWith(indelString)){ + // case 1: current insertion is prefix of indel in hash map + consensusIndelStrings.put(s,cnt+1); + foundKey = true; + break; + } + else if (indelString.startsWith(s)) { + // case 2: indel stored in hash table is prefix of current insertion + // In this case, new bases are new key. + consensusIndelStrings.remove(s); + consensusIndelStrings.put(indelString,cnt+1); + foundKey = true; + break; + } + } + if (!foundKey) + // none of the above: event bases not supported by previous table, so add new key + consensusIndelStrings.put(indelString,1); + + } + else if (read.getAlignmentStart() == loc.getStart()+1) { + // opposite corner condition: read will start at current locus with an insertion + for (String s : consensusIndelStrings.keySet()) { + int cnt = consensusIndelStrings.get(s); + if (s.endsWith(indelString)){ + // case 1: current insertion is suffix of indel in hash map + consensusIndelStrings.put(s,cnt+1); + foundKey = true; + break; + } + else if (indelString.endsWith(s)) { + // case 2: indel stored in hash table is suffix of current insertion + // In this case, new bases are new key. + + consensusIndelStrings.remove(s); + consensusIndelStrings.put(indelString,cnt+1); + foundKey = true; + break; + } + } + if (!foundKey) + // none of the above: event bases not supported by previous table, so add new key + consensusIndelStrings.put(indelString,1); + + } + else { + // normal case: insertion somewhere in the middle of a read: add count to hash map + int cnt = consensusIndelStrings.containsKey(indelString)? consensusIndelStrings.get(indelString):0; + consensusIndelStrings.put(indelString,cnt+1); + } + + } + else if (p.isDeletion()) { + indelString = String.format("D%d",p.getEventLength()); + int cnt = consensusIndelStrings.containsKey(indelString)? consensusIndelStrings.get(indelString):0; + consensusIndelStrings.put(indelString,cnt+1); + + } + } + + if (DEBUG) { + int icount = indelPileup.getNumberOfInsertions(); + int dcount = indelPileup.getNumberOfDeletions(); + if (icount + dcount > 0) + { + List> eventStrings = indelPileup.getEventStringsWithCounts(ref.getBases()); + System.out.format("#ins: %d, #del:%d\n", insCount, delCount); + + for (int i=0 ; i < eventStrings.size() ; i++ ) { + System.out.format("%s:%d,",eventStrings.get(i).first,eventStrings.get(i).second); + // int k=0; + } + System.out.println(); + } + } + } + + int maxAlleleCnt = 0; + String bestAltAllele = ""; + for (String s : consensusIndelStrings.keySet()) { + int curCnt = consensusIndelStrings.get(s); + if (curCnt > maxAlleleCnt) { + maxAlleleCnt = curCnt; + bestAltAllele = s; + } + if (DEBUG) + System.out.format("Key:%s, number: %d\n",s,consensusIndelStrings.get(s) ); + } //gdebug- + + if (maxAlleleCnt < minIndelCountForGenotyping) + return aList; + + if (bestAltAllele.startsWith("D")) { + // get deletion length + int dLen = Integer.valueOf(bestAltAllele.substring(1)); + // get ref bases of accurate deletion + int startIdxInReference = (int)(1+loc.getStart()-ref.getWindow().getStart()); + + byte[] refBases = Arrays.copyOfRange(ref.getBases(),startIdxInReference,startIdxInReference+dLen); + refAllele = Allele.create(refBases,true); + altAllele = Allele.create(Allele.NULL_ALLELE_STRING, false); + } + else { + // insertion case + refAllele = Allele.create(Allele.NULL_ALLELE_STRING, true); + altAllele = Allele.create(bestAltAllele, false); + } + + aList.add(0,refAllele); + aList.add(1,altAllele); + + return aList; + + } public Allele getLikelihoods(RefMetaDataTracker tracker, ReferenceContext ref, Map contexts, StratifiedAlignmentContext.StratifiedContextType contextType, GenotypePriors priors, Map GLs, - Allele alternateAlleleToUse) { // TODO: use this instead of reading the 'indels' ROD from the tracker below + Allele alternateAlleleToUse) { if ( tracker == null ) return null; + + GenomeLoc loc = ref.getLocus(); + Allele refAllele, altAllele; VariantContext vc = null; - for( final VariantContext vc_input : tracker.getVariantContexts(ref, "indels", null, ref.getLocus(), false, false) ) { - if( vc_input != null && vc_input.isIndel() && ref.getLocus().getStart() == vc_input.getStart()) { - vc = vc_input; - break; + if (!ref.getLocus().equals(lastSiteVisited)) { + // starting a new site: clear allele list + alleleList.clear(); + lastSiteVisited = ref.getLocus().clone(); + + + if (getAlleleListFromVCF) { + + for( final VariantContext vc_input : tracker.getVariantContexts(ref, "indels", null, ref.getLocus(), false, false) ) { + if( vc_input != null && vc_input.isIndel() && ref.getLocus().getStart() == vc_input.getStart()) { + vc = vc_input; + break; + } + } + // ignore places where we don't have a variant + if ( vc == null ) + return null; + + + if (!vc.isIndel()) + return null; + + alleleList.clear(); + for (Allele a : vc.getAlleles()) + alleleList.add(a); + } - } - // ignore places where we don't have a variant - if ( vc == null ) - return null; - - - if (!vc.isIndel()) - return null; - - boolean visitedBefore = false; - synchronized (this) { - - if (sitesVisited.contains(new Integer(vc.getStart())) && - contextType.equals(StratifiedAlignmentContext.StratifiedContextType.COMPLETE)) - visitedBefore = true; else { - sitesVisited.add(new Integer(vc.getStart())); + alleleList = computeConsensusAlleles(ref,contexts, contextType); + if (alleleList.isEmpty()) + return null; } } - - if (visitedBefore) - return null; - // protect against having an indel too close to the edge of a contig - if (vc.getStart() <= HAPLOTYPE_SIZE) + if (loc.getStart() <= HAPLOTYPE_SIZE) return null; - + if ( !(priors instanceof DiploidIndelGenotypePriors) ) - throw new StingException("Only diploid-based Indel priors are supported in the DINDEL GL model"); + throw new StingException("Only diploid-based Indel priors are supported in the DINDEL GL model"); - - int eventLength = vc.getReference().getBaseString().length() - vc.getAlternateAllele(0).getBaseString().length(); + refAllele = alleleList.get(0); + altAllele = alleleList.get(1); + int eventLength = refAllele.getBaseString().length() - altAllele.getBaseString().length(); // assume only one alt allele for now if (eventLength<0) eventLength = - eventLength; @@ -118,7 +305,8 @@ public class DindelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoo int currentHaplotypeSize = HAPLOTYPE_SIZE; // int numSamples = getNSamples(contexts); - List haplotypesInVC = Haplotype.makeHaplotypeListFromVariantContextAlleles( vc, ref, currentHaplotypeSize); + List haplotypesInVC = Haplotype.makeHaplotypeListFromAlleles( alleleList, loc.getStart(), + ref, currentHaplotypeSize); // For each sample, get genotype likelihoods based on pileup // compute prior likelihoods on haplotypes, and initialize haplotype likelihood matrix with them. // initialize the GenotypeLikelihoods @@ -144,14 +332,14 @@ public class DindelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoo pileup = context.getBasePileup(); if (pileup != null ) { - haplotypeLikehoodMatrix = model.computeReadHaplotypeLikelihoods( pileup, haplotypesInVC, vc, eventLength); + haplotypeLikehoodMatrix = model.computeReadHaplotypeLikelihoods( pileup, haplotypesInVC); double[] genotypeLikelihoods = HaplotypeIndelErrorModel.getHaplotypeLikelihoods( haplotypeLikehoodMatrix); GLs.put(sample.getKey(), new BiallelicGenotypeLikelihoods(sample.getKey(), - vc.getReference(), - vc.getAlternateAllele(0), + refAllele, + altAllele, genotypeLikelihoods[0], genotypeLikelihoods[1], genotypeLikelihoods[2], @@ -160,6 +348,6 @@ public class DindelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoo } } - return vc.getReference(); + return refAllele; } } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java index 9f35eb157..50e326346 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java @@ -86,6 +86,11 @@ public class UnifiedArgumentCollection { @Argument(fullName = "max_deletion_fraction", shortName = "deletions", doc = "Maximum fraction of reads with deletions spanning this locus for it to be callable [to disable, set to < 0 or > 1; default:0.05]", required = false) public Double MAX_DELETION_FRACTION = 0.05; + @Argument(fullName = "get_indel_alleles_from_vcf", shortName = "getIndelAllelesFromVCF", doc = "Get reference/alt alleles for indel genotyping from VCF", required = false) + public boolean GET_ALLELES_FROM_VCF = false; + + @Argument(fullName = "min_indel_count_for_genotyping", shortName = "minIndelCnt", doc = "Minimum number of consensus indels required to trigger genotyping run", required = false) + public int MIN_INDEL_COUNT_FOR_GENOTYPING = 5; public UnifiedArgumentCollection clone() { UnifiedArgumentCollection uac = new UnifiedArgumentCollection(); diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java index ed945878d..4ef8214cc 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java @@ -132,8 +132,7 @@ public class HaplotypeIndelErrorModel { return -10.0*Math.log10(prob); } - public double computeReadLikelihoodGivenHaplotype(Haplotype haplotype, SAMRecord read, - VariantContext vc, int eventLength) { + public double computeReadLikelihoodGivenHaplotype(Haplotype haplotype, SAMRecord read) { long numStartClippedBases = 0; long numEndClippedBases = 0; @@ -419,8 +418,7 @@ public class HaplotypeIndelErrorModel { } - public double[][] computeReadHaplotypeLikelihoods(ReadBackedPileup pileup, List haplotypesInVC, - VariantContext vc, int eventLength){ + public double[][] computeReadHaplotypeLikelihoods(ReadBackedPileup pileup, List haplotypesInVC){ double[][] haplotypeLikehoodMatrix = new double[haplotypesInVC.size()][haplotypesInVC.size()]; double readLikelihoods[][] = new double[pileup.getReads().size()][haplotypesInVC.size()]; int i=0; @@ -429,7 +427,7 @@ public class HaplotypeIndelErrorModel { // for each read/haplotype combination, compute likelihoods, ie -10*log10(Pr(R | Hi)) // = sum_j(-10*log10(Pr(R_j | Hi) since reads are assumed to be independent for (int j=0; j < haplotypesInVC.size(); j++) { - readLikelihoods[i][j]= computeReadLikelihoodGivenHaplotype(haplotypesInVC.get(j), read, vc, eventLength); + readLikelihoods[i][j]= computeReadLikelihoodGivenHaplotype(haplotypesInVC.get(j), read); if (DEBUG) { System.out.print(read.getReadName()+" "); diff --git a/java/src/org/broadinstitute/sting/utils/genotype/Haplotype.java b/java/src/org/broadinstitute/sting/utils/genotype/Haplotype.java index 85c505935..675cef5ff 100755 --- a/java/src/org/broadinstitute/sting/utils/genotype/Haplotype.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/Haplotype.java @@ -29,6 +29,7 @@ import org.broad.tribble.util.variantcontext.VariantContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import java.util.ArrayList; import java.util.Arrays; @@ -95,22 +96,35 @@ public class Haplotype { return isReference; } - public static List makeHaplotypeListFromVariantContextAlleles(VariantContext vc, ReferenceContext ref, final int haplotypeSize) { + public static List makeHaplotypeListFromAlleles(List alleleList, int startPos, ReferenceContext ref, final int haplotypeSize) { List haplotypeList = new ArrayList(); + Allele refAllele = null; + + for (Allele a:alleleList) { + if (a.isReference()) { + refAllele = a; + break; + } + + } + + if (refAllele == null) + throw new ReviewedStingException("BUG: no ref alleles in input to makeHaplotypeListfrom Alleles at loc: "+ startPos); + byte[] refBases = ref.getBases(); int numPrefBases = LEFT_WINDOW_SIZE; - int startIdxInReference = (int)(1+vc.getStart()-numPrefBases-ref.getWindow().getStart()); + int startIdxInReference = (int)(1+startPos-numPrefBases-ref.getWindow().getStart()); //int numPrefBases = (int)(vc.getStart()-ref.getWindow().getStart()+1); // indel vc starts one before event byte[] basesBeforeVariant = Arrays.copyOfRange(refBases,startIdxInReference,startIdxInReference+numPrefBases); byte[] basesAfterVariant = Arrays.copyOfRange(refBases, - startIdxInReference+numPrefBases+vc.getReference().getBases().length, refBases.length); + startIdxInReference+numPrefBases+ refAllele.getBases().length, refBases.length); // Create location for all haplotypes @@ -120,7 +134,7 @@ public class Haplotype { GenomeLoc locus = ref.getGenomeLocParser().createGenomeLoc(ref.getLocus().getContig(),startLoc,stopLoc); - for (Allele a : vc.getAlleles()) { + for (Allele a : alleleList) { byte[] alleleBases = a.getBases(); // use string concatenation