From a1653f0c838fa6fbd1c5b782f9e44ff3a39453d6 Mon Sep 17 00:00:00 2001 From: delangel Date: Wed, 22 Dec 2010 02:38:06 +0000 Subject: [PATCH] Another major redo for indel genotyper: this time, add ability to do allele and variant discovery, and don't rely necessarily on external vcf's to provide candidate variants and alleles (e.g. by using IndelGenotyperV2). This has two major advantages: speed, and more fine-grained control of discovery process. Code is still under test and analysis but this version should be hopefully stable. Ability to genotype candidate variants from input vcf is retained and can be turned on by command line argument but is disabled by default. Code, by default, will build a consensus of the most common indel event at a pileup. If that consensus allele has a count bigger than N (=5 by default), we proceed to genotype by computing probabilistic realignment, AF distribution etc. and possibly emitting a call. Needed for this, also added ability to build haplotypes from list of alleles instead of from a variant context. 
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4893 348d0f76-0448-11de-a6fe-93d51630548a --- ...elGenotypeLikelihoodsCalculationModel.java | 268 +++++++++++++++--- .../genotyper/UnifiedArgumentCollection.java | 5 + .../indels/HaplotypeIndelErrorModel.java | 8 +- .../sting/utils/genotype/Haplotype.java | 22 +- 4 files changed, 254 insertions(+), 49 deletions(-) diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DindelGenotypeLikelihoodsCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DindelGenotypeLikelihoodsCalculationModel.java index e63b7c6ec..ba74f401a 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DindelGenotypeLikelihoodsCalculationModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DindelGenotypeLikelihoodsCalculationModel.java @@ -25,12 +25,19 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; +import net.sf.samtools.Cigar; +import net.sf.samtools.CigarElement; +import net.sf.samtools.SAMRecord; import org.apache.log4j.Logger; import org.broad.tribble.util.variantcontext.VariantContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.walkers.indels.HaplotypeIndelErrorModel; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.StingException; import org.broadinstitute.sting.utils.genotype.Haplotype; +import org.broadinstitute.sting.utils.pileup.ExtendedEventPileupElement; +import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; @@ -46,71 +53,251 @@ public class DindelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoo private final double 
alphaDeletionProbability = 1e-3; private final int HAPLOTYPE_SIZE = 80; + private final int minIndelCountForGenotyping; + private final boolean getAlleleListFromVCF; // todo - the following need to be exposed for command line argument control private final double indelHeterozygosity = 1.0/8000; boolean useFlatPriors = true; - boolean DEBUGOUT = false; - + private static final boolean DEBUGOUT = false; + private static final boolean DEBUG = false; private HaplotypeIndelErrorModel model; - private ArrayList sitesVisited; - + private GenomeLoc lastSiteVisited; + private List alleleList; protected DindelGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) { super(UAC, logger); model = new HaplotypeIndelErrorModel(maxReadDeletionLength, insertionStartProbability, insertionEndProbability, alphaDeletionProbability, HAPLOTYPE_SIZE, false, DEBUGOUT); - sitesVisited = new ArrayList(); + alleleList = new ArrayList(); + getAlleleListFromVCF = UAC.GET_ALLELES_FROM_VCF; + minIndelCountForGenotyping = UAC.MIN_INDEL_COUNT_FOR_GENOTYPING; } + + private ArrayList computeConsensusAlleles(ReferenceContext ref, + Map contexts, + StratifiedAlignmentContext.StratifiedContextType contextType) { + Allele refAllele, altAllele; + GenomeLoc loc = ref.getLocus(); + ArrayList aList = new ArrayList(); + + if (DEBUG) { + System.out.println("'''''''''''''''''''''"); + System.out.println("Loc:"+loc.toString()); + } + HashMap consensusIndelStrings = new HashMap(); + + int insCount = 0, delCount = 0; + // quick check of total number of indels in pileup + for ( Map.Entry sample : contexts.entrySet() ) { + AlignmentContext context = sample.getValue().getContext(contextType); + + final ReadBackedExtendedEventPileup indelPileup = context.getExtendedEventPileup(); + insCount += indelPileup.getNumberOfInsertions(); + delCount += indelPileup.getNumberOfDeletions(); + } + + if (insCount < minIndelCountForGenotyping && delCount < minIndelCountForGenotyping) + return aList; + + for 
( Map.Entry sample : contexts.entrySet() ) { + AlignmentContext context = sample.getValue().getContext(contextType); + + final ReadBackedExtendedEventPileup indelPileup = context.getExtendedEventPileup(); + + + + + for ( ExtendedEventPileupElement p : indelPileup.toExtendedIterable() ) { + SAMRecord read = p.getRead(); + + if (DEBUG && p.isIndel()) { + System.out.format("Read: %s, cigar: %s, aln start: %d, aln end: %d, p.len:%d, Type:%s, EventBases:%s\n", + read.getReadName(),read.getCigar().toString(),read.getAlignmentStart(),read.getAlignmentEnd(), + p.getEventLength(),p.getType().toString(), p.getEventBases()); + } + + + String indelString = p.getEventBases(); + if (p.isInsertion()) { + boolean foundKey = false; + if (read.getAlignmentEnd() == loc.getStart()) { + // first corner condition: a read has an insertion at the end, and we're right at the insertion. + // In this case, the read could have any of the inserted bases and we need to build a consensus + for (String s : consensusIndelStrings.keySet()) { + int cnt = consensusIndelStrings.get(s); + if (s.startsWith(indelString)){ + // case 1: current insertion is prefix of indel in hash map + consensusIndelStrings.put(s,cnt+1); + foundKey = true; + break; + } + else if (indelString.startsWith(s)) { + // case 2: indel stored in hash table is prefix of current insertion + // In this case, new bases are new key. 
+ consensusIndelStrings.remove(s); + consensusIndelStrings.put(indelString,cnt+1); + foundKey = true; + break; + } + } + if (!foundKey) + // none of the above: event bases not supported by previous table, so add new key + consensusIndelStrings.put(indelString,1); + + } + else if (read.getAlignmentStart() == loc.getStart()+1) { + // opposite corner condition: read will start at current locus with an insertion + for (String s : consensusIndelStrings.keySet()) { + int cnt = consensusIndelStrings.get(s); + if (s.endsWith(indelString)){ + // case 1: current insertion is suffix of indel in hash map + consensusIndelStrings.put(s,cnt+1); + foundKey = true; + break; + } + else if (indelString.endsWith(s)) { + // case 2: indel stored in hash table is suffix of current insertion + // In this case, new bases are new key. + + consensusIndelStrings.remove(s); + consensusIndelStrings.put(indelString,cnt+1); + foundKey = true; + break; + } + } + if (!foundKey) + // none of the above: event bases not supported by previous table, so add new key + consensusIndelStrings.put(indelString,1); + + } + else { + // normal case: insertion somewhere in the middle of a read: add count to hash map + int cnt = consensusIndelStrings.containsKey(indelString)? consensusIndelStrings.get(indelString):0; + consensusIndelStrings.put(indelString,cnt+1); + } + + } + else if (p.isDeletion()) { + indelString = String.format("D%d",p.getEventLength()); + int cnt = consensusIndelStrings.containsKey(indelString)? 
consensusIndelStrings.get(indelString):0; + consensusIndelStrings.put(indelString,cnt+1); + + } + } + + if (DEBUG) { + int icount = indelPileup.getNumberOfInsertions(); + int dcount = indelPileup.getNumberOfDeletions(); + if (icount + dcount > 0) + { + List> eventStrings = indelPileup.getEventStringsWithCounts(ref.getBases()); + System.out.format("#ins: %d, #del:%d\n", insCount, delCount); + + for (int i=0 ; i < eventStrings.size() ; i++ ) { + System.out.format("%s:%d,",eventStrings.get(i).first,eventStrings.get(i).second); + // int k=0; + } + System.out.println(); + } + } + } + + int maxAlleleCnt = 0; + String bestAltAllele = ""; + for (String s : consensusIndelStrings.keySet()) { + int curCnt = consensusIndelStrings.get(s); + if (curCnt > maxAlleleCnt) { + maxAlleleCnt = curCnt; + bestAltAllele = s; + } + if (DEBUG) + System.out.format("Key:%s, number: %d\n",s,consensusIndelStrings.get(s) ); + } //gdebug- + + if (maxAlleleCnt < minIndelCountForGenotyping) + return aList; + + if (bestAltAllele.startsWith("D")) { + // get deletion length + int dLen = Integer.valueOf(bestAltAllele.substring(1)); + // get ref bases of accurate deletion + int startIdxInReference = (int)(1+loc.getStart()-ref.getWindow().getStart()); + + byte[] refBases = Arrays.copyOfRange(ref.getBases(),startIdxInReference,startIdxInReference+dLen); + refAllele = Allele.create(refBases,true); + altAllele = Allele.create(Allele.NULL_ALLELE_STRING, false); + } + else { + // insertion case + refAllele = Allele.create(Allele.NULL_ALLELE_STRING, true); + altAllele = Allele.create(bestAltAllele, false); + } + + aList.add(0,refAllele); + aList.add(1,altAllele); + + return aList; + + } public Allele getLikelihoods(RefMetaDataTracker tracker, ReferenceContext ref, Map contexts, StratifiedAlignmentContext.StratifiedContextType contextType, GenotypePriors priors, Map GLs, - Allele alternateAlleleToUse) { // TODO: use this instead of reading the 'indels' ROD from the tracker below + Allele alternateAlleleToUse) { 
if ( tracker == null ) return null; + + GenomeLoc loc = ref.getLocus(); + Allele refAllele, altAllele; VariantContext vc = null; - for( final VariantContext vc_input : tracker.getVariantContexts(ref, "indels", null, ref.getLocus(), false, false) ) { - if( vc_input != null && vc_input.isIndel() && ref.getLocus().getStart() == vc_input.getStart()) { - vc = vc_input; - break; + if (!ref.getLocus().equals(lastSiteVisited)) { + // starting a new site: clear allele list + alleleList.clear(); + lastSiteVisited = ref.getLocus().clone(); + + + if (getAlleleListFromVCF) { + + for( final VariantContext vc_input : tracker.getVariantContexts(ref, "indels", null, ref.getLocus(), false, false) ) { + if( vc_input != null && vc_input.isIndel() && ref.getLocus().getStart() == vc_input.getStart()) { + vc = vc_input; + break; + } + } + // ignore places where we don't have a variant + if ( vc == null ) + return null; + + + if (!vc.isIndel()) + return null; + + alleleList.clear(); + for (Allele a : vc.getAlleles()) + alleleList.add(a); + } - } - // ignore places where we don't have a variant - if ( vc == null ) - return null; - - - if (!vc.isIndel()) - return null; - - boolean visitedBefore = false; - synchronized (this) { - - if (sitesVisited.contains(new Integer(vc.getStart())) && - contextType.equals(StratifiedAlignmentContext.StratifiedContextType.COMPLETE)) - visitedBefore = true; else { - sitesVisited.add(new Integer(vc.getStart())); + alleleList = computeConsensusAlleles(ref,contexts, contextType); + if (alleleList.isEmpty()) + return null; } } - - if (visitedBefore) - return null; - // protect against having an indel too close to the edge of a contig - if (vc.getStart() <= HAPLOTYPE_SIZE) + if (loc.getStart() <= HAPLOTYPE_SIZE) return null; - + if ( !(priors instanceof DiploidIndelGenotypePriors) ) - throw new StingException("Only diploid-based Indel priors are supported in the DINDEL GL model"); + throw new StingException("Only diploid-based Indel priors are supported in the 
DINDEL GL model"); - - int eventLength = vc.getReference().getBaseString().length() - vc.getAlternateAllele(0).getBaseString().length(); + refAllele = alleleList.get(0); + altAllele = alleleList.get(1); + int eventLength = refAllele.getBaseString().length() - altAllele.getBaseString().length(); // assume only one alt allele for now if (eventLength<0) eventLength = - eventLength; @@ -118,7 +305,8 @@ public class DindelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoo int currentHaplotypeSize = HAPLOTYPE_SIZE; // int numSamples = getNSamples(contexts); - List haplotypesInVC = Haplotype.makeHaplotypeListFromVariantContextAlleles( vc, ref, currentHaplotypeSize); + List haplotypesInVC = Haplotype.makeHaplotypeListFromAlleles( alleleList, loc.getStart(), + ref, currentHaplotypeSize); // For each sample, get genotype likelihoods based on pileup // compute prior likelihoods on haplotypes, and initialize haplotype likelihood matrix with them. // initialize the GenotypeLikelihoods @@ -144,14 +332,14 @@ public class DindelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoo pileup = context.getBasePileup(); if (pileup != null ) { - haplotypeLikehoodMatrix = model.computeReadHaplotypeLikelihoods( pileup, haplotypesInVC, vc, eventLength); + haplotypeLikehoodMatrix = model.computeReadHaplotypeLikelihoods( pileup, haplotypesInVC); double[] genotypeLikelihoods = HaplotypeIndelErrorModel.getHaplotypeLikelihoods( haplotypeLikehoodMatrix); GLs.put(sample.getKey(), new BiallelicGenotypeLikelihoods(sample.getKey(), - vc.getReference(), - vc.getAlternateAllele(0), + refAllele, + altAllele, genotypeLikelihoods[0], genotypeLikelihoods[1], genotypeLikelihoods[2], @@ -160,6 +348,6 @@ public class DindelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoo } } - return vc.getReference(); + return refAllele; } } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java 
b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java index 9f35eb157..50e326346 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java @@ -86,6 +86,11 @@ public class UnifiedArgumentCollection { @Argument(fullName = "max_deletion_fraction", shortName = "deletions", doc = "Maximum fraction of reads with deletions spanning this locus for it to be callable [to disable, set to < 0 or > 1; default:0.05]", required = false) public Double MAX_DELETION_FRACTION = 0.05; + @Argument(fullName = "get_indel_alleles_from_vcf", shortName = "getIndelAllelesFromVCF", doc = "Get reference/alt alleles for indel genotyping from VCF", required = false) + public boolean GET_ALLELES_FROM_VCF = false; + + @Argument(fullName = "min_indel_count_for_genotyping", shortName = "minIndelCnt", doc = "Minimum number of consensus indels required to trigger genotyping run", required = false) + public int MIN_INDEL_COUNT_FOR_GENOTYPING = 5; public UnifiedArgumentCollection clone() { UnifiedArgumentCollection uac = new UnifiedArgumentCollection(); diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java index ed945878d..4ef8214cc 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java @@ -132,8 +132,7 @@ public class HaplotypeIndelErrorModel { return -10.0*Math.log10(prob); } - public double computeReadLikelihoodGivenHaplotype(Haplotype haplotype, SAMRecord read, - VariantContext vc, int eventLength) { + public double computeReadLikelihoodGivenHaplotype(Haplotype haplotype, SAMRecord read) { long numStartClippedBases = 0; long numEndClippedBases = 0; @@ -419,8 +418,7 @@ 
public class HaplotypeIndelErrorModel { } - public double[][] computeReadHaplotypeLikelihoods(ReadBackedPileup pileup, List haplotypesInVC, - VariantContext vc, int eventLength){ + public double[][] computeReadHaplotypeLikelihoods(ReadBackedPileup pileup, List haplotypesInVC){ double[][] haplotypeLikehoodMatrix = new double[haplotypesInVC.size()][haplotypesInVC.size()]; double readLikelihoods[][] = new double[pileup.getReads().size()][haplotypesInVC.size()]; int i=0; @@ -429,7 +427,7 @@ public class HaplotypeIndelErrorModel { // for each read/haplotype combination, compute likelihoods, ie -10*log10(Pr(R | Hi)) // = sum_j(-10*log10(Pr(R_j | Hi) since reads are assumed to be independent for (int j=0; j < haplotypesInVC.size(); j++) { - readLikelihoods[i][j]= computeReadLikelihoodGivenHaplotype(haplotypesInVC.get(j), read, vc, eventLength); + readLikelihoods[i][j]= computeReadLikelihoodGivenHaplotype(haplotypesInVC.get(j), read); if (DEBUG) { System.out.print(read.getReadName()+" "); diff --git a/java/src/org/broadinstitute/sting/utils/genotype/Haplotype.java b/java/src/org/broadinstitute/sting/utils/genotype/Haplotype.java index 85c505935..675cef5ff 100755 --- a/java/src/org/broadinstitute/sting/utils/genotype/Haplotype.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/Haplotype.java @@ -29,6 +29,7 @@ import org.broad.tribble.util.variantcontext.VariantContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import java.util.ArrayList; import java.util.Arrays; @@ -95,22 +96,35 @@ public class Haplotype { return isReference; } - public static List makeHaplotypeListFromVariantContextAlleles(VariantContext vc, ReferenceContext ref, final int haplotypeSize) { + public static List makeHaplotypeListFromAlleles(List alleleList, int startPos, ReferenceContext ref, final int 
haplotypeSize) { List haplotypeList = new ArrayList(); + Allele refAllele = null; + + for (Allele a:alleleList) { + if (a.isReference()) { + refAllele = a; + break; + } + + } + + if (refAllele == null) + throw new ReviewedStingException("BUG: no ref alleles in input to makeHaplotypeListfrom Alleles at loc: "+ startPos); + byte[] refBases = ref.getBases(); int numPrefBases = LEFT_WINDOW_SIZE; - int startIdxInReference = (int)(1+vc.getStart()-numPrefBases-ref.getWindow().getStart()); + int startIdxInReference = (int)(1+startPos-numPrefBases-ref.getWindow().getStart()); //int numPrefBases = (int)(vc.getStart()-ref.getWindow().getStart()+1); // indel vc starts one before event byte[] basesBeforeVariant = Arrays.copyOfRange(refBases,startIdxInReference,startIdxInReference+numPrefBases); byte[] basesAfterVariant = Arrays.copyOfRange(refBases, - startIdxInReference+numPrefBases+vc.getReference().getBases().length, refBases.length); + startIdxInReference+numPrefBases+ refAllele.getBases().length, refBases.length); // Create location for all haplotypes @@ -120,7 +134,7 @@ public class Haplotype { GenomeLoc locus = ref.getGenomeLocParser().createGenomeLoc(ref.getLocus().getContig(),startLoc,stopLoc); - for (Allele a : vc.getAlleles()) { + for (Allele a : alleleList) { byte[] alleleBases = a.getBases(); // use string concatenation