Another major redo for indel genotyper: this time, add the ability to do allele and variant discovery, and don't necessarily rely on external VCFs to provide candidate variants and alleles (e.g. by using IndelGenotyperV2). This has two major advantages: speed, and more fine-grained control of the discovery process. Code is still under test and analysis, but this version should hopefully be stable.
The ability to genotype candidate variants from an input VCF is retained and can be turned on by a command line argument, but is disabled by default. By default, the code will build a consensus of the most common indel event at a pileup. If that consensus allele has a count bigger than N (=5 by default), we proceed to genotype by computing probabilistic realignment, AF distribution, etc., and possibly emitting a call. To support this, also added the ability to build haplotypes from a list of alleles instead of from a variant context. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4893 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
cd4883f4da
commit
a1653f0c83
|
|
@ -25,12 +25,19 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import net.sf.samtools.Cigar;
|
||||
import net.sf.samtools.CigarElement;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.walkers.indels.HaplotypeIndelErrorModel;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.exceptions.StingException;
|
||||
import org.broadinstitute.sting.utils.genotype.Haplotype;
|
||||
import org.broadinstitute.sting.utils.pileup.ExtendedEventPileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
|
|
@ -46,71 +53,251 @@ public class DindelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoo
|
|||
private final double alphaDeletionProbability = 1e-3;
|
||||
private final int HAPLOTYPE_SIZE = 80;
|
||||
|
||||
private final int minIndelCountForGenotyping;
|
||||
private final boolean getAlleleListFromVCF;
|
||||
// todo - the following need to be exposed for command line argument control
|
||||
private final double indelHeterozygosity = 1.0/8000;
|
||||
boolean useFlatPriors = true;
|
||||
boolean DEBUGOUT = false;
|
||||
|
||||
private static final boolean DEBUGOUT = false;
|
||||
private static final boolean DEBUG = false;
|
||||
private HaplotypeIndelErrorModel model;
|
||||
private ArrayList<Integer> sitesVisited;
|
||||
|
||||
private GenomeLoc lastSiteVisited;
|
||||
private List<Allele> alleleList;
|
||||
protected DindelGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) {
|
||||
super(UAC, logger);
|
||||
model = new HaplotypeIndelErrorModel(maxReadDeletionLength, insertionStartProbability,
|
||||
insertionEndProbability, alphaDeletionProbability, HAPLOTYPE_SIZE, false, DEBUGOUT);
|
||||
sitesVisited = new ArrayList<Integer>();
|
||||
alleleList = new ArrayList<Allele>();
|
||||
getAlleleListFromVCF = UAC.GET_ALLELES_FROM_VCF;
|
||||
minIndelCountForGenotyping = UAC.MIN_INDEL_COUNT_FOR_GENOTYPING;
|
||||
}
|
||||
|
||||
|
||||
private ArrayList<Allele> computeConsensusAlleles(ReferenceContext ref,
|
||||
Map<String, StratifiedAlignmentContext> contexts,
|
||||
StratifiedAlignmentContext.StratifiedContextType contextType) {
|
||||
Allele refAllele, altAllele;
|
||||
GenomeLoc loc = ref.getLocus();
|
||||
ArrayList<Allele> aList = new ArrayList<Allele>();
|
||||
|
||||
if (DEBUG) {
|
||||
System.out.println("'''''''''''''''''''''");
|
||||
System.out.println("Loc:"+loc.toString());
|
||||
}
|
||||
HashMap<String,Integer> consensusIndelStrings = new HashMap<String,Integer>();
|
||||
|
||||
int insCount = 0, delCount = 0;
|
||||
// quick check of total number of indels in pileup
|
||||
for ( Map.Entry<String, StratifiedAlignmentContext> sample : contexts.entrySet() ) {
|
||||
AlignmentContext context = sample.getValue().getContext(contextType);
|
||||
|
||||
final ReadBackedExtendedEventPileup indelPileup = context.getExtendedEventPileup();
|
||||
insCount += indelPileup.getNumberOfInsertions();
|
||||
delCount += indelPileup.getNumberOfDeletions();
|
||||
}
|
||||
|
||||
if (insCount < minIndelCountForGenotyping && delCount < minIndelCountForGenotyping)
|
||||
return aList;
|
||||
|
||||
for ( Map.Entry<String, StratifiedAlignmentContext> sample : contexts.entrySet() ) {
|
||||
AlignmentContext context = sample.getValue().getContext(contextType);
|
||||
|
||||
final ReadBackedExtendedEventPileup indelPileup = context.getExtendedEventPileup();
|
||||
|
||||
|
||||
|
||||
|
||||
for ( ExtendedEventPileupElement p : indelPileup.toExtendedIterable() ) {
|
||||
SAMRecord read = p.getRead();
|
||||
|
||||
if (DEBUG && p.isIndel()) {
|
||||
System.out.format("Read: %s, cigar: %s, aln start: %d, aln end: %d, p.len:%d, Type:%s, EventBases:%s\n",
|
||||
read.getReadName(),read.getCigar().toString(),read.getAlignmentStart(),read.getAlignmentEnd(),
|
||||
p.getEventLength(),p.getType().toString(), p.getEventBases());
|
||||
}
|
||||
|
||||
|
||||
String indelString = p.getEventBases();
|
||||
if (p.isInsertion()) {
|
||||
boolean foundKey = false;
|
||||
if (read.getAlignmentEnd() == loc.getStart()) {
|
||||
// first corner condition: a read has an insertion at the end, and we're right at the insertion.
|
||||
// In this case, the read could have any of the inserted bases and we need to build a consensus
|
||||
for (String s : consensusIndelStrings.keySet()) {
|
||||
int cnt = consensusIndelStrings.get(s);
|
||||
if (s.startsWith(indelString)){
|
||||
// case 1: current insertion is prefix of indel in hash map
|
||||
consensusIndelStrings.put(s,cnt+1);
|
||||
foundKey = true;
|
||||
break;
|
||||
}
|
||||
else if (indelString.startsWith(s)) {
|
||||
// case 2: indel stored in hash table is prefix of current insertion
|
||||
// In this case, new bases are new key.
|
||||
consensusIndelStrings.remove(s);
|
||||
consensusIndelStrings.put(indelString,cnt+1);
|
||||
foundKey = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!foundKey)
|
||||
// none of the above: event bases not supported by previous table, so add new key
|
||||
consensusIndelStrings.put(indelString,1);
|
||||
|
||||
}
|
||||
else if (read.getAlignmentStart() == loc.getStart()+1) {
|
||||
// opposite corner condition: read will start at current locus with an insertion
|
||||
for (String s : consensusIndelStrings.keySet()) {
|
||||
int cnt = consensusIndelStrings.get(s);
|
||||
if (s.endsWith(indelString)){
|
||||
// case 1: current insertion is suffix of indel in hash map
|
||||
consensusIndelStrings.put(s,cnt+1);
|
||||
foundKey = true;
|
||||
break;
|
||||
}
|
||||
else if (indelString.endsWith(s)) {
|
||||
// case 2: indel stored in hash table is suffix of current insertion
|
||||
// In this case, new bases are new key.
|
||||
|
||||
consensusIndelStrings.remove(s);
|
||||
consensusIndelStrings.put(indelString,cnt+1);
|
||||
foundKey = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!foundKey)
|
||||
// none of the above: event bases not supported by previous table, so add new key
|
||||
consensusIndelStrings.put(indelString,1);
|
||||
|
||||
}
|
||||
else {
|
||||
// normal case: insertion somewhere in the middle of a read: add count to hash map
|
||||
int cnt = consensusIndelStrings.containsKey(indelString)? consensusIndelStrings.get(indelString):0;
|
||||
consensusIndelStrings.put(indelString,cnt+1);
|
||||
}
|
||||
|
||||
}
|
||||
else if (p.isDeletion()) {
|
||||
indelString = String.format("D%d",p.getEventLength());
|
||||
int cnt = consensusIndelStrings.containsKey(indelString)? consensusIndelStrings.get(indelString):0;
|
||||
consensusIndelStrings.put(indelString,cnt+1);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
if (DEBUG) {
|
||||
int icount = indelPileup.getNumberOfInsertions();
|
||||
int dcount = indelPileup.getNumberOfDeletions();
|
||||
if (icount + dcount > 0)
|
||||
{
|
||||
List<Pair<String,Integer>> eventStrings = indelPileup.getEventStringsWithCounts(ref.getBases());
|
||||
System.out.format("#ins: %d, #del:%d\n", insCount, delCount);
|
||||
|
||||
for (int i=0 ; i < eventStrings.size() ; i++ ) {
|
||||
System.out.format("%s:%d,",eventStrings.get(i).first,eventStrings.get(i).second);
|
||||
// int k=0;
|
||||
}
|
||||
System.out.println();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int maxAlleleCnt = 0;
|
||||
String bestAltAllele = "";
|
||||
for (String s : consensusIndelStrings.keySet()) {
|
||||
int curCnt = consensusIndelStrings.get(s);
|
||||
if (curCnt > maxAlleleCnt) {
|
||||
maxAlleleCnt = curCnt;
|
||||
bestAltAllele = s;
|
||||
}
|
||||
if (DEBUG)
|
||||
System.out.format("Key:%s, number: %d\n",s,consensusIndelStrings.get(s) );
|
||||
} //gdebug-
|
||||
|
||||
if (maxAlleleCnt < minIndelCountForGenotyping)
|
||||
return aList;
|
||||
|
||||
if (bestAltAllele.startsWith("D")) {
|
||||
// get deletion length
|
||||
int dLen = Integer.valueOf(bestAltAllele.substring(1));
|
||||
// get ref bases of accurate deletion
|
||||
int startIdxInReference = (int)(1+loc.getStart()-ref.getWindow().getStart());
|
||||
|
||||
byte[] refBases = Arrays.copyOfRange(ref.getBases(),startIdxInReference,startIdxInReference+dLen);
|
||||
refAllele = Allele.create(refBases,true);
|
||||
altAllele = Allele.create(Allele.NULL_ALLELE_STRING, false);
|
||||
}
|
||||
else {
|
||||
// insertion case
|
||||
refAllele = Allele.create(Allele.NULL_ALLELE_STRING, true);
|
||||
altAllele = Allele.create(bestAltAllele, false);
|
||||
}
|
||||
|
||||
aList.add(0,refAllele);
|
||||
aList.add(1,altAllele);
|
||||
|
||||
return aList;
|
||||
|
||||
}
|
||||
public Allele getLikelihoods(RefMetaDataTracker tracker,
|
||||
ReferenceContext ref,
|
||||
Map<String, StratifiedAlignmentContext> contexts,
|
||||
StratifiedAlignmentContext.StratifiedContextType contextType,
|
||||
GenotypePriors priors,
|
||||
Map<String, BiallelicGenotypeLikelihoods> GLs,
|
||||
Allele alternateAlleleToUse) { // TODO: use this instead of reading the 'indels' ROD from the tracker below
|
||||
Allele alternateAlleleToUse) {
|
||||
|
||||
if ( tracker == null )
|
||||
return null;
|
||||
|
||||
|
||||
GenomeLoc loc = ref.getLocus();
|
||||
Allele refAllele, altAllele;
|
||||
VariantContext vc = null;
|
||||
|
||||
for( final VariantContext vc_input : tracker.getVariantContexts(ref, "indels", null, ref.getLocus(), false, false) ) {
|
||||
if( vc_input != null && vc_input.isIndel() && ref.getLocus().getStart() == vc_input.getStart()) {
|
||||
vc = vc_input;
|
||||
break;
|
||||
if (!ref.getLocus().equals(lastSiteVisited)) {
|
||||
// starting a new site: clear allele list
|
||||
alleleList.clear();
|
||||
lastSiteVisited = ref.getLocus().clone();
|
||||
|
||||
|
||||
if (getAlleleListFromVCF) {
|
||||
|
||||
for( final VariantContext vc_input : tracker.getVariantContexts(ref, "indels", null, ref.getLocus(), false, false) ) {
|
||||
if( vc_input != null && vc_input.isIndel() && ref.getLocus().getStart() == vc_input.getStart()) {
|
||||
vc = vc_input;
|
||||
break;
|
||||
}
|
||||
}
|
||||
// ignore places where we don't have a variant
|
||||
if ( vc == null )
|
||||
return null;
|
||||
|
||||
|
||||
if (!vc.isIndel())
|
||||
return null;
|
||||
|
||||
alleleList.clear();
|
||||
for (Allele a : vc.getAlleles())
|
||||
alleleList.add(a);
|
||||
|
||||
}
|
||||
}
|
||||
// ignore places where we don't have a variant
|
||||
if ( vc == null )
|
||||
return null;
|
||||
|
||||
|
||||
if (!vc.isIndel())
|
||||
return null;
|
||||
|
||||
boolean visitedBefore = false;
|
||||
synchronized (this) {
|
||||
|
||||
if (sitesVisited.contains(new Integer(vc.getStart())) &&
|
||||
contextType.equals(StratifiedAlignmentContext.StratifiedContextType.COMPLETE))
|
||||
visitedBefore = true;
|
||||
else {
|
||||
sitesVisited.add(new Integer(vc.getStart()));
|
||||
alleleList = computeConsensusAlleles(ref,contexts, contextType);
|
||||
if (alleleList.isEmpty())
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
if (visitedBefore)
|
||||
return null;
|
||||
|
||||
// protect against having an indel too close to the edge of a contig
|
||||
if (vc.getStart() <= HAPLOTYPE_SIZE)
|
||||
if (loc.getStart() <= HAPLOTYPE_SIZE)
|
||||
return null;
|
||||
|
||||
|
||||
if ( !(priors instanceof DiploidIndelGenotypePriors) )
|
||||
throw new StingException("Only diploid-based Indel priors are supported in the DINDEL GL model");
|
||||
throw new StingException("Only diploid-based Indel priors are supported in the DINDEL GL model");
|
||||
|
||||
|
||||
int eventLength = vc.getReference().getBaseString().length() - vc.getAlternateAllele(0).getBaseString().length();
|
||||
refAllele = alleleList.get(0);
|
||||
altAllele = alleleList.get(1);
|
||||
int eventLength = refAllele.getBaseString().length() - altAllele.getBaseString().length();
|
||||
// assume only one alt allele for now
|
||||
if (eventLength<0)
|
||||
eventLength = - eventLength;
|
||||
|
|
@ -118,7 +305,8 @@ public class DindelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoo
|
|||
int currentHaplotypeSize = HAPLOTYPE_SIZE;
|
||||
|
||||
// int numSamples = getNSamples(contexts);
|
||||
List<Haplotype> haplotypesInVC = Haplotype.makeHaplotypeListFromVariantContextAlleles( vc, ref, currentHaplotypeSize);
|
||||
List<Haplotype> haplotypesInVC = Haplotype.makeHaplotypeListFromAlleles( alleleList, loc.getStart(),
|
||||
ref, currentHaplotypeSize);
|
||||
// For each sample, get genotype likelihoods based on pileup
|
||||
// compute prior likelihoods on haplotypes, and initialize haplotype likelihood matrix with them.
|
||||
// initialize the GenotypeLikelihoods
|
||||
|
|
@ -144,14 +332,14 @@ public class DindelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoo
|
|||
pileup = context.getBasePileup();
|
||||
|
||||
if (pileup != null ) {
|
||||
haplotypeLikehoodMatrix = model.computeReadHaplotypeLikelihoods( pileup, haplotypesInVC, vc, eventLength);
|
||||
haplotypeLikehoodMatrix = model.computeReadHaplotypeLikelihoods( pileup, haplotypesInVC);
|
||||
|
||||
|
||||
double[] genotypeLikelihoods = HaplotypeIndelErrorModel.getHaplotypeLikelihoods( haplotypeLikehoodMatrix);
|
||||
|
||||
GLs.put(sample.getKey(), new BiallelicGenotypeLikelihoods(sample.getKey(),
|
||||
vc.getReference(),
|
||||
vc.getAlternateAllele(0),
|
||||
refAllele,
|
||||
altAllele,
|
||||
genotypeLikelihoods[0],
|
||||
genotypeLikelihoods[1],
|
||||
genotypeLikelihoods[2],
|
||||
|
|
@ -160,6 +348,6 @@ public class DindelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoo
|
|||
}
|
||||
}
|
||||
|
||||
return vc.getReference();
|
||||
return refAllele;
|
||||
}
|
||||
}
|
||||
|
|
@ -86,6 +86,11 @@ public class UnifiedArgumentCollection {
|
|||
@Argument(fullName = "max_deletion_fraction", shortName = "deletions", doc = "Maximum fraction of reads with deletions spanning this locus for it to be callable [to disable, set to < 0 or > 1; default:0.05]", required = false)
|
||||
public Double MAX_DELETION_FRACTION = 0.05;
|
||||
|
||||
@Argument(fullName = "get_indel_alleles_from_vcf", shortName = "getIndelAllelesFromVCF", doc = "Get reference/alt alleles for indel genotyping from VCF", required = false)
|
||||
public boolean GET_ALLELES_FROM_VCF = false;
|
||||
|
||||
@Argument(fullName = "min_indel_count_for_genotyping", shortName = "minIndelCnt", doc = "Minimum number of consensus indels required to trigger genotyping run", required = false)
|
||||
public int MIN_INDEL_COUNT_FOR_GENOTYPING = 5;
|
||||
|
||||
public UnifiedArgumentCollection clone() {
|
||||
UnifiedArgumentCollection uac = new UnifiedArgumentCollection();
|
||||
|
|
|
|||
|
|
@ -132,8 +132,7 @@ public class HaplotypeIndelErrorModel {
|
|||
return -10.0*Math.log10(prob);
|
||||
}
|
||||
|
||||
public double computeReadLikelihoodGivenHaplotype(Haplotype haplotype, SAMRecord read,
|
||||
VariantContext vc, int eventLength) {
|
||||
public double computeReadLikelihoodGivenHaplotype(Haplotype haplotype, SAMRecord read) {
|
||||
|
||||
long numStartClippedBases = 0;
|
||||
long numEndClippedBases = 0;
|
||||
|
|
@ -419,8 +418,7 @@ public class HaplotypeIndelErrorModel {
|
|||
|
||||
}
|
||||
|
||||
public double[][] computeReadHaplotypeLikelihoods(ReadBackedPileup pileup, List<Haplotype> haplotypesInVC,
|
||||
VariantContext vc, int eventLength){
|
||||
public double[][] computeReadHaplotypeLikelihoods(ReadBackedPileup pileup, List<Haplotype> haplotypesInVC){
|
||||
double[][] haplotypeLikehoodMatrix = new double[haplotypesInVC.size()][haplotypesInVC.size()];
|
||||
double readLikelihoods[][] = new double[pileup.getReads().size()][haplotypesInVC.size()];
|
||||
int i=0;
|
||||
|
|
@ -429,7 +427,7 @@ public class HaplotypeIndelErrorModel {
|
|||
// for each read/haplotype combination, compute likelihoods, ie -10*log10(Pr(R | Hi))
|
||||
// = sum_j(-10*log10(Pr(R_j | Hi) since reads are assumed to be independent
|
||||
for (int j=0; j < haplotypesInVC.size(); j++) {
|
||||
readLikelihoods[i][j]= computeReadLikelihoodGivenHaplotype(haplotypesInVC.get(j), read, vc, eventLength);
|
||||
readLikelihoods[i][j]= computeReadLikelihoodGivenHaplotype(haplotypesInVC.get(j), read);
|
||||
if (DEBUG) {
|
||||
System.out.print(read.getReadName()+" ");
|
||||
|
||||
|
|
|
|||
|
|
@ -29,6 +29,7 @@ import org.broad.tribble.util.variantcontext.VariantContext;
|
|||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
|
|
@ -95,22 +96,35 @@ public class Haplotype {
|
|||
return isReference;
|
||||
}
|
||||
|
||||
public static List<Haplotype> makeHaplotypeListFromVariantContextAlleles(VariantContext vc, ReferenceContext ref, final int haplotypeSize) {
|
||||
public static List<Haplotype> makeHaplotypeListFromAlleles(List<Allele> alleleList, int startPos, ReferenceContext ref, final int haplotypeSize) {
|
||||
|
||||
|
||||
List<Haplotype> haplotypeList = new ArrayList<Haplotype>();
|
||||
|
||||
Allele refAllele = null;
|
||||
|
||||
for (Allele a:alleleList) {
|
||||
if (a.isReference()) {
|
||||
refAllele = a;
|
||||
break;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
if (refAllele == null)
|
||||
throw new ReviewedStingException("BUG: no ref alleles in input to makeHaplotypeListfrom Alleles at loc: "+ startPos);
|
||||
|
||||
byte[] refBases = ref.getBases();
|
||||
|
||||
int numPrefBases = LEFT_WINDOW_SIZE;
|
||||
|
||||
int startIdxInReference = (int)(1+vc.getStart()-numPrefBases-ref.getWindow().getStart());
|
||||
int startIdxInReference = (int)(1+startPos-numPrefBases-ref.getWindow().getStart());
|
||||
//int numPrefBases = (int)(vc.getStart()-ref.getWindow().getStart()+1); // indel vc starts one before event
|
||||
|
||||
|
||||
byte[] basesBeforeVariant = Arrays.copyOfRange(refBases,startIdxInReference,startIdxInReference+numPrefBases);
|
||||
byte[] basesAfterVariant = Arrays.copyOfRange(refBases,
|
||||
startIdxInReference+numPrefBases+vc.getReference().getBases().length, refBases.length);
|
||||
startIdxInReference+numPrefBases+ refAllele.getBases().length, refBases.length);
|
||||
|
||||
|
||||
// Create location for all haplotypes
|
||||
|
|
@ -120,7 +134,7 @@ public class Haplotype {
|
|||
GenomeLoc locus = ref.getGenomeLocParser().createGenomeLoc(ref.getLocus().getContig(),startLoc,stopLoc);
|
||||
|
||||
|
||||
for (Allele a : vc.getAlleles()) {
|
||||
for (Allele a : alleleList) {
|
||||
|
||||
byte[] alleleBases = a.getBases();
|
||||
// use string concatenation
|
||||
|
|
|
|||
Loading…
Reference in New Issue