Big UG refactoring and intermediate commit to support indels in pool caller (not done yet). Lots of code pulled out of long spaghetti-like functions and modularized to be easily shareable. Add functionality in ErrorModel to count indel matches/mismatches (but left part disabled so as not to change integration tests in this commit), add computation of pool genotype likelihoods for indels (not fully working yet in more realistic cases, only working in artificial nice pools). Lots of TBDs still, but existing UG and pool SNP functionality should be intact

This commit is contained in:
Guillermo del Angel 2012-06-13 11:14:44 -04:00
parent bb77aa88c3
commit aee66ab157
5 changed files with 147 additions and 108 deletions

View File

@ -89,7 +89,7 @@ public abstract class GenotypeLikelihoodsCalculationModel implements Cloneable {
* @param ref reference context
* @param contexts stratified alignment contexts
* @param contextType stratified context type
* @param alternateAllelesToUse the alternate allele to use, null if not set
* @param allAllelesToUse the alternate allele to use, null if not set
* @param useBAQedPileup should we use the BAQed pileup or the raw one?
* @param locParser Genome Loc Parser
* @return variant context where genotypes are no-called but with GLs
@ -98,7 +98,7 @@ public abstract class GenotypeLikelihoodsCalculationModel implements Cloneable {
final ReferenceContext ref,
final Map<String, AlignmentContext> contexts,
final AlignmentContextUtils.ReadOrientation contextType,
final List<Allele> alternateAllelesToUse,
final List<Allele> allAllelesToUse,
final boolean useBAQedPileup,
final GenomeLocParser locParser);

View File

@ -35,7 +35,6 @@ import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.Haplotype;
import org.broadinstitute.sting.utils.codecs.vcf.AbstractVCFCodec;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
@ -44,9 +43,7 @@ import org.broadinstitute.sting.utils.variantcontext.*;
import java.util.*;
public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel {
private final int HAPLOTYPE_SIZE;
private final boolean getAlleleListFromVCF;
private static final int HAPLOTYPE_SIZE = 80;
private boolean DEBUG = false;
private boolean ignoreSNPAllelesWhenGenotypingIndels = false;
@ -75,17 +72,15 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
super(UAC, logger);
pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY, UAC.INDEL_GAP_CONTINUATION_PENALTY,
UAC.OUTPUT_DEBUG_INDEL_INFO, !UAC.DONT_DO_BANDED_INDEL_COMPUTATION);
getAlleleListFromVCF = UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES;
HAPLOTYPE_SIZE = UAC.INDEL_HAPLOTYPE_SIZE;
DEBUG = UAC.OUTPUT_DEBUG_INDEL_INFO;
haplotypeMap = new LinkedHashMap<Allele, Haplotype>();
ignoreSNPAllelesWhenGenotypingIndels = UAC.IGNORE_SNP_ALLELES;
}
protected List<Allele> computeConsensusAlleles(ReferenceContext ref,
protected static List<Allele> computeConsensusAlleles(ReferenceContext ref,
Map<String, AlignmentContext> contexts,
AlignmentContextUtils.ReadOrientation contextType,
GenomeLocParser locParser) {
GenomeLocParser locParser, UnifiedArgumentCollection UAC) {
ConsensusAlleleCounter counter = new ConsensusAlleleCounter(locParser, true, UAC.MIN_INDEL_COUNT_FOR_GENOTYPING, UAC.MIN_INDEL_FRACTION_PER_SAMPLE);
return counter.computeConsensusAlleles(ref, contexts, contextType);
}
@ -96,7 +91,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
final ReferenceContext ref,
final Map<String, AlignmentContext> contexts,
final AlignmentContextUtils.ReadOrientation contextType,
final List<Allele> alternateAllelesToUse,
final List<Allele> allAllelesToUse,
final boolean useBAQedPileup,
final GenomeLocParser locParser) {
@ -104,95 +99,26 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
return null;
GenomeLoc loc = ref.getLocus();
Allele refAllele, altAllele;
VariantContext vc = null;
boolean allelesArePadded = true;
if (!ref.getLocus().equals(lastSiteVisited)) {
// starting a new site: clear allele list
alleleList.clear();
lastSiteVisited = ref.getLocus();
indelLikelihoodMap.set(new HashMap<PileupElement, LinkedHashMap<Allele, Double>>());
haplotypeMap.clear();
if (getAlleleListFromVCF) {
for (final VariantContext vc_input : tracker.getValues(UAC.alleles, loc)) {
if (vc_input != null &&
allowableTypes.contains(vc_input.getType()) &&
ref.getLocus().getStart() == vc_input.getStart()) {
vc = vc_input;
break;
}
}
// ignore places where we don't have a variant
if (vc == null)
return null;
alleleList.clear();
if (ignoreSNPAllelesWhenGenotypingIndels) {
// if there's an allele that has same length as the reference (i.e. a SNP or MNP), ignore it and don't genotype it
for (Allele a : vc.getAlleles())
if (a.isNonReference() && a.getBases().length == vc.getReference().getBases().length)
continue;
else
alleleList.add(a);
} else {
for (Allele a : vc.getAlleles())
alleleList.add(a);
}
if (vc.getReference().getBases().length == vc.getEnd()-vc.getStart()+1)
allelesArePadded = false;
} else {
alleleList = computeConsensusAlleles(ref, contexts, contextType, locParser);
if (alleleList.isEmpty())
return null;
}
alleleList = getInitialAlleleList(tracker, ref, contexts, contextType, locParser, UAC, ignoreSNPAllelesWhenGenotypingIndels);
if (alleleList.isEmpty())
return null;
}
// protect against having an indel too close to the edge of a contig
if (loc.getStart() <= HAPLOTYPE_SIZE)
getHaplotypeMapFromAlleles(alleleList, ref, loc, haplotypeMap);
if (haplotypeMap == null || haplotypeMap.isEmpty())
return null;
// check if there is enough reference window to create haplotypes (can be an issue at end of contigs)
if (ref.getWindow().getStop() < loc.getStop() + HAPLOTYPE_SIZE)
return null;
if (alleleList.isEmpty())
return null;
refAllele = alleleList.get(0);
altAllele = alleleList.get(1);
// look for alt allele that has biggest length distance to ref allele
int maxLenDiff = 0;
for (Allele a : alleleList) {
if (a.isNonReference()) {
int lenDiff = Math.abs(a.getBaseString().length() - refAllele.getBaseString().length());
if (lenDiff > maxLenDiff) {
maxLenDiff = lenDiff;
altAllele = a;
}
}
}
final int eventLength = altAllele.getBaseString().length() - refAllele.getBaseString().length();
final int hsize = ref.getWindow().size() - Math.abs(eventLength) - 1;
final int numPrefBases = ref.getLocus().getStart() - ref.getWindow().getStart() + 1;
if (hsize <= 0) {
logger.warn(String.format("Warning: event at location %s can't be genotyped, skipping", loc.toString()));
return null;
}
haplotypeMap = Haplotype.makeHaplotypeListFromAlleles(alleleList, loc.getStart(),
ref, hsize, numPrefBases);
// start making the VariantContext
// For all non-snp VC types, VC end location is just startLocation + length of ref allele including padding base.
int endLoc = loc.getStart() + refAllele.length()-1;
if (allelesArePadded)
endLoc++;
final int endLoc = computeEndLocation(alleleList, loc);
final int eventLength = getEventLength(alleleList);
final VariantContextBuilder builder = new VariantContextBuilder("UG_call", loc.getContig(), loc.getStart(), endLoc, alleleList).referenceBaseForIndel(ref.getBase());
// create the genotypes; no-call everyone for now
@ -209,10 +135,10 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
if (context.hasBasePileup()) {
final ReadBackedPileup pileup = context.getBasePileup();
if (pileup != null) {
final double[] genotypeLikelihoods = pairModel.computeReadHaplotypeLikelihoods(pileup, haplotypeMap, ref, eventLength, getIndelLikelihoodMap());
GenotypeLikelihoods likelihoods = GenotypeLikelihoods.fromLog10Likelihoods(genotypeLikelihoods);
final double[] genotypeLikelihoods = pairModel.computeDiploidReadHaplotypeLikelihoods(pileup, haplotypeMap, ref, eventLength, getIndelLikelihoodMap());
final GenotypeLikelihoods likelihoods = GenotypeLikelihoods.fromLog10Likelihoods(genotypeLikelihoods);
HashMap<String, Object> attributes = new HashMap<String, Object>();
final HashMap<String, Object> attributes = new HashMap<String, Object>();
attributes.put(VCFConstants.DEPTH_KEY, getFilteredDepth(pileup));
attributes.put(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, likelihoods);
genotypes.add(new Genotype(sample.getKey(), noCall, Genotype.NO_LOG10_PERROR, null, attributes, false));
@ -234,6 +160,103 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
return indelLikelihoodMap.get();
}
/**
 * Computes the (inclusive) end coordinate of the variant context built from the given alleles.
 *
 * For non-SNP records the end is start + reference-allele length - 1, plus one extra base
 * when the alleles carry a padding base. The padding detection is currently disabled (see
 * FIXME below), so the padded end is always returned.
 *
 * @param alleles allele list whose first element is taken to be the reference allele
 * @param loc     genomic location of the event start
 * @return the end coordinate to use when building the VariantContext
 */
public static int computeEndLocation(final List<Allele> alleles, final GenomeLoc loc) {
Allele refAllele = alleles.get(0);
// NOTE(review): hard-wired to true — the padding check below is disabled, so every
// caller currently gets the "padded" end coordinate.
boolean allelesArePadded = true;
int endLoc = loc.getStart() + refAllele.length()-1;
/* if (refAllele.getBases().length == vc.getEnd()-vc.getStart()+1)
allelesArePadded = false;
*/
// todo FIXME! The disabled check above depended on a VariantContext ("vc") that is no
// longer in scope in this static helper; until it is reinstated, alleles are always
// treated as padded.
if (allelesArePadded)
endLoc++;
return endLoc;
}
/**
 * Populates {@code haplotypeMap} with the haplotypes spanning the given alleles at {@code loc}.
 *
 * The map is emptied instead of being filled when no haplotypes can be built: the event lies
 * too close to the start of the contig, the reference window is too short to hold a full
 * haplotype (an issue at the end of contigs), or the allele list is empty.
 *
 * @param alleleList   alleles to build haplotypes for (first entry is the reference allele)
 * @param ref          reference context supplying the surrounding bases
 * @param loc          genomic location of the event
 * @param haplotypeMap output map: filled on success, cleared on any failure condition
 */
public static void getHaplotypeMapFromAlleles(final List<Allele> alleleList,
                                              final ReferenceContext ref,
                                              final GenomeLoc loc,
                                              final LinkedHashMap<Allele, Haplotype> haplotypeMap) {
    // Guard clauses: every failure mode empties the map so callers see "no haplotypes".
    // protect against having an indel too close to the edge of a contig
    if (loc.getStart() <= HAPLOTYPE_SIZE) {
        haplotypeMap.clear();
        return;
    }
    // check if there is enough reference window to create haplotypes (can be an issue at end of contigs)
    if (ref.getWindow().getStop() < loc.getStop() + HAPLOTYPE_SIZE) {
        haplotypeMap.clear();
        return;
    }
    if (alleleList.isEmpty()) {
        haplotypeMap.clear();
        return;
    }
    final int eventLength = getEventLength(alleleList);
    final int haplotypeSpan = ref.getWindow().size() - Math.abs(eventLength) - 1;
    final int numPrefBases = ref.getLocus().getStart() - ref.getWindow().getStart() + 1;
    haplotypeMap.putAll(Haplotype.makeHaplotypeListFromAlleles(alleleList, loc.getStart(),
            ref, haplotypeSpan, numPrefBases));
}
/**
 * Returns the signed length difference between the reference allele and the alternate allele
 * whose length differs most from it (positive for insertions, negative for deletions).
 *
 * @param alleleList allele list whose first element is the reference allele; must be non-empty
 * @return alt length minus ref length for the length-wise most extreme non-reference allele,
 *         or 0 when the list carries no non-reference allele of different length
 */
public static int getEventLength(List<Allele> alleleList) {
    final Allele refAllele = alleleList.get(0);
    // Default to the reference itself so a list without a usable alt yields an event
    // length of 0 instead of an IndexOutOfBoundsException (the original blindly read
    // index 1, which blows up on a single-allele list).
    Allele altAllele = refAllele;
    // look for the alt allele that has the biggest length distance to the ref allele
    int maxLenDiff = 0;
    for (Allele a : alleleList) {
        if (a.isNonReference()) {
            final int lenDiff = Math.abs(a.getBaseString().length() - refAllele.getBaseString().length());
            if (lenDiff > maxLenDiff) {
                maxLenDiff = lenDiff;
                altAllele = a;
            }
        }
    }
    return altAllele.getBaseString().length() - refAllele.getBaseString().length();
}
/**
 * Builds the starting allele list for indel genotyping at the current site.
 *
 * In GENOTYPE_GIVEN_ALLELES mode the alleles come from the user-supplied ROD binding: the
 * first VariantContext of an allowable type that starts exactly at this locus is used, and an
 * empty list is returned when none matches. In discovery mode, consensus alleles are computed
 * from the read pileups instead.
 *
 * @param tracker   ROD tracker used to look up user-provided alleles
 * @param ref       reference context at the site
 * @param contexts  per-sample alignment contexts
 * @param contextType stratified context type (forward/reverse/complete)
 * @param locParser genome loc parser
 * @param UAC       unified genotyper argument collection
 * @param ignoreSNPAllelesWhenGenotypingIndels when true, drop ref-length alt alleles (SNPs/MNPs)
 * @return the allele list to genotype; empty when nothing usable was found
 */
public static List<Allele> getInitialAlleleList(final RefMetaDataTracker tracker,
                                                final ReferenceContext ref,
                                                final Map<String, AlignmentContext> contexts,
                                                final AlignmentContextUtils.ReadOrientation contextType,
                                                final GenomeLocParser locParser,
                                                final UnifiedArgumentCollection UAC,
                                                final boolean ignoreSNPAllelesWhenGenotypingIndels) {
    if (UAC.GenotypingMode != GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES) {
        // discovery mode: derive candidate alleles directly from the pileups
        return IndelGenotypeLikelihoodsCalculationModel.computeConsensusAlleles(ref, contexts, contextType, locParser, UAC);
    }
    final List<Allele> alleles = new ArrayList<Allele>();
    VariantContext vc = null;
    for (final VariantContext candidate : tracker.getValues(UAC.alleles, ref.getLocus())) {
        final boolean usable = candidate != null &&
                allowableTypes.contains(candidate.getType()) &&
                ref.getLocus().getStart() == candidate.getStart();
        if (usable) {
            vc = candidate;
            break;
        }
    }
    // ignore places where we don't have a variant
    if (vc == null)
        return alleles;
    if (!ignoreSNPAllelesWhenGenotypingIndels) {
        alleles.addAll(vc.getAlleles());
    } else {
        // if there's an allele that has same length as the reference (i.e. a SNP or MNP),
        // ignore it and don't genotype it
        final int refLength = vc.getReference().getBases().length;
        for (final Allele a : vc.getAlleles()) {
            if (!(a.isNonReference() && a.getBases().length == refLength))
                alleles.add(a);
        }
    }
    return alleles;
}
// Overload function in GenotypeLikelihoodsCalculationModel so that, for an indel case, we consider a deletion as part of the pileup,
// so that per-sample DP will include deletions covering the event.
protected int getFilteredDepth(ReadBackedPileup pileup) {

View File

@ -62,7 +62,7 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
final ReferenceContext ref,
final Map<String, AlignmentContext> contexts,
final AlignmentContextUtils.ReadOrientation contextType,
final List<Allele> alternateAllelesToUse,
final List<Allele> allAllelesToUse,
final boolean useBAQedPileup,
final GenomeLocParser locParser) {
@ -70,11 +70,6 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
final int indexOfRefBase = BaseUtils.simpleBaseToBaseIndex(refBase);
final Allele refAllele = Allele.create(refBase, true);
// start making the VariantContext
final GenomeLoc loc = ref.getLocus();
final List<Allele> alleles = new ArrayList<Allele>();
alleles.add(refAllele);
final VariantContextBuilder builder = new VariantContextBuilder("UG_call", loc.getContig(), loc.getStart(), loc.getStop(), alleles);
// calculate the GLs
ArrayList<SampleGenotypeData> GLs = new ArrayList<SampleGenotypeData>(contexts.size());
@ -90,9 +85,16 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
GLs.add(new SampleGenotypeData(sample.getKey(), GL, getFilteredDepth(pileup)));
}
// start making the VariantContext
final GenomeLoc loc = ref.getLocus();
final List<Allele> alleles = new ArrayList<Allele>();
alleles.add(refAllele);
final VariantContextBuilder builder = new VariantContextBuilder("UG_call", loc.getContig(), loc.getStart(), loc.getStop(), alleles);
// find the alternate allele(s) that we should be using
if ( alternateAllelesToUse != null ) {
alleles.addAll(alternateAllelesToUse);
if ( allAllelesToUse != null ) {
alleles.addAll(allAllelesToUse.subList(1,allAllelesToUse.size())); // this includes ref allele
} else if ( useAlleleFromVCF ) {
final VariantContext vc = UnifiedGenotyperEngine.getVCFromAllelesRod(tracker, ref, ref.getLocus(), true, logger, UAC.alleles);

View File

@ -378,10 +378,10 @@ public class UnifiedGenotyperEngine {
double overallLog10PofF = AFresult.getLog10PosteriorsMatrixSumWithoutAFzero();
//if ( DEBUG_SLOD ) System.out.println("overallLog10PofF=" + overallLog10PofF);
List<Allele> alternateAllelesToUse = builder.make().getAlternateAlleles();
List<Allele> allAllelesToUse = builder.make().getAlleles();
// the forward lod
VariantContext vcForward = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.FORWARD, alternateAllelesToUse, false, model);
VariantContext vcForward = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.FORWARD, allAllelesToUse, false, model);
AFresult.reset();
afcm.get().getLog10PNonRef(vcForward, getAlleleFrequencyPriors(model), AFresult);
//double[] normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true);
@ -390,7 +390,7 @@ public class UnifiedGenotyperEngine {
//if ( DEBUG_SLOD ) System.out.println("forwardLog10PofNull=" + forwardLog10PofNull + ", forwardLog10PofF=" + forwardLog10PofF);
// the reverse lod
VariantContext vcReverse = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.REVERSE, alternateAllelesToUse, false, model);
VariantContext vcReverse = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.REVERSE, allAllelesToUse, false, model);
AFresult.reset();
afcm.get().getLog10PNonRef(vcReverse, getAlleleFrequencyPriors(model), AFresult);
//normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true);

View File

@ -117,7 +117,7 @@ public class PairHMMIndelErrorModel {
}
static private void getContextHomopolymerLength(final byte[] refBytes, final int[] hrunArray) {
static private void getContextHomopolymerLength(final byte[] refBytes, final int[] hrunArray) {
// compute forward hrun length, example:
// AGGTGACCCCCCTGAGAG
// 001000012345000000
@ -164,10 +164,24 @@ public class PairHMMIndelErrorModel {
}
}
}
public synchronized double[] computeReadHaplotypeLikelihoods(ReadBackedPileup pileup, LinkedHashMap<Allele,Haplotype> haplotypeMap, ReferenceContext ref, int eventLength, HashMap<PileupElement, LinkedHashMap<Allele,Double>> indelLikelihoodMap){
/**
 * Computes diploid genotype log10 likelihoods for an indel event from the read pileup.
 *
 * Delegates the per-read/per-haplotype likelihood computation to
 * computeGeneralReadHaplotypeLikelihoods and then folds the per-read matrix into the diploid
 * haplotype-pair likelihood vector via getDiploidHaplotypeLikelihoods.
 *
 * @param pileup             reads overlapping the event
 * @param haplotypeMap       candidate haplotypes keyed by allele
 * @param ref                reference context around the event
 * @param eventLength        signed indel length (alt minus ref)
 * @param indelLikelihoodMap per-read cache of per-allele likelihoods, filled as a side effect
 * @return flattened diploid genotype log10-likelihood array
 */
public synchronized double[] computeDiploidReadHaplotypeLikelihoods(ReadBackedPileup pileup, LinkedHashMap<Allele, Haplotype> haplotypeMap, ReferenceContext ref, int eventLength, HashMap<PileupElement, LinkedHashMap<Allele, Double>> indelLikelihoodMap){
    final int numHaplotypes = haplotypeMap.size();
    // readCounts is an output parameter, filled by the general computation below.
    // (Removed a duplicate, never-used local allocation of readLikelihoods that shadowed
    // the matrix returned by computeGeneralReadHaplotypeLikelihoods — duplicate local
    // variable declarations do not compile.)
    final int[] readCounts = new int[pileup.getNumberOfElements()];
    final double[][] readLikelihoods = computeGeneralReadHaplotypeLikelihoods(pileup, haplotypeMap, ref, eventLength, indelLikelihoodMap, readCounts);
    return getDiploidHaplotypeLikelihoods(numHaplotypes, readCounts, readLikelihoods);
}
public synchronized double[][] computeGeneralReadHaplotypeLikelihoods(final ReadBackedPileup pileup,
final LinkedHashMap<Allele, Haplotype> haplotypeMap,
final ReferenceContext ref,
final int eventLength,
final HashMap<PileupElement, LinkedHashMap<Allele, Double>> indelLikelihoodMap,
final int[] readCounts) {
final double readLikelihoods[][] = new double[pileup.getNumberOfElements()][haplotypeMap.size()];
final PairHMM pairHMM = new PairHMM(bandedLikelihoods);
int readIdx=0;
@ -367,7 +381,7 @@ public class PairHMMIndelErrorModel {
}
return getHaplotypeLikelihoods(numHaplotypes, readCounts, readLikelihoods);
return readLikelihoods;
}
private boolean useSoftClippedBases(GATKSAMRecord read, long eventStartPos, int eventLength) {
@ -385,7 +399,7 @@ public class PairHMMIndelErrorModel {
return b1.length;
}
private static double[] getHaplotypeLikelihoods(final int numHaplotypes, final int readCounts[], final double readLikelihoods[][]) {
private static double[] getDiploidHaplotypeLikelihoods(final int numHaplotypes, final int readCounts[], final double readLikelihoods[][]) {
final double[][] haplotypeLikehoodMatrix = new double[numHaplotypes][numHaplotypes];
// todo: MAD 09/26/11 -- I'm almost certain this calculation can be simplified to just a single loop without the intermediate NxN matrix