From aee66ab1574cd8f0480a9ffac913cebb7cba3c6d Mon Sep 17 00:00:00 2001 From: Guillermo del Angel Date: Wed, 13 Jun 2012 11:14:44 -0400 Subject: [PATCH] Big UG refactoring and intermediate commit to support indels in pool caller (not done yet). Lots of code pulled out of long spaghetti-like functions and modularized to be easily shareable. Add functionality in ErrorModel to count indel matches/mismatches (but left part disabled as not to change integration tests in this commit), add computation of pool genotype likelihoods for indels (not fully working yet in more realistic cases, only working in artificial nice pools). Lot's of TBD's still but existing UG and pool SNP functionality should be intact --- .../GenotypeLikelihoodsCalculationModel.java | 4 +- ...elGenotypeLikelihoodsCalculationModel.java | 203 ++++++++++-------- ...NPGenotypeLikelihoodsCalculationModel.java | 18 +- .../genotyper/UnifiedGenotyperEngine.java | 6 +- .../indels/PairHMMIndelErrorModel.java | 24 ++- 5 files changed, 147 insertions(+), 108 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java index f8924bed3..ecb4a5b90 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java @@ -89,7 +89,7 @@ public abstract class GenotypeLikelihoodsCalculationModel implements Cloneable { * @param ref reference context * @param contexts stratified alignment contexts * @param contextType stratified context type - * @param alternateAllelesToUse the alternate allele to use, null if not set + * @param allAllelesToUse the alternate allele to use, null if not set * @param useBAQedPileup should we use the BAQed pileup or the raw one? * @param locParser Genome Loc Parser * @return variant context where genotypes are no-called but with GLs @@ -98,7 +98,7 @@ public abstract class GenotypeLikelihoodsCalculationModel implements Cloneable { final ReferenceContext ref, final Map contexts, final AlignmentContextUtils.ReadOrientation contextType, - final List alternateAllelesToUse, + final List allAllelesToUse, final boolean useBAQedPileup, final GenomeLocParser locParser); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java index a5483ebba..ef5438789 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java @@ -35,7 +35,6 @@ import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.Haplotype; -import org.broadinstitute.sting.utils.codecs.vcf.AbstractVCFCodec; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; @@ -44,9 +43,7 @@ import org.broadinstitute.sting.utils.variantcontext.*; import java.util.*; public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel { - private final int HAPLOTYPE_SIZE; - - private final boolean getAlleleListFromVCF; + private static final int HAPLOTYPE_SIZE = 80; private boolean DEBUG = false; private boolean ignoreSNPAllelesWhenGenotypingIndels = false; @@ -75,17 +72,15 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood super(UAC, logger); pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY, UAC.INDEL_GAP_CONTINUATION_PENALTY, UAC.OUTPUT_DEBUG_INDEL_INFO, !UAC.DONT_DO_BANDED_INDEL_COMPUTATION); - getAlleleListFromVCF = UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES; - HAPLOTYPE_SIZE = UAC.INDEL_HAPLOTYPE_SIZE; DEBUG = UAC.OUTPUT_DEBUG_INDEL_INFO; haplotypeMap = new LinkedHashMap(); ignoreSNPAllelesWhenGenotypingIndels = UAC.IGNORE_SNP_ALLELES; } - protected List computeConsensusAlleles(ReferenceContext ref, + protected static List computeConsensusAlleles(ReferenceContext ref, Map contexts, AlignmentContextUtils.ReadOrientation contextType, - GenomeLocParser locParser) { + GenomeLocParser locParser, UnifiedArgumentCollection UAC) { ConsensusAlleleCounter counter = new ConsensusAlleleCounter(locParser, true, UAC.MIN_INDEL_COUNT_FOR_GENOTYPING, UAC.MIN_INDEL_FRACTION_PER_SAMPLE); return counter.computeConsensusAlleles(ref, contexts, contextType); } @@ -96,7 +91,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood final ReferenceContext ref, final Map contexts, final AlignmentContextUtils.ReadOrientation contextType, - final List alternateAllelesToUse, + final List allAllelesToUse, final boolean useBAQedPileup, final GenomeLocParser locParser) { @@ -104,95 +99,26 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood return null; GenomeLoc loc = ref.getLocus(); - Allele refAllele, altAllele; - VariantContext vc = null; - - boolean allelesArePadded = true; - if (!ref.getLocus().equals(lastSiteVisited)) { // starting a new site: clear allele list - alleleList.clear(); lastSiteVisited = ref.getLocus(); indelLikelihoodMap.set(new HashMap>()); haplotypeMap.clear(); - if (getAlleleListFromVCF) { - for (final VariantContext vc_input : tracker.getValues(UAC.alleles, loc)) { - if (vc_input != null && - allowableTypes.contains(vc_input.getType()) && - ref.getLocus().getStart() == vc_input.getStart()) { - vc = vc_input; - break; - } - } - // ignore places where we don't have a variant - if (vc == null) - return null; - - alleleList.clear(); - if (ignoreSNPAllelesWhenGenotypingIndels) { - // if there's an allele that has same length as the reference (i.e. a SNP or MNP), ignore it and don't genotype it - for (Allele a : vc.getAlleles()) - if (a.isNonReference() && a.getBases().length == vc.getReference().getBases().length) - continue; - else - alleleList.add(a); - - } else { - for (Allele a : vc.getAlleles()) - alleleList.add(a); - } - if (vc.getReference().getBases().length == vc.getEnd()-vc.getStart()+1) - allelesArePadded = false; - - } else { - alleleList = computeConsensusAlleles(ref, contexts, contextType, locParser); - if (alleleList.isEmpty()) - return null; - } + alleleList = getInitialAlleleList(tracker, ref, contexts, contextType, locParser, UAC, ignoreSNPAllelesWhenGenotypingIndels); + if (alleleList.isEmpty()) + return null; } - // protect against having an indel too close to the edge of a contig - if (loc.getStart() <= HAPLOTYPE_SIZE) + getHaplotypeMapFromAlleles(alleleList, ref, loc, haplotypeMap); + if (haplotypeMap == null || haplotypeMap.isEmpty()) return null; - // check if there is enough reference window to create haplotypes (can be an issue at end of contigs) - if (ref.getWindow().getStop() < loc.getStop() + HAPLOTYPE_SIZE) - return null; - - if (alleleList.isEmpty()) - return null; - - refAllele = alleleList.get(0); - altAllele = alleleList.get(1); - - // look for alt allele that has biggest length distance to ref allele - int maxLenDiff = 0; - for (Allele a : alleleList) { - if (a.isNonReference()) { - int lenDiff = Math.abs(a.getBaseString().length() - refAllele.getBaseString().length()); - if (lenDiff > maxLenDiff) { - maxLenDiff = lenDiff; - altAllele = a; - } - } - } - - final int eventLength = altAllele.getBaseString().length() - refAllele.getBaseString().length(); - final int hsize = ref.getWindow().size() - Math.abs(eventLength) - 1; - final int numPrefBases = ref.getLocus().getStart() - ref.getWindow().getStart() + 1; - - if (hsize <= 0) { - logger.warn(String.format("Warning: event at location %s can't be genotyped, skipping", loc.toString())); - return null; - } - haplotypeMap = Haplotype.makeHaplotypeListFromAlleles(alleleList, loc.getStart(), - ref, hsize, numPrefBases); - // start making the VariantContext // For all non-snp VC types, VC end location is just startLocation + length of ref allele including padding base. - int endLoc = loc.getStart() + refAllele.length()-1; - if (allelesArePadded) - endLoc++; + + final int endLoc = computeEndLocation(alleleList, loc); + final int eventLength = getEventLength(alleleList); + final VariantContextBuilder builder = new VariantContextBuilder("UG_call", loc.getContig(), loc.getStart(), endLoc, alleleList).referenceBaseForIndel(ref.getBase()); // create the genotypes; no-call everyone for now @@ -209,10 +135,10 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood if (context.hasBasePileup()) { final ReadBackedPileup pileup = context.getBasePileup(); if (pileup != null) { - final double[] genotypeLikelihoods = pairModel.computeReadHaplotypeLikelihoods(pileup, haplotypeMap, ref, eventLength, getIndelLikelihoodMap()); - GenotypeLikelihoods likelihoods = GenotypeLikelihoods.fromLog10Likelihoods(genotypeLikelihoods); + final double[] genotypeLikelihoods = pairModel.computeDiploidReadHaplotypeLikelihoods(pileup, haplotypeMap, ref, eventLength, getIndelLikelihoodMap()); + final GenotypeLikelihoods likelihoods = GenotypeLikelihoods.fromLog10Likelihoods(genotypeLikelihoods); - HashMap attributes = new HashMap(); + final HashMap attributes = new HashMap(); attributes.put(VCFConstants.DEPTH_KEY, getFilteredDepth(pileup)); attributes.put(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, likelihoods); genotypes.add(new Genotype(sample.getKey(), noCall, Genotype.NO_LOG10_PERROR, null, attributes, false)); @@ -234,6 +160,103 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood return indelLikelihoodMap.get(); } + public static int computeEndLocation(final List alleles, final GenomeLoc loc) { + Allele refAllele = alleles.get(0); + boolean allelesArePadded = true; + int endLoc = loc.getStart() + refAllele.length()-1; +/* if (refAllele.getBases().length == vc.getEnd()-vc.getStart()+1) + allelesArePadded = false; + */ + // todo FIXME! + if (allelesArePadded) + endLoc++; + + return endLoc; + } + + public static void getHaplotypeMapFromAlleles(final List alleleList, + final ReferenceContext ref, + final GenomeLoc loc, + final LinkedHashMap haplotypeMap) { + // protect against having an indel too close to the edge of a contig + if (loc.getStart() <= HAPLOTYPE_SIZE) + haplotypeMap.clear(); + // check if there is enough reference window to create haplotypes (can be an issue at end of contigs) + else if (ref.getWindow().getStop() < loc.getStop() + HAPLOTYPE_SIZE) + haplotypeMap.clear(); + else if (alleleList.isEmpty()) + haplotypeMap.clear(); + else { + final int eventLength = getEventLength(alleleList); + final int hsize = ref.getWindow().size() - Math.abs(eventLength) - 1; + final int numPrefBases = ref.getLocus().getStart() - ref.getWindow().getStart() + 1; + + haplotypeMap.putAll(Haplotype.makeHaplotypeListFromAlleles(alleleList, loc.getStart(), + ref, hsize, numPrefBases)); + } + } + + public static int getEventLength(List alleleList) { + Allele refAllele = alleleList.get(0); + Allele altAllele = alleleList.get(1); + // look for alt allele that has biggest length distance to ref allele + int maxLenDiff = 0; + for (Allele a : alleleList) { + if (a.isNonReference()) { + int lenDiff = Math.abs(a.getBaseString().length() - refAllele.getBaseString().length()); + if (lenDiff > maxLenDiff) { + maxLenDiff = lenDiff; + altAllele = a; + } + } + } + + return altAllele.getBaseString().length() - refAllele.getBaseString().length(); + + } + + public static List getInitialAlleleList(final RefMetaDataTracker tracker, + final ReferenceContext ref, + final Map contexts, + final AlignmentContextUtils.ReadOrientation contextType, + final GenomeLocParser locParser, + final UnifiedArgumentCollection UAC, + final boolean ignoreSNPAllelesWhenGenotypingIndels) { + + List alleles = new ArrayList(); + if (UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES) { + VariantContext vc = null; + for (final VariantContext vc_input : tracker.getValues(UAC.alleles, ref.getLocus())) { + if (vc_input != null && + allowableTypes.contains(vc_input.getType()) && + ref.getLocus().getStart() == vc_input.getStart()) { + vc = vc_input; + break; + } + } + // ignore places where we don't have a variant + if (vc == null) + return alleles; + + + if (ignoreSNPAllelesWhenGenotypingIndels) { + // if there's an allele that has same length as the reference (i.e. a SNP or MNP), ignore it and don't genotype it + for (Allele a : vc.getAlleles()) + if (a.isNonReference() && a.getBases().length == vc.getReference().getBases().length) + continue; + else + alleles.add(a); + + } else { + alleles.addAll(vc.getAlleles()); + } + + } else { + alleles = IndelGenotypeLikelihoodsCalculationModel.computeConsensusAlleles(ref, contexts, contextType, locParser, UAC); + } + return alleles; + } + // Overload function in GenotypeLikelihoodsCalculationModel so that, for an indel case, we consider a deletion as part of the pileup, // so that per-sample DP will include deletions covering the event. protected int getFilteredDepth(ReadBackedPileup pileup) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java index c55e83a56..f9892f125 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java @@ -62,7 +62,7 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC final ReferenceContext ref, final Map contexts, final AlignmentContextUtils.ReadOrientation contextType, - final List alternateAllelesToUse, + final List allAllelesToUse, final boolean useBAQedPileup, final GenomeLocParser locParser) { @@ -70,11 +70,6 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC final int indexOfRefBase = BaseUtils.simpleBaseToBaseIndex(refBase); final Allele refAllele = Allele.create(refBase, true); - // start making the VariantContext - final GenomeLoc loc = ref.getLocus(); - final List alleles = new ArrayList(); - alleles.add(refAllele); - final VariantContextBuilder builder = new VariantContextBuilder("UG_call", loc.getContig(), loc.getStart(), loc.getStop(), alleles); // calculate the GLs ArrayList GLs = new ArrayList(contexts.size()); @@ -90,9 +85,16 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC GLs.add(new SampleGenotypeData(sample.getKey(), GL, getFilteredDepth(pileup))); } + // start making the VariantContext + final GenomeLoc loc = ref.getLocus(); + final List alleles = new ArrayList(); + alleles.add(refAllele); + + + final VariantContextBuilder builder = new VariantContextBuilder("UG_call", loc.getContig(), loc.getStart(), loc.getStop(), alleles); // find the alternate allele(s) that we should be using - if ( alternateAllelesToUse != null ) { - alleles.addAll(alternateAllelesToUse); + if ( allAllelesToUse != null ) { + alleles.addAll(allAllelesToUse.subList(1,allAllelesToUse.size())); // this includes ref allele } else if ( useAlleleFromVCF ) { final VariantContext vc = UnifiedGenotyperEngine.getVCFromAllelesRod(tracker, ref, ref.getLocus(), true, logger, UAC.alleles); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index a1d02b27f..7ab556690 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -378,10 +378,10 @@ public class UnifiedGenotyperEngine { double overallLog10PofF = AFresult.getLog10PosteriorsMatrixSumWithoutAFzero(); //if ( DEBUG_SLOD ) System.out.println("overallLog10PofF=" + overallLog10PofF); - List alternateAllelesToUse = builder.make().getAlternateAlleles(); + List allAllelesToUse = builder.make().getAlleles(); // the forward lod - VariantContext vcForward = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.FORWARD, alternateAllelesToUse, false, model); + VariantContext vcForward = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.FORWARD, allAllelesToUse, false, model); AFresult.reset(); afcm.get().getLog10PNonRef(vcForward, getAlleleFrequencyPriors(model), AFresult); //double[] normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true); @@ -390,7 +390,7 @@ public class UnifiedGenotyperEngine { //if ( DEBUG_SLOD ) System.out.println("forwardLog10PofNull=" + forwardLog10PofNull + ", forwardLog10PofF=" + forwardLog10PofF); // the reverse lod - VariantContext vcReverse = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.REVERSE, alternateAllelesToUse, false, model); + VariantContext vcReverse = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.REVERSE, allAllelesToUse, false, model); AFresult.reset(); afcm.get().getLog10PNonRef(vcReverse, getAlleleFrequencyPriors(model), AFresult); //normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java index 08bd51134..3ac09d2a7 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java @@ -117,7 +117,7 @@ public class PairHMMIndelErrorModel { } - static private void getContextHomopolymerLength(final byte[] refBytes, final int[] hrunArray) { + static private void getContextHomopolymerLength(final byte[] refBytes, final int[] hrunArray) { // compute forward hrun length, example: // AGGTGACCCCCCTGAGAG // 001000012345000000 @@ -164,10 +164,24 @@ public class PairHMMIndelErrorModel { } } } - public synchronized double[] computeReadHaplotypeLikelihoods(ReadBackedPileup pileup, LinkedHashMap haplotypeMap, ReferenceContext ref, int eventLength, HashMap> indelLikelihoodMap){ + + + public synchronized double[] computeDiploidReadHaplotypeLikelihoods(ReadBackedPileup pileup, LinkedHashMap haplotypeMap, ReferenceContext ref, int eventLength, HashMap> indelLikelihoodMap){ final int numHaplotypes = haplotypeMap.size(); - final double readLikelihoods[][] = new double[pileup.getNumberOfElements()][numHaplotypes]; + final int readCounts[] = new int[pileup.getNumberOfElements()]; + final double[][] readLikelihoods = computeGeneralReadHaplotypeLikelihoods(pileup, haplotypeMap, ref, eventLength, indelLikelihoodMap, readCounts); + return getDiploidHaplotypeLikelihoods(numHaplotypes, readCounts, readLikelihoods); + + } + + public synchronized double[][] computeGeneralReadHaplotypeLikelihoods(final ReadBackedPileup pileup, + final LinkedHashMap haplotypeMap, + final ReferenceContext ref, + final int eventLength, + final HashMap> indelLikelihoodMap, + final int[] readCounts) { + final double readLikelihoods[][] = new double[pileup.getNumberOfElements()][haplotypeMap.size()]; final PairHMM pairHMM = new PairHMM(bandedLikelihoods); int readIdx=0; @@ -367,7 +381,7 @@ public class PairHMMIndelErrorModel { } - return getHaplotypeLikelihoods(numHaplotypes, readCounts, readLikelihoods); + return readLikelihoods; } private boolean useSoftClippedBases(GATKSAMRecord read, long eventStartPos, int eventLength) { @@ -385,7 +399,7 @@ public class PairHMMIndelErrorModel { return b1.length; } - private static double[] getHaplotypeLikelihoods(final int numHaplotypes, final int readCounts[], final double readLikelihoods[][]) { + private static double[] getDiploidHaplotypeLikelihoods(final int numHaplotypes, final int readCounts[], final double readLikelihoods[][]) { final double[][] haplotypeLikehoodMatrix = new double[numHaplotypes][numHaplotypes]; // todo: MAD 09/26/11 -- I'm almost certain this calculation can be simplified to just a single loop without the intermediate NxN matrix