diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java index f8924bed3..ecb4a5b90 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java @@ -89,7 +89,7 @@ public abstract class GenotypeLikelihoodsCalculationModel implements Cloneable { * @param ref reference context * @param contexts stratified alignment contexts * @param contextType stratified context type - * @param alternateAllelesToUse the alternate allele to use, null if not set + * @param allAllelesToUse the alternate allele to use, null if not set * @param useBAQedPileup should we use the BAQed pileup or the raw one? * @param locParser Genome Loc Parser * @return variant context where genotypes are no-called but with GLs @@ -98,7 +98,7 @@ public abstract class GenotypeLikelihoodsCalculationModel implements Cloneable { final ReferenceContext ref, final Map contexts, final AlignmentContextUtils.ReadOrientation contextType, - final List alternateAllelesToUse, + final List allAllelesToUse, final boolean useBAQedPileup, final GenomeLocParser locParser); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java index a5483ebba..ef5438789 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java @@ -35,7 +35,6 @@ import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.Haplotype; -import org.broadinstitute.sting.utils.codecs.vcf.AbstractVCFCodec; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; @@ -44,9 +43,7 @@ import org.broadinstitute.sting.utils.variantcontext.*; import java.util.*; public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel { - private final int HAPLOTYPE_SIZE; - - private final boolean getAlleleListFromVCF; + private static final int HAPLOTYPE_SIZE = 80; private boolean DEBUG = false; private boolean ignoreSNPAllelesWhenGenotypingIndels = false; @@ -75,17 +72,15 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood super(UAC, logger); pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY, UAC.INDEL_GAP_CONTINUATION_PENALTY, UAC.OUTPUT_DEBUG_INDEL_INFO, !UAC.DONT_DO_BANDED_INDEL_COMPUTATION); - getAlleleListFromVCF = UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES; - HAPLOTYPE_SIZE = UAC.INDEL_HAPLOTYPE_SIZE; DEBUG = UAC.OUTPUT_DEBUG_INDEL_INFO; haplotypeMap = new LinkedHashMap(); ignoreSNPAllelesWhenGenotypingIndels = UAC.IGNORE_SNP_ALLELES; } - protected List computeConsensusAlleles(ReferenceContext ref, + protected static List computeConsensusAlleles(ReferenceContext ref, Map contexts, AlignmentContextUtils.ReadOrientation contextType, - GenomeLocParser locParser) { + GenomeLocParser locParser, UnifiedArgumentCollection UAC) { ConsensusAlleleCounter counter = new ConsensusAlleleCounter(locParser, true, UAC.MIN_INDEL_COUNT_FOR_GENOTYPING, UAC.MIN_INDEL_FRACTION_PER_SAMPLE); return counter.computeConsensusAlleles(ref, contexts, contextType); } @@ -96,7 +91,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood final ReferenceContext ref, final Map contexts, final AlignmentContextUtils.ReadOrientation contextType, - final List alternateAllelesToUse, + final List allAllelesToUse, final boolean useBAQedPileup, final GenomeLocParser locParser) { @@ -104,95 +99,26 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood return null; GenomeLoc loc = ref.getLocus(); - Allele refAllele, altAllele; - VariantContext vc = null; - - boolean allelesArePadded = true; - if (!ref.getLocus().equals(lastSiteVisited)) { // starting a new site: clear allele list - alleleList.clear(); lastSiteVisited = ref.getLocus(); indelLikelihoodMap.set(new HashMap>()); haplotypeMap.clear(); - if (getAlleleListFromVCF) { - for (final VariantContext vc_input : tracker.getValues(UAC.alleles, loc)) { - if (vc_input != null && - allowableTypes.contains(vc_input.getType()) && - ref.getLocus().getStart() == vc_input.getStart()) { - vc = vc_input; - break; - } - } - // ignore places where we don't have a variant - if (vc == null) - return null; - - alleleList.clear(); - if (ignoreSNPAllelesWhenGenotypingIndels) { - // if there's an allele that has same length as the reference (i.e. a SNP or MNP), ignore it and don't genotype it - for (Allele a : vc.getAlleles()) - if (a.isNonReference() && a.getBases().length == vc.getReference().getBases().length) - continue; - else - alleleList.add(a); - - } else { - for (Allele a : vc.getAlleles()) - alleleList.add(a); - } - if (vc.getReference().getBases().length == vc.getEnd()-vc.getStart()+1) - allelesArePadded = false; - - } else { - alleleList = computeConsensusAlleles(ref, contexts, contextType, locParser); - if (alleleList.isEmpty()) - return null; - } + alleleList = getInitialAlleleList(tracker, ref, contexts, contextType, locParser, UAC, ignoreSNPAllelesWhenGenotypingIndels); + if (alleleList.isEmpty()) + return null; } - // protect against having an indel too close to the edge of a contig - if (loc.getStart() <= HAPLOTYPE_SIZE) + getHaplotypeMapFromAlleles(alleleList, ref, loc, haplotypeMap); + if (haplotypeMap == null || haplotypeMap.isEmpty()) return null; - // check if there is enough reference window to create haplotypes (can be an issue at end of contigs) - if (ref.getWindow().getStop() < loc.getStop() + HAPLOTYPE_SIZE) - return null; - - if (alleleList.isEmpty()) - return null; - - refAllele = alleleList.get(0); - altAllele = alleleList.get(1); - - // look for alt allele that has biggest length distance to ref allele - int maxLenDiff = 0; - for (Allele a : alleleList) { - if (a.isNonReference()) { - int lenDiff = Math.abs(a.getBaseString().length() - refAllele.getBaseString().length()); - if (lenDiff > maxLenDiff) { - maxLenDiff = lenDiff; - altAllele = a; - } - } - } - - final int eventLength = altAllele.getBaseString().length() - refAllele.getBaseString().length(); - final int hsize = ref.getWindow().size() - Math.abs(eventLength) - 1; - final int numPrefBases = ref.getLocus().getStart() - ref.getWindow().getStart() + 1; - - if (hsize <= 0) { - logger.warn(String.format("Warning: event at location %s can't be genotyped, skipping", loc.toString())); - return null; - } - haplotypeMap = Haplotype.makeHaplotypeListFromAlleles(alleleList, loc.getStart(), - ref, hsize, numPrefBases); - // start making the VariantContext // For all non-snp VC types, VC end location is just startLocation + length of ref allele including padding base. - int endLoc = loc.getStart() + refAllele.length()-1; - if (allelesArePadded) - endLoc++; + + final int endLoc = computeEndLocation(alleleList, loc); + final int eventLength = getEventLength(alleleList); + final VariantContextBuilder builder = new VariantContextBuilder("UG_call", loc.getContig(), loc.getStart(), endLoc, alleleList).referenceBaseForIndel(ref.getBase()); // create the genotypes; no-call everyone for now @@ -209,10 +135,10 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood if (context.hasBasePileup()) { final ReadBackedPileup pileup = context.getBasePileup(); if (pileup != null) { - final double[] genotypeLikelihoods = pairModel.computeReadHaplotypeLikelihoods(pileup, haplotypeMap, ref, eventLength, getIndelLikelihoodMap()); - GenotypeLikelihoods likelihoods = GenotypeLikelihoods.fromLog10Likelihoods(genotypeLikelihoods); + final double[] genotypeLikelihoods = pairModel.computeDiploidReadHaplotypeLikelihoods(pileup, haplotypeMap, ref, eventLength, getIndelLikelihoodMap()); + final GenotypeLikelihoods likelihoods = GenotypeLikelihoods.fromLog10Likelihoods(genotypeLikelihoods); - HashMap attributes = new HashMap(); + final HashMap attributes = new HashMap(); attributes.put(VCFConstants.DEPTH_KEY, getFilteredDepth(pileup)); attributes.put(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, likelihoods); genotypes.add(new Genotype(sample.getKey(), noCall, Genotype.NO_LOG10_PERROR, null, attributes, false)); @@ -234,6 +160,103 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood return indelLikelihoodMap.get(); } + public static int computeEndLocation(final List alleles, final GenomeLoc loc) { + Allele refAllele = alleles.get(0); + boolean allelesArePadded = true; + int endLoc = loc.getStart() + refAllele.length()-1; +/* if (refAllele.getBases().length == vc.getEnd()-vc.getStart()+1) + allelesArePadded = false; + */ + // todo FIXME! + if (allelesArePadded) + endLoc++; + + return endLoc; + } + + public static void getHaplotypeMapFromAlleles(final List alleleList, + final ReferenceContext ref, + final GenomeLoc loc, + final LinkedHashMap haplotypeMap) { + // protect against having an indel too close to the edge of a contig + if (loc.getStart() <= HAPLOTYPE_SIZE) + haplotypeMap.clear(); + // check if there is enough reference window to create haplotypes (can be an issue at end of contigs) + else if (ref.getWindow().getStop() < loc.getStop() + HAPLOTYPE_SIZE) + haplotypeMap.clear(); + else if (alleleList.isEmpty()) + haplotypeMap.clear(); + else { + final int eventLength = getEventLength(alleleList); + final int hsize = ref.getWindow().size() - Math.abs(eventLength) - 1; + final int numPrefBases = ref.getLocus().getStart() - ref.getWindow().getStart() + 1; + + haplotypeMap.putAll(Haplotype.makeHaplotypeListFromAlleles(alleleList, loc.getStart(), + ref, hsize, numPrefBases)); + } + } + + public static int getEventLength(List alleleList) { + Allele refAllele = alleleList.get(0); + Allele altAllele = alleleList.get(1); + // look for alt allele that has biggest length distance to ref allele + int maxLenDiff = 0; + for (Allele a : alleleList) { + if (a.isNonReference()) { + int lenDiff = Math.abs(a.getBaseString().length() - refAllele.getBaseString().length()); + if (lenDiff > maxLenDiff) { + maxLenDiff = lenDiff; + altAllele = a; + } + } + } + + return altAllele.getBaseString().length() - refAllele.getBaseString().length(); + + } + + public static List getInitialAlleleList(final RefMetaDataTracker tracker, + final ReferenceContext ref, + final Map contexts, + final AlignmentContextUtils.ReadOrientation contextType, + final GenomeLocParser locParser, + final UnifiedArgumentCollection UAC, + final boolean ignoreSNPAllelesWhenGenotypingIndels) { + + List alleles = new ArrayList(); + if (UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES) { + VariantContext vc = null; + for (final VariantContext vc_input : tracker.getValues(UAC.alleles, ref.getLocus())) { + if (vc_input != null && + allowableTypes.contains(vc_input.getType()) && + ref.getLocus().getStart() == vc_input.getStart()) { + vc = vc_input; + break; + } + } + // ignore places where we don't have a variant + if (vc == null) + return alleles; + + + if (ignoreSNPAllelesWhenGenotypingIndels) { + // if there's an allele that has same length as the reference (i.e. a SNP or MNP), ignore it and don't genotype it + for (Allele a : vc.getAlleles()) + if (a.isNonReference() && a.getBases().length == vc.getReference().getBases().length) + continue; + else + alleles.add(a); + + } else { + alleles.addAll(vc.getAlleles()); + } + + } else { + alleles = IndelGenotypeLikelihoodsCalculationModel.computeConsensusAlleles(ref, contexts, contextType, locParser, UAC); + } + return alleles; + } + // Overload function in GenotypeLikelihoodsCalculationModel so that, for an indel case, we consider a deletion as part of the pileup, // so that per-sample DP will include deletions covering the event. protected int getFilteredDepth(ReadBackedPileup pileup) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java index c55e83a56..f9892f125 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java @@ -62,7 +62,7 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC final ReferenceContext ref, final Map contexts, final AlignmentContextUtils.ReadOrientation contextType, - final List alternateAllelesToUse, + final List allAllelesToUse, final boolean useBAQedPileup, final GenomeLocParser locParser) { @@ -70,11 +70,6 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC final int indexOfRefBase = BaseUtils.simpleBaseToBaseIndex(refBase); final Allele refAllele = Allele.create(refBase, true); - // start making the VariantContext - final GenomeLoc loc = ref.getLocus(); - final List alleles = new ArrayList(); - alleles.add(refAllele); - final VariantContextBuilder builder = new VariantContextBuilder("UG_call", loc.getContig(), loc.getStart(), loc.getStop(), alleles); // calculate the GLs ArrayList GLs = new ArrayList(contexts.size()); @@ -90,9 +85,16 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC GLs.add(new SampleGenotypeData(sample.getKey(), GL, getFilteredDepth(pileup))); } + // start making the VariantContext + final GenomeLoc loc = ref.getLocus(); + final List alleles = new ArrayList(); + alleles.add(refAllele); + + + final VariantContextBuilder builder = new VariantContextBuilder("UG_call", loc.getContig(), loc.getStart(), loc.getStop(), alleles); // find the alternate allele(s) that we should be using - if ( alternateAllelesToUse != null ) { - alleles.addAll(alternateAllelesToUse); + if ( allAllelesToUse != null ) { + alleles.addAll(allAllelesToUse.subList(1,allAllelesToUse.size())); // this includes ref allele } else if ( useAlleleFromVCF ) { final VariantContext vc = UnifiedGenotyperEngine.getVCFromAllelesRod(tracker, ref, ref.getLocus(), true, logger, UAC.alleles); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index a1d02b27f..7ab556690 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -378,10 +378,10 @@ public class UnifiedGenotyperEngine { double overallLog10PofF = AFresult.getLog10PosteriorsMatrixSumWithoutAFzero(); //if ( DEBUG_SLOD ) System.out.println("overallLog10PofF=" + overallLog10PofF); - List alternateAllelesToUse = builder.make().getAlternateAlleles(); + List allAllelesToUse = builder.make().getAlleles(); // the forward lod - VariantContext vcForward = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.FORWARD, alternateAllelesToUse, false, model); + VariantContext vcForward = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.FORWARD, allAllelesToUse, false, model); AFresult.reset(); afcm.get().getLog10PNonRef(vcForward, getAlleleFrequencyPriors(model), AFresult); //double[] normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true); @@ -390,7 +390,7 @@ public class UnifiedGenotyperEngine { //if ( DEBUG_SLOD ) System.out.println("forwardLog10PofNull=" + forwardLog10PofNull + ", forwardLog10PofF=" + forwardLog10PofF); // the reverse lod - VariantContext vcReverse = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.REVERSE, alternateAllelesToUse, false, model); + VariantContext vcReverse = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.REVERSE, allAllelesToUse, false, model); AFresult.reset(); afcm.get().getLog10PNonRef(vcReverse, getAlleleFrequencyPriors(model), AFresult); //normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java index 08bd51134..3ac09d2a7 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java @@ -117,7 +117,7 @@ public class PairHMMIndelErrorModel { } - static private void getContextHomopolymerLength(final byte[] refBytes, final int[] hrunArray) { + static private void getContextHomopolymerLength(final byte[] refBytes, final int[] hrunArray) { // compute forward hrun length, example: // AGGTGACCCCCCTGAGAG // 001000012345000000 @@ -164,10 +164,24 @@ public class PairHMMIndelErrorModel { } } } - public synchronized double[] computeReadHaplotypeLikelihoods(ReadBackedPileup pileup, LinkedHashMap haplotypeMap, ReferenceContext ref, int eventLength, HashMap> indelLikelihoodMap){ + + + public synchronized double[] computeDiploidReadHaplotypeLikelihoods(ReadBackedPileup pileup, LinkedHashMap haplotypeMap, ReferenceContext ref, int eventLength, HashMap> indelLikelihoodMap){ final int numHaplotypes = haplotypeMap.size(); - final double readLikelihoods[][] = new double[pileup.getNumberOfElements()][numHaplotypes]; + final int readCounts[] = new int[pileup.getNumberOfElements()]; + final double[][] readLikelihoods = computeGeneralReadHaplotypeLikelihoods(pileup, haplotypeMap, ref, eventLength, indelLikelihoodMap, readCounts); + return getDiploidHaplotypeLikelihoods(numHaplotypes, readCounts, readLikelihoods); + + } + + public synchronized double[][] computeGeneralReadHaplotypeLikelihoods(final ReadBackedPileup pileup, + final LinkedHashMap haplotypeMap, + final ReferenceContext ref, + final int eventLength, + final HashMap> indelLikelihoodMap, + final int[] readCounts) { + final double readLikelihoods[][] = new double[pileup.getNumberOfElements()][haplotypeMap.size()]; final PairHMM pairHMM = new PairHMM(bandedLikelihoods); int readIdx=0; @@ -367,7 +381,7 @@ public class PairHMMIndelErrorModel { } - return getHaplotypeLikelihoods(numHaplotypes, readCounts, readLikelihoods); + return readLikelihoods; } private boolean useSoftClippedBases(GATKSAMRecord read, long eventStartPos, int eventLength) { @@ -385,7 +399,7 @@ public class PairHMMIndelErrorModel { return b1.length; } - private static double[] getHaplotypeLikelihoods(final int numHaplotypes, final int readCounts[], final double readLikelihoods[][]) { + private static double[] getDiploidHaplotypeLikelihoods(final int numHaplotypes, final int readCounts[], final double readLikelihoods[][]) { final double[][] haplotypeLikehoodMatrix = new double[numHaplotypes][numHaplotypes]; // todo: MAD 09/26/11 -- I'm almost certain this calculation can be simplified to just a single loop without the intermediate NxN matrix