From 78a4e7e45e3c2122a3ccad37bfa05063333b22dc Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Wed, 14 Mar 2012 12:05:05 -0400 Subject: [PATCH] Major restructuring of HaplotypeCaller's LikelihoodCalculationEngine and GenotypingEngine. We no longer create an ugly event dictionary and genotype events found on haplotypes independently by finding the haplotype with the max likelihood. Lots of code has been rewritten to be much cleaner. --- .../broadinstitute/sting/utils/Haplotype.java | 35 +++++++++++-------- .../variantcontext/VariantContextBuilder.java | 16 +++++++++ .../variantcontext/VariantContextUtils.java | 33 ++++++++--------- 3 files changed, 51 insertions(+), 33 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java index 085794bab..c42742627 100755 --- a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java @@ -32,16 +32,13 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.variantcontext.Allele; -import java.util.Arrays; -import java.util.Iterator; -import java.util.LinkedHashMap; -import java.util.List; +import java.util.*; public class Haplotype { protected final byte[] bases; protected final double[] quals; private GenomeLoc genomeLocation = null; - private boolean isReference = false; + private HashMap readLikelihoodsPerSample = null; /** * Create a simple consensus sequence with provided bases and a uniform quality over all bases of qual @@ -69,16 +66,26 @@ public class Haplotype { this.genomeLocation = loc; } - public Haplotype(byte[] bases, GenomeLoc loc, boolean isRef) { - this(bases, loc); - this.isReference = isRef; - } - @Override public boolean equals( Object h ) { return h instanceof Haplotype && Arrays.equals(bases, ((Haplotype) h).bases); } + public void addReadLikelihoods( final String sample, final double[] readLikelihoods ) { + if( readLikelihoodsPerSample == null ) { + readLikelihoodsPerSample = new HashMap(); + } + readLikelihoodsPerSample.put(sample, readLikelihoods); + } + + public double[] getReadLikelihoods( final String sample ) { + return readLikelihoodsPerSample.get(sample); + } + + public Set getSampleKeySet() { + return readLikelihoodsPerSample.keySet(); + } + public double getQualitySum() { double s = 0; for (int k=0; k < bases.length; k++) { @@ -87,6 +94,7 @@ public class Haplotype { return s; } + @Override public String toString() { String returnString = ""; for(int iii = 0; iii < bases.length; iii++) { @@ -110,10 +118,6 @@ public class Haplotype { return genomeLocation.getStop(); } - public boolean isReference() { - return isReference; - } - @Requires({"refInsertLocation >= 0", "hapStartInRefCoords >= 0"}) public byte[] insertAllele( final Allele refAllele, final Allele altAllele, int refInsertLocation, final int hapStartInRefCoords, final Cigar haplotypeCigar ) { @@ -208,13 +212,14 @@ public class Haplotype { String haplotypeString = new String(basesBeforeVariant) + new String(alleleBases) + new String(basesAfterVariant); haplotypeString = haplotypeString.substring(0,haplotypeSize); - haplotypeMap.put(a,new Haplotype(haplotypeString.getBytes(), locus, a.isReference())); + haplotypeMap.put(a,new Haplotype(haplotypeString.getBytes(), locus)); } return haplotypeMap; } + // BUGBUG: copied from ReadClipper and slightly modified since we don't have the data in a GATKSAMRecord private static Integer getHaplotypeCoordinateForReferenceCoordinate( final int haplotypeStart, final Cigar haplotypeCigar, final int refCoord ) { int readBases = 0; int refBases = 0; diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextBuilder.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextBuilder.java index 4e16db482..ff66162c8 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextBuilder.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextBuilder.java @@ -28,6 +28,7 @@ import com.google.java.contract.*; import org.broad.tribble.Feature; import org.broad.tribble.TribbleException; import org.broad.tribble.util.ParsingUtils; +import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; @@ -344,6 +345,21 @@ public class VariantContextBuilder { return this; } + /** + * Tells us that the resulting VariantContext should have the specified location + * @param loc + * @return + */ + @Requires({"loc.getContig() != null", "loc.getStart() >= 0", "loc.getStop() >= 0"}) + public VariantContextBuilder loc(final GenomeLoc loc) { + this.contig = loc.getContig(); + this.start = loc.getStart(); + this.stop = loc.getStop(); + toValidate.add(VariantContext.Validation.ALLELES); + toValidate.add(VariantContext.Validation.REF_PADDING); + return this; + } + /** * Tells us that the resulting VariantContext should have the specified contig chr * @param contig diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java index fc50df3a5..e9a12ff26 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java @@ -458,7 +458,7 @@ public class VariantContextUtils { /** * Merges VariantContexts into a single hybrid. Takes genotypes for common samples in priority order, if provided. - * If uniqifySamples is true, the priority order is ignored and names are created by concatenating the VC name with + * If uniquifySamples is true, the priority order is ignored and names are created by concatenating the VC name with * the sample name * * @param genomeLocParser loc parser @@ -492,11 +492,11 @@ public class VariantContextUtils { if ( genotypeMergeOptions == GenotypeMergeType.REQUIRE_UNIQUE ) verifyUniqueSampleNames(unsortedVCs); - List prepaddedVCs = sortVariantContextsByPriority(unsortedVCs, priorityListOfVCs, genotypeMergeOptions); + final List prepaddedVCs = sortVariantContextsByPriority(unsortedVCs, priorityListOfVCs, genotypeMergeOptions); // Make sure all variant contexts are padded with reference base in case of indels if necessary - List VCs = new ArrayList(); + final List VCs = new ArrayList(); - for (VariantContext vc : prepaddedVCs) { + for (final VariantContext vc : prepaddedVCs) { // also a reasonable place to remove filtered calls, if needed if ( ! filteredAreUncalled || vc.isNotFiltered() ) VCs.add(createVariantContextWithPaddedAlleles(vc, false)); @@ -531,7 +531,7 @@ public class VariantContextUtils { // cycle through and add info from the other VCs, making sure the loc/reference matches - for ( VariantContext vc : VCs ) { + for ( final VariantContext vc : VCs ) { if ( loc.getStart() != vc.getStart() ) // || !first.getReference().equals(vc.getReference()) ) throw new ReviewedStingException("BUG: attempting to merge VariantContexts with different start sites: first="+ first.toString() + " second=" + vc.toString()); @@ -581,13 +581,13 @@ public class VariantContextUtils { } } - for (Map.Entry p : vc.getAttributes().entrySet()) { + for (final Map.Entry p : vc.getAttributes().entrySet()) { String key = p.getKey(); // if we don't like the key already, don't go anywhere if ( ! inconsistentAttributes.contains(key) ) { boolean alreadyFound = attributes.containsKey(key); Object boundValue = attributes.get(key); - boolean boundIsMissingValue = alreadyFound && boundValue.equals(VCFConstants.MISSING_VALUE_v4); + final boolean boundIsMissingValue = alreadyFound && boundValue.equals(VCFConstants.MISSING_VALUE_v4); if ( alreadyFound && ! boundValue.equals(p.getValue()) && ! boundIsMissingValue ) { // we found the value but we're inconsistent, put it in the exclude list @@ -604,7 +604,7 @@ public class VariantContextUtils { // if we have more alternate alleles in the merged VC than in one or more of the // original VCs, we need to strip out the GL/PLs (because they are no longer accurate), as well as allele-dependent attributes like AC,AF - for ( VariantContext vc : VCs ) { + for ( final VariantContext vc : VCs ) { if (vc.alleles.size() == 1) continue; if ( hasPLIncompatibleAlleles(alleles, vc.alleles)) { @@ -634,11 +634,11 @@ public class VariantContextUtils { setValue = MERGE_INTERSECTION; else if ( nFiltered == VCs.size() ) // everything was filtered out setValue = MERGE_FILTER_IN_ALL; - else if ( variantSources.isEmpty() ) // everyone was reference + else if ( variantSources.isEmpty() ) // everyone was reference setValue = MERGE_REF_IN_ALL; else { - LinkedHashSet s = new LinkedHashSet(); - for ( VariantContext vc : VCs ) + final LinkedHashSet s = new LinkedHashSet(); + for ( final VariantContext vc : VCs ) if ( vc.isVariant() ) s.add( vc.isFiltered() ? MERGE_FILTER_PREFIX + vc.getSource() : vc.getSource() ); setValue = Utils.join("-", s); @@ -663,7 +663,7 @@ public class VariantContextUtils { builder.filters(filters).attributes(mergeInfoWithMaxAC ? attributesWithMaxAC : attributes); // Trim the padded bases of all alleles if necessary - VariantContext merged = createVariantContextWithTrimmedAlleles(builder.make()); + final VariantContext merged = createVariantContextWithTrimmedAlleles(builder.make()); if ( printMessages && remapped ) System.out.printf("Remapped => %s%n", merged); return merged; } @@ -724,7 +724,7 @@ public class VariantContextUtils { Map originalToTrimmedAlleleMap = new HashMap(); - for (Allele a : inputVC.getAlleles()) { + for (final Allele a : inputVC.getAlleles()) { if (a.isSymbolic()) { alleles.add(a); originalToTrimmedAlleleMap.put(a, a); @@ -741,11 +741,9 @@ public class VariantContextUtils { // example: mixed records such as {TA*,TGA,TG} boolean hasNullAlleles = false; - for (Allele a: originalToTrimmedAlleleMap.values()) { + for (final Allele a: originalToTrimmedAlleleMap.values()) { if (a.isNull()) hasNullAlleles = true; - if (a.isReference()) - refAllele = a; } if (!hasNullAlleles) @@ -755,7 +753,7 @@ public class VariantContextUtils { List originalAlleles = genotype.getAlleles(); List trimmedAlleles = new ArrayList(); - for ( Allele a : originalAlleles ) { + for ( final Allele a : originalAlleles ) { if ( a.isCalled() ) trimmedAlleles.add(originalToTrimmedAlleleMap.get(a)); else @@ -837,7 +835,6 @@ public class VariantContextUtils { public AlleleMapper(Map map) { this.map = map; } public boolean needsRemapping() { return this.map != null; } public Collection values() { return map != null ? map.values() : vc.getAlleles(); } - public Allele remap(Allele a) { return map != null && map.containsKey(a) ? map.get(a) : a; } public List remap(List as) {