From d3baa4b8cac086fb433b2e408e1165713b9ff0b7 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Tue, 15 Jan 2013 11:36:20 -0500 Subject: [PATCH] Have Haplotype extend the Allele class. This way, we don't need to create a new Allele for every read/Haplotype pair to be placed in the PerReadAlleleLikelihoodMap (very inefficient). Also, now we can easily get the Haplotype associated with the best allele for a given read. --- .../haplotypecaller/HaplotypeCaller.java | 87 ++++++++++++------- .../LikelihoodCalculationEngine.java | 3 +- .../broadinstitute/sting/utils/Haplotype.java | 64 +++++++------- .../variant/variantcontext/Allele.java | 4 +- 4 files changed, 93 insertions(+), 65 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 00db62bff..04da91f65 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -47,7 +47,6 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; import com.google.java.contract.Ensures; -import com.sun.corba.se.impl.logging.UtilSystemException; import net.sf.samtools.*; import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.CommandLineGATK; @@ -155,7 +154,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem protected StingSAMFileWriter bamWriter = null; private SAMFileHeader bamHeader = null; private long uniqueNameCounter = 1; - private final String readGroupId = "ArtificialHaplotype"; + private final static String readGroupId = "ArtificialHaplotype"; /** * The PairHMM implementation to use for genotype likelihood calculations. The various implementations balance a tradeoff of accuracy and runtime. @@ -338,20 +337,8 @@ public class HaplotypeCaller extends ActiveRegionWalker implem likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, pairHMM ); genotypingEngine = new GenotypingEngine( DEBUG, annotationEngine, USE_FILTERED_READ_MAP_FOR_ANNOTATIONS ); - if ( bamWriter != null ) { - // prepare the bam header - bamHeader = new SAMFileHeader(); - bamHeader.setSequenceDictionary(getToolkit().getSAMFileHeader().getSequenceDictionary()); - final List readGroups = new ArrayList(1); - final SAMReadGroupRecord rg = new SAMReadGroupRecord(readGroupId); - rg.setSample("HC"); - rg.setSequencingCenter("BI"); - readGroups.add(rg); - bamHeader.setReadGroups(readGroups); - bamHeader.setSortOrder(SAMFileHeader.SortOrder.coordinate); - bamWriter.writeHeader(bamHeader); - bamWriter.setPresorted(true); - } + if ( bamWriter != null ) + setupBamWriter(); } //--------------------------------------------------------------------------------------------------------------- @@ -461,8 +448,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem if( UG_engine.getUAC().GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES && activeAllelesToGenotype.isEmpty() ) { return 0; } // No alleles found in this region so nothing to do! finalizeActiveRegion( activeRegion ); // merge overlapping fragments, clip adapter and low qual tails - final Haplotype referenceHaplotype = new Haplotype(activeRegion.getActiveRegionReference(referenceReader)); // Create the reference haplotype which is the bases from the reference that make up the active region - referenceHaplotype.setIsReference(true); + final Haplotype referenceHaplotype = new Haplotype(activeRegion.getActiveRegionReference(referenceReader), true); // Create the reference haplotype which is the bases from the reference that make up the active region final byte[] fullReferenceWithPadding = activeRegion.getFullReference(referenceReader, REFERENCE_PADDING); //int PRUNE_FACTOR = Math.max(MIN_PRUNE_FACTOR, determinePruneFactorFromCoverage( activeRegion )); final ArrayList haplotypes = assemblyEngine.runLocalAssembly( activeRegion, referenceHaplotype, fullReferenceWithPadding, getPaddedLoc(activeRegion), MIN_PRUNE_FACTOR, activeAllelesToGenotype ); @@ -498,22 +484,19 @@ public class HaplotypeCaller extends ActiveRegionWalker implem } if ( bamWriter != null ) { + // sort the haplotypes in coordinate order and then write them to the bam Collections.sort( haplotypes, new Haplotype.HaplotypePositionComparator() ); final GenomeLoc paddedRefLoc = getPaddedLoc(activeRegion); - for ( Haplotype haplotype : haplotypes ) { - // TODO -- clean up this code - final GATKSAMRecord record = new GATKSAMRecord(bamHeader); - record.setReadBases(haplotype.getBases()); - record.setAlignmentStart(paddedRefLoc.getStart() + haplotype.getAlignmentStartHapwrtRef()); - record.setBaseQualities(Utils.dupBytes((byte) '!', haplotype.getBases().length)); - record.setCigar(haplotype.getCigar()); - record.setMappingQuality(bestHaplotypes.contains(haplotype) ? 60 : 0); - record.setReadName("HC" + uniqueNameCounter++); - record.setReadUnmappedFlag(false); - record.setReferenceIndex(activeRegion.getReferenceLoc().getContigIndex()); - record.setAttribute(SAMTag.RG.toString(), readGroupId); - record.setFlags(16); - bamWriter.addAlignment(record); + for ( Haplotype haplotype : haplotypes ) + writeHaplotype(haplotype, paddedRefLoc, bestHaplotypes.contains(haplotype)); + + // now, output the interesting reads for each sample + for ( final PerReadAlleleLikelihoodMap readAlleleLikelihoodMap : stratifiedReadMap.values() ) { + for ( Map.Entry> entry : readAlleleLikelihoodMap.getLikelihoodReadMap().entrySet() ) { + final Allele bestAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(entry.getValue()); + if ( bestAllele != Allele.NO_CALL ) + writeReadAgainstHaplotype(entry.getKey(), (Haplotype)bestAllele); + } } } @@ -608,6 +591,46 @@ public class HaplotypeCaller extends ActiveRegionWalker implem } return returnMap; + } + + private void setupBamWriter() { + // prepare the bam header + bamHeader = new SAMFileHeader(); + bamHeader.setSequenceDictionary(getToolkit().getSAMFileHeader().getSequenceDictionary()); + bamHeader.setSortOrder(SAMFileHeader.SortOrder.coordinate); + + // include the original read groups plus a new artificial one for the haplotypes + final List readGroups = new ArrayList(getToolkit().getSAMFileHeader().getReadGroups()); + final SAMReadGroupRecord rg = new SAMReadGroupRecord(readGroupId); + rg.setSample("HC"); + rg.setSequencingCenter("BI"); + readGroups.add(rg); + bamHeader.setReadGroups(readGroups); + + bamWriter.writeHeader(bamHeader); + bamWriter.setPresorted(true); + } + + private void writeHaplotype(final Haplotype haplotype, final GenomeLoc paddedRefLoc, final boolean isAmongBestHaplotypes) { + final GATKSAMRecord record = new GATKSAMRecord(bamHeader); + record.setReadBases(haplotype.getBases()); + record.setAlignmentStart(paddedRefLoc.getStart() + haplotype.getAlignmentStartHapwrtRef()); + record.setBaseQualities(Utils.dupBytes((byte) '!', haplotype.getBases().length)); + record.setCigar(haplotype.getCigar()); + record.setMappingQuality(isAmongBestHaplotypes ? 60 : 0); + record.setReadName("HC" + uniqueNameCounter++); + record.setReadUnmappedFlag(false); + record.setReferenceIndex(paddedRefLoc.getContigIndex()); + record.setAttribute(SAMTag.RG.toString(), readGroupId); + record.setFlags(16); + bamWriter.addAlignment(record); + } + + private void writeReadAgainstHaplotype(final GATKSAMRecord read, final Haplotype haplotype) { + + + + } /* diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java index 8b844817d..e05ad85a9 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java @@ -138,6 +138,7 @@ public class LikelihoodCalculationEngine { readQuals[kkk] = ( readQuals[kkk] > (byte) read.getMappingQuality() ? (byte) read.getMappingQuality() : readQuals[kkk] ); // cap base quality by mapping quality //readQuals[kkk] = ( readQuals[kkk] > readInsQuals[kkk] ? readInsQuals[kkk] : readQuals[kkk] ); // cap base quality by base insertion quality, needs to be evaluated //readQuals[kkk] = ( readQuals[kkk] > readDelQuals[kkk] ? readDelQuals[kkk] : readQuals[kkk] ); // cap base quality by base deletion quality, needs to be evaluated + // TODO -- why is Q18 hard-coded here??? readQuals[kkk] = ( readQuals[kkk] < (byte) 18 ? QualityUtils.MIN_USABLE_Q_SCORE : readQuals[kkk] ); } @@ -151,7 +152,7 @@ public class LikelihoodCalculationEngine { final int haplotypeStart = ( previousHaplotypeSeen == null ? 0 : computeFirstDifferingPosition(haplotype.getBases(), previousHaplotypeSeen.getBases()) ); previousHaplotypeSeen = haplotype; - perReadAlleleLikelihoodMap.add(read, Allele.create(haplotype.getBases()), + perReadAlleleLikelihoodMap.add(read, haplotype, pairHMM.computeReadLikelihoodGivenHaplotypeLog10(haplotype.getBases(), read.getReadBases(), readQuals, readInsQuals, readDelQuals, overallGCP, haplotypeStart, jjj == 0)); } diff --git a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java index 2706f2f99..4830bf053 100644 --- a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java @@ -37,59 +37,71 @@ import org.broadinstitute.variant.variantcontext.VariantContext; import java.io.Serializable; import java.util.*; -public class Haplotype { - protected final byte[] bases; +public class Haplotype extends Allele { protected final double[] quals; private GenomeLoc genomeLocation = null; private HashMap eventMap = null; - private boolean isRef = false; private Cigar cigar; private int alignmentStartHapwrtRef; public int leftBreakPoint = 0; public int rightBreakPoint = 0; private Event artificialEvent = null; + /** + * Main constructor + * + * @param bases bases + * @param quals quals + * @param isRef is reference allele? + */ + public Haplotype( final byte[] bases, final double[] quals, final boolean isRef ) { + super(bases.clone(), isRef); + this.quals = quals.clone(); + } + /** * Create a simple consensus sequence with provided bases and a uniform quality over all bases of qual * * @param bases bases * @param qual qual */ - public Haplotype( final byte[] bases, final int qual ) { - this.bases = bases.clone(); + public Haplotype( final byte[] bases, final int qual, final boolean isRef ) { + super(bases.clone(), isRef); quals = new double[bases.length]; Arrays.fill(quals, (double)qual); } + public Haplotype( final byte[] bases, final int qual ) { + this(bases, qual, false); + } + + public Haplotype( final byte[] bases, final boolean isRef ) { + this(bases, 0, isRef); + } + public Haplotype( final byte[] bases, final double[] quals ) { - this.bases = bases.clone(); - this.quals = quals.clone(); + this(bases, quals, false); } public Haplotype( final byte[] bases ) { - this(bases, 0); + this(bases, 0, false); } protected Haplotype( final byte[] bases, final Event artificialEvent ) { - this(bases, 0); + this(bases, 0, false); this.artificialEvent = artificialEvent; } public Haplotype( final byte[] bases, final GenomeLoc loc ) { - this(bases); + this(bases, 0, false); this.genomeLocation = loc; } @Override public boolean equals( Object h ) { - return h instanceof Haplotype && Arrays.equals(bases, ((Haplotype) h).bases); + return h instanceof Haplotype && super.equals(h); } - @Override - public int hashCode() { - return Arrays.hashCode(bases); - } - public HashMap getEventMap() { return eventMap; } @@ -98,17 +110,9 @@ public class Haplotype { this.eventMap = eventMap; } - public boolean isReference() { - return isRef; - } - - public void setIsReference( boolean isRef ) { - this.isRef = isRef; - } - public double getQualitySum() { double s = 0; - for (int k=0; k < bases.length; k++) { + for (int k=0; k < quals.length; k++) { s += quals[k]; } return s; @@ -116,14 +120,14 @@ public class Haplotype { @Override public String toString() { - return new String(bases); + return getDisplayString(); } public double[] getQuals() { return quals.clone(); } public byte[] getBases() { - return bases.clone(); + return super.getBases().clone(); } public long getStartPosition() { @@ -178,13 +182,13 @@ public class Haplotype { public Haplotype insertAllele( final Allele refAllele, final Allele altAllele, final int refInsertLocation, final int genomicInsertLocation ) { // refInsertLocation is in ref haplotype offset coordinates NOT genomic coordinates final int haplotypeInsertLocation = ReadUtils.getReadCoordinateForReferenceCoordinate(alignmentStartHapwrtRef, cigar, refInsertLocation, ReadUtils.ClippingTail.RIGHT_TAIL, true); - if( haplotypeInsertLocation == -1 || haplotypeInsertLocation + refAllele.length() >= bases.length ) { // desired change falls inside deletion so don't bother creating a new haplotype + if( haplotypeInsertLocation == -1 || haplotypeInsertLocation + refAllele.length() >= getBases().length ) { // desired change falls inside deletion so don't bother creating a new haplotype return null; } byte[] newHaplotypeBases = new byte[]{}; - newHaplotypeBases = ArrayUtils.addAll(newHaplotypeBases, ArrayUtils.subarray(bases, 0, haplotypeInsertLocation)); // bases before the variant + newHaplotypeBases = ArrayUtils.addAll(newHaplotypeBases, ArrayUtils.subarray(getBases(), 0, haplotypeInsertLocation)); // bases before the variant newHaplotypeBases = ArrayUtils.addAll(newHaplotypeBases, altAllele.getBases()); // the alt allele of the variant - newHaplotypeBases = ArrayUtils.addAll(newHaplotypeBases, ArrayUtils.subarray(bases, haplotypeInsertLocation + refAllele.length(), bases.length)); // bases after the variant + newHaplotypeBases = ArrayUtils.addAll(newHaplotypeBases, ArrayUtils.subarray(getBases(), haplotypeInsertLocation + refAllele.length(), getBases().length)); // bases after the variant return new Haplotype(newHaplotypeBases, new Event(refAllele, altAllele, genomicInsertLocation)); } diff --git a/public/java/src/org/broadinstitute/variant/variantcontext/Allele.java b/public/java/src/org/broadinstitute/variant/variantcontext/Allele.java index 33bca1a8a..0a0b4d0b7 100644 --- a/public/java/src/org/broadinstitute/variant/variantcontext/Allele.java +++ b/public/java/src/org/broadinstitute/variant/variantcontext/Allele.java @@ -111,7 +111,7 @@ public class Allele implements Comparable { /** A generic static NO_CALL allele for use */ // no public way to create an allele - private Allele(byte[] bases, boolean isRef) { + protected Allele(byte[] bases, boolean isRef) { // null alleles are no longer allowed if ( wouldBeNullAllele(bases) ) { throw new IllegalArgumentException("Null alleles are not supported"); @@ -140,7 +140,7 @@ public class Allele implements Comparable { throw new IllegalArgumentException("Unexpected base in allele bases \'" + new String(bases)+"\'"); } - private Allele(String bases, boolean isRef) { + protected Allele(String bases, boolean isRef) { this(bases.getBytes(), isRef); }