From 0127799cba7cab121550d33d41bab2283bec4092 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Wed, 11 Jun 2014 15:45:45 -0400 Subject: [PATCH] Reads are now realigned to the most likely haplotype before being used by the annotations. -- AD,DP will now correspond directly to the reads that were used to construct the PLs -- RankSumTests, etc. will use the bases from the realigned reads instead of the original alignments -- There is now no additional runtime cost to realign the reads when using bamout or GVCF mode -- bamout mode no longer sets the mapping quality to zero for uninformative reads, instead the read will not be given an HC tag --- .../haplotypecaller/HaplotypeCaller.java | 10 +- .../HaplotypeCallerGenotypingEngine.java | 21 ++-- .../PairHMMLikelihoodCalculationEngine.java | 20 +--- .../ReferenceConfidenceModel.java | 22 +--- .../AllHaplotypeBAMWriter.java | 11 +- .../CalledHaplotypeBAMWriter.java | 17 +-- .../HaplotypeBAMWriter.java | 105 +----------------- ...LikelihoodCalculationEnginesBenchmark.java | 2 +- ...lexAndSymbolicVariantsIntegrationTest.java | 8 +- .../HaplotypeCallerGVCFIntegrationTest.java | 14 +-- .../HaplotypeCallerIntegrationTest.java | 38 +++---- ...MMLikelihoodCalculationEngineUnitTest.java | 2 +- ...ngLikelihoodCalculationEngineUnitTest.java | 2 +- .../HaplotypeBAMWriterUnitTest.java | 7 +- .../gatk/utils/GenomeLocParser.java | 31 +++++- .../gatk/utils/clipping/ClippingOp.java | 23 +--- .../gatk/utils/duplicates/DupUtils.java | 6 +- .../genotyper/PerReadAlleleLikelihoodMap.java | 27 ++++- .../gatk/utils/sam/AlignmentUtils.java | 65 ++++++++++- .../gatk/utils/sam/GATKSAMRecord.java | 21 ++-- .../utils/sam/AlignmentUtilsUnitTest.java | 2 +- 21 files changed, 197 insertions(+), 257 deletions(-) diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java index 6ae18b29b..6f2991771 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java @@ -692,7 +692,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In private ReadLikelihoodCalculationEngine createLikelihoodCalculationEngine() { switch (likelihoodEngineImplementation) { case PairHMM: - return new PairHMMLikelihoodCalculationEngine( (byte)gcpHMM, SCAC.DEBUG, pairHMM, log10GlobalReadMismappingRate, noFpga, pcrErrorModel ); + return new PairHMMLikelihoodCalculationEngine( (byte)gcpHMM, pairHMM, log10GlobalReadMismappingRate, noFpga, pcrErrorModel ); case GraphBased: return new GraphBasedLikelihoodCalculationEngine( (byte)gcpHMM,log10GlobalReadMismappingRate,heterogeneousKmerSizeResultion,SCAC.DEBUG,debugGraphTransformations); case Random: @@ -839,8 +839,12 @@ public class HaplotypeCaller extends ActiveRegionWalker, In final Map> reads = splitReadsBySample( regionForGenotyping.getReads() ); // Calculate the likelihoods: CPU intesive part. - final Map stratifiedReadMap = - likelihoodCalculationEngine.computeReadLikelihoods(assemblyResult,reads); + final Map stratifiedReadMap = likelihoodCalculationEngine.computeReadLikelihoods(assemblyResult,reads); + + // Realign all the reads to the most likely haplotype for use by the annotations + for( final Map.Entry entry : stratifiedReadMap.entrySet() ) { + entry.getValue().realignReadsToMostLikelyHaplotype(haplotypes, assemblyResult.getPaddedReferenceLoc()); + } // Note: we used to subset down at this point to only the "best" haplotypes in all samples for genotyping, but there // was a bad interaction between that selection and the marginalization that happens over each event when computing diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java index 86548796e..7fe3de973 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java @@ -254,12 +254,12 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine alleleReadMap_annotations = ( configuration.USE_FILTERED_READ_MAP_FOR_ANNOTATIONS ? alleleReadMap : convertHaplotypeReadMapToAlleleReadMap( haplotypeReadMap, alleleMapper, emptyDownSamplingMap ) ); if (emitReferenceConfidence) addMiscellaneousAllele(alleleReadMap_annotations); - final Map stratifiedReadMap = filterToOnlyOverlappingReads( genomeLocParser, alleleReadMap_annotations, perSampleFilteredReadList, call ); + final Map stratifiedReadMap = addFilteredReadList(genomeLocParser, alleleReadMap_annotations, perSampleFilteredReadList, call, true); VariantContext annotatedCall = annotationEngine.annotateContextForActiveRegion(tracker, stratifiedReadMap, call); @@ -285,7 +285,7 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine allele * @param stratifiedReadMap target per-read-allele-likelihood-map. */ - public static Map addMiscellaneousAllele(final Map stratifiedReadMap) { + public static Map addMiscellaneousAllele(final Map stratifiedReadMap) { final Allele miscellanoeusAllele = GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE; for (Map.Entry perSample : stratifiedReadMap.entrySet()) { for (Map.Entry> perRead : perSample.getValue().getLikelihoodReadMap().entrySet()) { @@ -430,19 +430,20 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine filterToOnlyOverlappingReads( final GenomeLocParser parser, - final Map perSampleReadMap, - final Map> perSampleFilteredReadList, - final VariantContext call ) { + private static Map addFilteredReadList(final GenomeLocParser parser, + final Map perSampleReadMap, + final Map> perSampleFilteredReadList, + final VariantContext call, + final boolean requireOverlap) { final Map returnMap = new LinkedHashMap<>(); - final GenomeLoc callLoc = parser.createGenomeLoc(call); + final GenomeLoc callLoc = ( requireOverlap ? parser.createGenomeLoc(call) : null ); for( final Map.Entry sample : perSampleReadMap.entrySet() ) { final PerReadAlleleLikelihoodMap likelihoodMap = new PerReadAlleleLikelihoodMap(); for( final Map.Entry> mapEntry : sample.getValue().getLikelihoodReadMap().entrySet() ) { // only count the read if it overlaps the event, otherwise it is not added to the output read list at all - if( callLoc.overlapsP(parser.createGenomeLoc(mapEntry.getKey())) ) { // BUGBUG: This uses alignment start and stop, NOT soft start and soft end... + if( !requireOverlap || callLoc.overlapsP(parser.createGenomeLocUnclipped(mapEntry.getKey())) ) { for( final Map.Entry alleleDoubleEntry : mapEntry.getValue().entrySet() ) { likelihoodMap.add(mapEntry.getKey(), alleleDoubleEntry.getKey(), alleleDoubleEntry.getValue()); } @@ -452,7 +453,7 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine computeReadLikelihoods( final List haplotypes, final Map> perSampleReadList ) { - - // Add likelihoods for each sample's reads to our stratifiedReadMap - final Map stratifiedReadMap = new LinkedHashMap<>(); - for( final Map.Entry> sampleEntry : perSampleReadList.entrySet() ) { - // evaluate the likelihood of the reads given those haplotypes - final PerReadAlleleLikelihoodMap map = computeReadLikelihoods(haplotypes, sampleEntry.getValue()); - - map.filterPoorlyModelledReads(EXPECTED_ERROR_RATE_PER_BASE); - stratifiedReadMap.put(sampleEntry.getKey(), map); - } - - return stratifiedReadMap; - } - private PerReadAlleleLikelihoodMap computeReadLikelihoods( final List haplotypes, final List reads) { // Modify the read qualities by applying the PCR error model and capping the minimum base,insertion,deletion qualities diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReferenceConfidenceModel.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReferenceConfidenceModel.java index 63543c5c2..fd8769f4e 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReferenceConfidenceModel.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReferenceConfidenceModel.java @@ -55,15 +55,12 @@ import org.broadinstitute.gatk.utils.QualityUtils; import org.broadinstitute.gatk.utils.activeregion.ActiveRegion; import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.gatk.utils.haplotype.Haplotype; -import org.broadinstitute.gatk.utils.haplotypeBAMWriter.HaplotypeBAMWriter; -import org.broadinstitute.gatk.utils.haplotypeBAMWriter.ReadDestination; import org.broadinstitute.gatk.utils.locusiterator.LocusIteratorByState; import org.broadinstitute.gatk.utils.pileup.PileupElement; import org.broadinstitute.gatk.utils.pileup.ReadBackedPileup; import org.broadinstitute.gatk.utils.pileup.ReadBackedPileupImpl; import org.broadinstitute.gatk.utils.sam.AlignmentUtils; import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; -import org.broadinstitute.gatk.utils.sam.ReadUtils; import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; import htsjdk.variant.variantcontext.*; import htsjdk.variant.vcf.VCFHeaderLine; @@ -89,7 +86,6 @@ public class ReferenceConfidenceModel { private final GenomeLocParser genomeLocParser; private final Set samples; - private final SAMFileHeader header; // TODO -- really shouldn't depend on this private final int indelInformativeDepthIndelSize; private final static boolean WRITE_DEBUGGING_BAM = false; @@ -117,7 +113,6 @@ public class ReferenceConfidenceModel { this.genomeLocParser = genomeLocParser; this.samples = samples; - this.header = header; this.indelInformativeDepthIndelSize = indelInformativeDepthIndelSize; if ( WRITE_DEBUGGING_BAM ) { @@ -326,24 +321,13 @@ public class ReferenceConfidenceModel { if ( stratifiedReadMap == null ) throw new IllegalArgumentException("stratifiedReadMap cannot be null"); if ( stratifiedReadMap.size() != 1 ) throw new IllegalArgumentException("stratifiedReadMap must contain exactly one sample but it contained " + stratifiedReadMap.size()); - List realignedReads; - - if( calledHaplotypes.size() == 1 ) { // only contains ref haplotype so an optimization is to just trust the alignments to the reference haplotype as provided by the aligner - realignedReads = activeRegion.getReads(); - } else { - final ReadDestination.ToList realignedReadsDest = new ReadDestination.ToList(header, "FOO"); - final HaplotypeBAMWriter writer = HaplotypeBAMWriter.create(HaplotypeBAMWriter.Type.CALLED_HAPLOTYPES, realignedReadsDest); - writer.setWriteHaplotypesAsWell(false); // don't write out reads for the haplotypes, as we only want the realigned reads themselves - writer.setOnlyRealignInformativeReads(true); - writer.writeReadsAlignedToHaplotypes(calledHaplotypes, paddedReferenceLoc, stratifiedReadMap); - realignedReads = ReadUtils.sortReadsByCoordinate(realignedReadsDest.getReads()); - } + final List reads = activeRegion.getReads(); if ( debuggingWriter != null ) - for ( final GATKSAMRecord read : realignedReads ) + for ( final GATKSAMRecord read : reads ) debuggingWriter.addAlignment(read); - final LocusIteratorByState libs = new LocusIteratorByState(realignedReads.iterator(), LocusIteratorByState.NO_DOWNSAMPLING, + final LocusIteratorByState libs = new LocusIteratorByState(reads.iterator(), LocusIteratorByState.NO_DOWNSAMPLING, true, genomeLocParser, samples, false); final List pileups = new LinkedList<>(); diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/haplotypeBAMWriter/AllHaplotypeBAMWriter.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/haplotypeBAMWriter/AllHaplotypeBAMWriter.java index 019243364..ac9706011 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/haplotypeBAMWriter/AllHaplotypeBAMWriter.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/haplotypeBAMWriter/AllHaplotypeBAMWriter.java @@ -81,16 +81,9 @@ class AllHaplotypeBAMWriter extends HaplotypeBAMWriter { final Map stratifiedReadMap) { writeHaplotypesAsReads(haplotypes, new HashSet<>(bestHaplotypes), paddedReferenceLoc); - // we need to remap the Alleles back to the Haplotypes; inefficient but unfortunately this is a requirement currently - final Map alleleToHaplotypeMap = new HashMap<>(haplotypes.size()); - for ( final Haplotype haplotype : haplotypes ) - alleleToHaplotypeMap.put(Allele.create(haplotype.getBases()), haplotype); - - // next, output the interesting reads for each sample aligned against the appropriate haplotype for ( final PerReadAlleleLikelihoodMap readAlleleLikelihoodMap : stratifiedReadMap.values() ) { - for ( final Map.Entry> entry : readAlleleLikelihoodMap.getLikelihoodReadMap().entrySet() ) { - final MostLikelyAllele bestAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(entry.getValue()); - writeReadAgainstHaplotype(entry.getKey(), alleleToHaplotypeMap.get(bestAllele.getMostLikelyAllele()), paddedReferenceLoc.getStart(), bestAllele.isInformative()); + for( final GATKSAMRecord read : readAlleleLikelihoodMap.getLikelihoodReadMap().keySet() ) { + writeReadAgainstHaplotype(read); } } } diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/haplotypeBAMWriter/CalledHaplotypeBAMWriter.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/haplotypeBAMWriter/CalledHaplotypeBAMWriter.java index 4b8075a27..b1c0acb74 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/haplotypeBAMWriter/CalledHaplotypeBAMWriter.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/haplotypeBAMWriter/CalledHaplotypeBAMWriter.java @@ -88,22 +88,9 @@ class CalledHaplotypeBAMWriter extends HaplotypeBAMWriter { writeHaplotypesAsReads(calledHaplotypes, calledHaplotypes, paddedReferenceLoc); - // we need to remap the Alleles back to the Haplotypes; inefficient but unfortunately this is a requirement currently - final Map alleleToHaplotypeMap = new HashMap<>(haplotypes.size()); - for ( final Haplotype haplotype : calledHaplotypes ) { - alleleToHaplotypeMap.put(Allele.create(haplotype.getBases()), haplotype); - } - - // the set of all alleles that were actually called - final Set allelesOfCalledHaplotypes = alleleToHaplotypeMap.keySet(); - - // next, output the interesting reads for each sample aligned against one of the called haplotypes for ( final PerReadAlleleLikelihoodMap readAlleleLikelihoodMap : stratifiedReadMap.values() ) { - for ( final Map.Entry> entry : readAlleleLikelihoodMap.getLikelihoodReadMap().entrySet() ) { - final MostLikelyAllele bestAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(entry.getValue(), allelesOfCalledHaplotypes); - final Haplotype haplotype = alleleToHaplotypeMap.get(bestAllele.getMostLikelyAllele()); - if (haplotype == null) continue; - writeReadAgainstHaplotype(entry.getKey(), haplotype, paddedReferenceLoc.getStart(), bestAllele.isInformative()); + for ( final GATKSAMRecord read : readAlleleLikelihoodMap.getLikelihoodReadMap().keySet() ) { + writeReadAgainstHaplotype(read); } } } diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/haplotypeBAMWriter/HaplotypeBAMWriter.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/haplotypeBAMWriter/HaplotypeBAMWriter.java index 704c62a5b..082cd73b5 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/haplotypeBAMWriter/HaplotypeBAMWriter.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/haplotypeBAMWriter/HaplotypeBAMWriter.java @@ -78,7 +78,6 @@ public abstract class HaplotypeBAMWriter { private long uniqueNameCounter = 1; protected final static String READ_GROUP_ID = "ArtificialHaplotype"; - protected final static String HAPLOTYPE_TAG = "HC"; private final ReadDestination output; private boolean writeHaplotypesAsWell = true; @@ -173,108 +172,10 @@ public abstract class HaplotypeBAMWriter { /** * Write out read aligned to haplotype to the BAM file * - * Aligns reads the haplotype, and then projects this alignment of read -> hap onto the reference - * via the alignment of haplotype (via its getCigar) method. - * * @param originalRead the read we want to write aligned to the reference genome - * @param haplotype the haplotype that the read should be aligned to, before aligning to the reference - * @param referenceStart the start of the reference that haplotype is aligned to. Provides global coordinate frame. - * @param isInformative true if the read is differentially informative for one of the haplotypes */ - protected void writeReadAgainstHaplotype(final GATKSAMRecord originalRead, - final Haplotype haplotype, - final int referenceStart, - final boolean isInformative) { - if( onlyRealignInformativeReads && !isInformative ) { - if( originalRead != null ) { - output.add(originalRead); - } - } else if (haplotype == null) { - output.add(originalRead); - return; - } else { - final GATKSAMRecord alignedToRef = createReadAlignedToRef(originalRead, haplotype, referenceStart, isInformative); - if ( alignedToRef != null ) { - output.add(alignedToRef); - } else { - output.add(originalRead); - } - } - } - - /** - * Aligns reads the haplotype, and then projects this alignment of read -> hap onto the reference - * via the alignment of haplotype (via its getCigar) method. - * - * @param originalRead the read we want to write aligned to the reference genome - * @param haplotype the haplotype that the read should be aligned to, before aligning to the reference - * @param referenceStart the start of the reference that haplotype is aligned to. Provides global coordinate frame. - * @param isInformative true if the read is differentially informative for one of the haplotypes - * @return a GATKSAMRecord aligned to reference, or null if no meaningful alignment is possible - */ - protected GATKSAMRecord createReadAlignedToRef(final GATKSAMRecord originalRead, - final Haplotype haplotype, - final int referenceStart, - final boolean isInformative) { - if ( originalRead == null ) throw new IllegalArgumentException("originalRead cannot be null"); - if ( haplotype == null ) throw new IllegalArgumentException("haplotype cannot be null"); - if ( haplotype.getCigar() == null ) throw new IllegalArgumentException("Haplotype cigar not set " + haplotype); - if ( referenceStart < 1 ) throw new IllegalArgumentException("reference start much be >= 1 but got " + referenceStart); - - try { - // compute the smith-waterman alignment of read -> haplotype - final SWPairwiseAlignment swPairwiseAlignment = new SWPairwiseAlignment(haplotype.getBases(), originalRead.getReadBases(), CigarUtils.NEW_SW_PARAMETERS); - //swPairwiseAlignment.printAlignment(haplotype.getBases(), originalRead.getReadBases()); - if ( swPairwiseAlignment.getAlignmentStart2wrt1() == -1 ) - // sw can fail (reasons not clear) so if it happens just don't write the read - return null; - final Cigar swCigar = AlignmentUtils.consolidateCigar(swPairwiseAlignment.getCigar()); - - // since we're modifying the read we need to clone it - final GATKSAMRecord read = (GATKSAMRecord)originalRead.clone(); - - addHaplotypeTag(read, haplotype); - - // uninformative reads are set to zero mapping quality to enhance visualization - if ( !isInformative ) - read.setMappingQuality(0); - - // compute here the read starts w.r.t. the reference from the SW result and the hap -> ref cigar - final Cigar extendedHaplotypeCigar = haplotype.getConsolidatedPaddedCigar(1000); - final int readStartOnHaplotype = AlignmentUtils.calcFirstBaseMatchingReferenceInCigar(extendedHaplotypeCigar, swPairwiseAlignment.getAlignmentStart2wrt1()); - final int readStartOnReference = referenceStart + haplotype.getAlignmentStartHapwrtRef() + readStartOnHaplotype; - read.setAlignmentStart(readStartOnReference); - - // compute the read -> ref alignment by mapping read -> hap -> ref from the - // SW of read -> hap mapped through the given by hap -> ref - final Cigar haplotypeToRef = AlignmentUtils.trimCigarByBases(extendedHaplotypeCigar, swPairwiseAlignment.getAlignmentStart2wrt1(), extendedHaplotypeCigar.getReadLength() - 1); - final Cigar readToRefCigarRaw = AlignmentUtils.applyCigarToCigar(swCigar, haplotypeToRef); - final Cigar readToRefCigarClean = AlignmentUtils.cleanUpCigar(readToRefCigarRaw); - final Cigar readToRefCigar = AlignmentUtils.leftAlignIndel(readToRefCigarClean, haplotype.getBases(), - originalRead.getReadBases(), swPairwiseAlignment.getAlignmentStart2wrt1(), 0, true); - - read.setCigar(readToRefCigar); - - if ( readToRefCigar.getReadLength() != read.getReadLength() ) - throw new IllegalStateException("Cigar " + readToRefCigar + " with read length " + readToRefCigar.getReadLength() - + " != read length " + read.getReadLength() + " for read " + read.format() + "\nhapToRef " + haplotypeToRef + " length " + haplotypeToRef.getReadLength() + "/" + haplotypeToRef.getReferenceLength() - + "\nreadToHap " + swCigar + " length " + swCigar.getReadLength() + "/" + swCigar.getReferenceLength()); - - return read; - } catch ( CloneNotSupportedException e ) { - throw new IllegalStateException("GATKSAMRecords should support clone but this one does not " + originalRead); - } - } - - /** - * Add a haplotype tag to the read based on haplotype - * - * @param read the read to add the tag to - * @param haplotype the haplotype that gives rises to read - */ - private void addHaplotypeTag(final GATKSAMRecord read, final Haplotype haplotype) { - // add a tag to the read that indicates which haplotype it best aligned to. It's a uniquish integer - read.setAttribute(HAPLOTYPE_TAG, haplotype.hashCode()); + protected void writeReadAgainstHaplotype(final GATKSAMRecord originalRead) { + output.add(originalRead); } /** @@ -309,7 +210,7 @@ public abstract class HaplotypeBAMWriter { record.setCigar(AlignmentUtils.consolidateCigar(haplotype.getCigar())); record.setMappingQuality(isAmongBestHaplotypes ? 60 : 0); record.setReadName("HC" + uniqueNameCounter++); - addHaplotypeTag(record, haplotype); + record.setAttribute(AlignmentUtils.HAPLOTYPE_TAG, haplotype.hashCode()); record.setReadUnmappedFlag(false); record.setReferenceIndex(paddedRefLoc.getContigIndex()); record.setAttribute(SAMTag.RG.toString(), READ_GROUP_ID); diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HCLikelihoodCalculationEnginesBenchmark.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HCLikelihoodCalculationEnginesBenchmark.java index dc878a9d3..39171a667 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HCLikelihoodCalculationEnginesBenchmark.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HCLikelihoodCalculationEnginesBenchmark.java @@ -119,7 +119,7 @@ public class HCLikelihoodCalculationEnginesBenchmark extends SimpleBenchmark { @SuppressWarnings("unused") public void timeLoglessPairHMM(final int reps) { for (int i = 0; i < reps; i++) { - final PairHMMLikelihoodCalculationEngine engine = new PairHMMLikelihoodCalculationEngine((byte) 10, false, + final PairHMMLikelihoodCalculationEngine engine = new PairHMMLikelihoodCalculationEngine((byte) 10, PairHMM.HMM_IMPLEMENTATION.LOGLESS_CACHING, -3, true, PairHMMLikelihoodCalculationEngine.PCR_ERROR_MODEL.NONE); engine.computeReadLikelihoods(dataSet.assemblyResultSet(), Collections.singletonMap("anonymous", dataSet.readList())); } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java index e668afc69..74226ed13 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java @@ -64,7 +64,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa @Test public void testHaplotypeCallerMultiSampleComplex1() { - HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "d1626d5fff419b8a782de954224e881f"); + HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "bb761c6cd0893e7e96c56bf1d8494588"); } private void HCTestSymbolicVariants(String bam, String args, String md5) { @@ -88,13 +88,13 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa @Test public void testHaplotypeCallerMultiSampleGGAComplex() { HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:119673-119823 -L 20:121408-121538", - "724a05b7df716647014f29c0fe86e071"); + "fb1042487efb03896f482c08d7eb2671"); } @Test public void testHaplotypeCallerMultiSampleGGAMultiAllelic() { HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337", - "437ce80af30631de077fd617b13cd2e4"); + "9a6e314d7a8fe1c4a9ff5048544cb670"); } private void HCTestComplexConsensusMode(String bam, String args, String md5) { @@ -106,7 +106,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa @Test public void testHaplotypeCallerMultiSampleConsensusModeComplex() { HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:119673-119823 -L 20:121408-121538 -L 20:133041-133161 -L 20:300207-300337", - "45975c205e1202d19b235312588dd733"); + "1e4af2f8fc97e0b07e625f142374742f"); } } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java index b8d15a2ca..753396775 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java @@ -67,12 +67,12 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { final String WExIntervals = "-L 20:10,000,000-10,100,000 -isr INTERSECTION -L " + hg19Chr20Intervals; // this functionality can be adapted to provide input data for whatever you might want in your data - tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "bb6c326616790076de5111a7a3f6feb5"}); - tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "06d37e116871d8840d3de17a51049363"}); - tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "53517b967a60d800a9d9d1d5a1bc54c9"}); - tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.NONE, WExIntervals, "39bf5fe3911d0c646eefa8f79894f4df"}); - tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "d926d653500a970280ad7828d9ee2b84"}); - tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.GVCF, WExIntervals, "83ddc16e4f0900429b2da30e582994aa"}); + tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "080cf92bfcb7db4273e2d723045daee5"}); + tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "a4c0a28a451069faaac937e45c701043"}); + tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "0bc25b42dc73eeadb23a35227684172c"}); + tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.NONE, WExIntervals, "c95a777e3aac50b9f29302919a00ba4e"}); + tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "05860d40923e8b074576c787037c5768"}); + tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.GVCF, WExIntervals, "f3abc3050b2708fc78c903fb70ac253e"}); return tests.toArray(new Object[][]{}); } @@ -137,7 +137,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { public void testWrongGVCFNonVariantRecordOrderBugFix() { final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d", b37KGReference, WRONG_GVCF_RECORD_ORDER_BUGFIX_BAM, WRONG_GVCF_RECORD_ORDER_BUGFIX_INTERVALS, HaplotypeCaller.OPTIMAL_GVCF_INDEX_TYPE, HaplotypeCaller.OPTIMAL_GVCF_INDEX_PARAMETER); - final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("324eb46738a364cd7dc5fa0b62491a5e")); + final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("e286b5eb72e3f75e7b08e5729cdfe019")); spec.disableShadowBCF(); executeTest("testMissingGVCFIndexingStrategyException", spec); } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index bfafd170e..bd469a45e 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -84,27 +84,27 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSample() { - HCTest(CEUTRIO_BAM, "", "9e90f4ed1a04ce159ad3e560e62f5d6b"); + HCTest(CEUTRIO_BAM, "", "030b45c91879eb41f30abbd10ed88fa9"); } @Test public void testHaplotypeCallerSingleSample() { - HCTest(NA12878_BAM, "", "96f299a5cf411900b8eda3845c3ce465"); + HCTest(NA12878_BAM, "", "6632475b1cefce9a0c5727bc919501e5"); } @Test public void testHaplotypeCallerMinBaseQuality() { - HCTest(NA12878_BAM, "-mbq 15", "6509cfd0554ecbb92a1b303fbcc0fcca"); + HCTest(NA12878_BAM, "-mbq 15", "31fe63ea44407b6aa40261319a414488"); } @Test public void testHaplotypeCallerGraphBasedSingleSample() { - HCTest(NA12878_BAM, "-likelihoodEngine GraphBased", "f87bb723bfb9b8f604db1d53624d24e3"); + HCTest(NA12878_BAM, "-likelihoodEngine GraphBased", "5a49b5b98247070e8de637a706b02db9"); } @Test public void testHaplotypeCallerGraphBasedMultiSample() { - HCTest(CEUTRIO_BAM, "-likelihoodEngine GraphBased", "8d1edb0ea25c55a8c825b4e1470e219f"); + HCTest(CEUTRIO_BAM, "-likelihoodEngine GraphBased", "d530933588810bb4cf04a9783d187433"); } @Test(enabled = false) // can't annotate the rsID's yet @@ -115,7 +115,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleGGA() { HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", - "d734a90b07dd0d6b051b7c0961787f98"); + "844d1daa4a8835f4c7e10d0d8bbfe510"); } @Test @@ -131,7 +131,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerSingleSampleIndelQualityScores() { - HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "d3fc49d3d3c8b6439548133e03faa540"); + HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "ec77ac891cac9461b2eb011270166dc1"); } private void HCTestNearbySmallIntervals(String bam, String args, String md5) { @@ -168,7 +168,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerNearbySmallIntervals() { - HCTestNearbySmallIntervals(NA12878_BAM, "", "a415bc76231a04dc38412ff38aa0dc49"); + HCTestNearbySmallIntervals(NA12878_BAM, "", "59e0f10f8822e2a5e7d2fabb71dbea3f"); } // This problem bam came from a user on the forum and it spotted a problem where the ReadClipper @@ -227,7 +227,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void HCTestDBSNPAnnotationWGS() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-10,100,000 -D " + b37dbSNP132, 1, - Arrays.asList("32af503f1d58b1f34d896ea2ddb891aa")); + Arrays.asList("d357f6fe3f1350f886b0df5a207e9f6f")); executeTest("HC calling with dbSNP ID annotation on WGS intervals", spec); } @@ -236,7 +236,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-11,000,000 -D " + b37dbSNP132 + " -L " + hg19Intervals + " -isr INTERSECTION", 1, - Arrays.asList("e39c73bbaf22b4751755d9f5bb2a8d3d")); + Arrays.asList("e64e06ae90a8eab52cabe3687992da73")); executeTest("HC calling with dbSNP ID annotation on WEx intervals", spec); } @@ -244,7 +244,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void HCTestDBSNPAnnotationWGSGraphBased() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller -likelihoodEngine GraphBased --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-10,100,000 -D " + b37dbSNP132, 1, - Arrays.asList("39519c4f07d229f266430a0b0cfdf1db")); + Arrays.asList("4e43cf47d32210ee6a6880af3bbfcc52")); executeTest("HC calling with dbSNP ID annotation on WGS intervals", spec); } @@ -253,7 +253,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller -likelihoodEngine GraphBased --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-11,000,000 -D " + b37dbSNP132 + " -L " + hg19Intervals + " -isr INTERSECTION", 1, - Arrays.asList("c14d7f23dedea7e7ec99a90843320c1a")); + Arrays.asList("83bf354f0238a362d4b6ef8d14679999")); executeTest("HC calling with dbSNP ID annotation on WEx intervals", spec); } @@ -276,7 +276,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void HCTestAggressivePcrIndelModelWGS() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller --disableDithering --pcr_indel_model AGGRESSIVE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_BAM + " -o %s -L 20:10,000,000-10,300,000", 1, - Arrays.asList("d1e1de9f1d33c8c3f96520d674c76cda")); + Arrays.asList("9297d4f5a9eaab253bea1ec19a8581c2")); executeTest("HC calling with aggressive indel error modeling on WGS intervals", spec); } @@ -284,7 +284,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void HCTestConservativePcrIndelModelWGS() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller --disableDithering --pcr_indel_model CONSERVATIVE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_BAM + " -o %s -L 20:10,000,000-10,300,000", 1, - Arrays.asList("10363ebae7edb0e81c2e668f7dbfba15")); + Arrays.asList("de2b002e0311c8fac7d053ecf8cb41f8")); executeTest("HC calling with conservative indel error modeling on WGS intervals", spec); } @@ -304,7 +304,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { final String commandLine = String.format("-T HaplotypeCaller -R %s -I %s -L %s --no_cmdline_in_header --maxNumHaplotypesInPopulation 16", b37KGReferenceWithDecoy, privateTestDir + "hc-lack-sensitivity.bam", privateTestDir + "hc-lack-sensitivity.interval_list", HaplotypeCaller.OPTIMAL_GVCF_INDEX_TYPE, HaplotypeCaller.OPTIMAL_GVCF_INDEX_PARAMETER); - final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("4eb63e603e7a99388bb421591350de8c")); + final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("5c1323d441ab083dbb569831959a9250")); spec.disableShadowBCF(); executeTest("testLackSensitivityDueToBadHaplotypeSelectionFix", spec); } @@ -313,7 +313,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void testMissingKeyAlternativeHaplotypesBugFix() { final String commandLine = String.format("-T HaplotypeCaller -R %s -I %s -L %s --no_cmdline_in_header ", b37KGReferenceWithDecoy, privateTestDir + "lost-alt-key-hap.bam", privateTestDir + "lost-alt-key-hap.interval_list"); - final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("6c70b7a52803f0bcfc53a14b894834df")); + final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("d8c6197a7060c16c849e15460a4287f2")); spec.disableShadowBCF(); executeTest("testMissingKeyAlternativeHaplotypesBugFix", spec); } @@ -323,7 +323,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { final String commandLine = String.format("-T HaplotypeCaller -R %s -I %s -L %s --no_cmdline_in_header ", hg19RefereneWithChrPrefixInChromosomeNames, privateTestDir + "bad-likelihoods.bam", privateTestDir + "bad-likelihoods.interval_list", HaplotypeCaller.OPTIMAL_GVCF_INDEX_TYPE, HaplotypeCaller.OPTIMAL_GVCF_INDEX_PARAMETER); - final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("0626e4c7543acc0cbb06c34435a2b730")); + final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("e36179dcdfb6f2a4269d08543479e97c")); spec.disableShadowBCF(); executeTest("testBadLikelihoodsDueToBadHaplotypeSelectionFix", spec); } @@ -346,9 +346,9 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { // but please make sure that both outputs get the same variant, // alleles all with DBSNP ids // We test here that change in active region size does not have an effect in placement of indels. - final WalkerTestSpec shortSpec = new WalkerTestSpec(commandLineShortInterval + " -o %s",Arrays.asList("f5de88bb1a1911eb5e6540a59f1517e2")); + final WalkerTestSpec shortSpec = new WalkerTestSpec(commandLineShortInterval + " -o %s",Arrays.asList("8512638ddb84eec5f58eaa1be6f58401")); executeTest("testDifferentIndelLocationsDueToSWExactDoubleComparisonsFix::shortInterval",shortSpec); - final WalkerTestSpec longSpec = new WalkerTestSpec(commandLineLongInterval + " -o %s",Arrays.asList("fb957604f506fe9a62138943bd82543e")); + final WalkerTestSpec longSpec = new WalkerTestSpec(commandLineLongInterval + " -o %s",Arrays.asList("9645d154812ae4db5357e7e5628936ab")); executeTest("testDifferentIndelLocationsDueToSWExactDoubleComparisonsFix::longInterval",longSpec); } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/PairHMMLikelihoodCalculationEngineUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/PairHMMLikelihoodCalculationEngineUnitTest.java index 643fe59a5..fab7bf69f 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/PairHMMLikelihoodCalculationEngineUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/PairHMMLikelihoodCalculationEngineUnitTest.java @@ -132,7 +132,7 @@ public class PairHMMLikelihoodCalculationEngineUnitTest extends BaseTest { @Test(dataProvider = "PcrErrorModelTestProvider", enabled = true) public void createPcrErrorModelTest(final String repeat, final int repeatLength) { - final PairHMMLikelihoodCalculationEngine engine = new PairHMMLikelihoodCalculationEngine((byte)0, false, + final PairHMMLikelihoodCalculationEngine engine = new PairHMMLikelihoodCalculationEngine((byte)0, PairHMM.HMM_IMPLEMENTATION.ORIGINAL, 0.0, true, PairHMMLikelihoodCalculationEngine.PCR_ERROR_MODEL.CONSERVATIVE); diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReadThreadingLikelihoodCalculationEngineUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReadThreadingLikelihoodCalculationEngineUnitTest.java index 5906fbc6c..5edb1223a 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReadThreadingLikelihoodCalculationEngineUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReadThreadingLikelihoodCalculationEngineUnitTest.java @@ -91,7 +91,7 @@ public class ReadThreadingLikelihoodCalculationEngineUnitTest extends ActiveRegi //final PairHMMLikelihoodCalculationEngine fullPairHMM = new PairHMMLikelihoodCalculationEngine((byte)10, false, // PairHMM.HMM_IMPLEMENTATION.LOGLESS_CACHING, -3); - final PairHMMLikelihoodCalculationEngine fullPairHMM = new PairHMMLikelihoodCalculationEngine((byte)10, false, + final PairHMMLikelihoodCalculationEngine fullPairHMM = new PairHMMLikelihoodCalculationEngine((byte)10, PairHMM.HMM_IMPLEMENTATION.LOGLESS_CACHING, Double.NEGATIVE_INFINITY,true, PairHMMLikelihoodCalculationEngine.PCR_ERROR_MODEL.NONE); // When using likelihoods it should be around 0.05 since diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/haplotypeBAMWriter/HaplotypeBAMWriterUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/haplotypeBAMWriter/HaplotypeBAMWriterUnitTest.java index 52efb030c..bf751e1a2 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/haplotypeBAMWriter/HaplotypeBAMWriterUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/haplotypeBAMWriter/HaplotypeBAMWriterUnitTest.java @@ -166,17 +166,16 @@ public class HaplotypeBAMWriterUnitTest extends BaseTest { } - @Test(dataProvider = "ReadAlignedToRefData", enabled = true) public void testReadAlignedToRef(final GATKSAMRecord read, final Haplotype haplotype, final int refStart, final int expectedReadStart, final String expectedReadCigar) throws Exception { final HaplotypeBAMWriter writer = new CalledHaplotypeBAMWriter(new MockDestination()); final GATKSAMRecord originalReadCopy = (GATKSAMRecord)read.clone(); if ( expectedReadCigar == null ) { - Assert.assertNull(writer.createReadAlignedToRef(read, haplotype, refStart, true)); + Assert.assertNull(AlignmentUtils.createReadAlignedToRef(read, haplotype, refStart, true)); } else { final Cigar expectedCigar = TextCigarCodec.getSingleton().decode(expectedReadCigar); - final GATKSAMRecord alignedRead = writer.createReadAlignedToRef(read, haplotype, refStart, true); + final GATKSAMRecord alignedRead = AlignmentUtils.createReadAlignedToRef(read, haplotype, refStart, true); Assert.assertEquals(alignedRead.getReadName(), originalReadCopy.getReadName()); Assert.assertEquals(alignedRead.getAlignmentStart(), expectedReadStart); @@ -286,7 +285,7 @@ public class HaplotypeBAMWriterUnitTest extends BaseTest { @Test(dataProvider = "ComplexReadAlignedToRef", enabled = !DEBUG) public void testReadAlignedToRefComplexAlignment(final int testIndex, final GATKSAMRecord read, final String reference, final Haplotype haplotype, final int expectedMaxMismatches) throws Exception { final HaplotypeBAMWriter writer = new CalledHaplotypeBAMWriter(new MockDestination()); - final GATKSAMRecord alignedRead = writer.createReadAlignedToRef(read, haplotype, 1, true); + final GATKSAMRecord alignedRead = AlignmentUtils.createReadAlignedToRef(read, haplotype, 1, true); if ( alignedRead != null ) { final int mismatches = AlignmentUtils.getMismatchCount(alignedRead, reference.getBytes(), alignedRead.getAlignmentStart() - 1).numMismatches; Assert.assertTrue(mismatches <= expectedMaxMismatches, diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/GenomeLocParser.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/GenomeLocParser.java index fe880ec8a..55e66244c 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/GenomeLocParser.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/GenomeLocParser.java @@ -430,12 +430,12 @@ public final class GenomeLocParser { // -------------------------------------------------------------------------------------------------------------- /** - * create a genome loc, given a read. If the read is unmapped, *and* yet the read has a contig and start position, - * then a GenomeLoc is returned for contig:start-start, otherwise and UNMAPPED GenomeLoc is returned. + * Create a genome loc, given a read. If the read is unmapped, *and* yet the read has a contig and start position, + * then a GenomeLoc is returned for contig:start-start, otherwise an UNMAPPED GenomeLoc is returned. * - * @param read + * @param read the read from which to create a genome loc * - * @return + * @return the GenomeLoc that was created */ @Requires("read != null") @Ensures("result != null") @@ -445,11 +445,32 @@ public final class GenomeLocParser { return GenomeLoc.UNMAPPED; else { // Use Math.max to ensure that end >= start (Picard assigns the end to reads that are entirely within an insertion as start-1) - int end = read.getReadUnmappedFlag() ? read.getAlignmentStart() : Math.max(read.getAlignmentEnd(), read.getAlignmentStart()); + final int end = read.getReadUnmappedFlag() ? read.getAlignmentStart() : Math.max(read.getAlignmentEnd(), read.getAlignmentStart()); return createGenomeLoc(read.getReferenceName(), read.getReferenceIndex(), read.getAlignmentStart(), end, false); } } + /** + * Create a genome loc, given a read using its unclipped alignment. If the read is unmapped, *and* yet the read has a contig and start position, + * then a GenomeLoc is returned for contig:start-start, otherwise an UNMAPPED GenomeLoc is returned. + * + * @param read the read from which to create a genome loc + * + * @return the GenomeLoc that was created + */ + @Requires("read != null") + @Ensures("result != null") + public GenomeLoc createGenomeLocUnclipped(final SAMRecord read) { + if ( read.getReadUnmappedFlag() && read.getReferenceIndex() == -1 ) + // read is unmapped and not placed anywhere on the genome + return GenomeLoc.UNMAPPED; + else { + // Use Math.max to ensure that end >= start (Picard assigns the end to reads that are entirely within an insertion as start-1) + final int end = read.getReadUnmappedFlag() ? read.getUnclippedEnd() : Math.max(read.getUnclippedEnd(), read.getUnclippedStart()); + return createGenomeLoc(read.getReferenceName(), read.getReferenceIndex(), read.getUnclippedStart(), end, false); + } + } + /** * Creates a GenomeLoc from a Tribble feature * @param feature diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/clipping/ClippingOp.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/clipping/ClippingOp.java index d08c5d2ce..f4ca70e84 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/clipping/ClippingOp.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/clipping/ClippingOp.java @@ -67,12 +67,7 @@ public class ClippingOp { * @param originalRead the read to be clipped */ public GATKSAMRecord apply(ClippingRepresentation algorithm, GATKSAMRecord originalRead) { - GATKSAMRecord read; - try { - read = (GATKSAMRecord) originalRead.clone(); - } catch (CloneNotSupportedException e) { - throw new ReviewedGATKException("Where did the clone go?"); - } + GATKSAMRecord read = (GATKSAMRecord) originalRead.clone(); byte[] quals = read.getBaseQualities(); byte[] bases = read.getReadBases(); byte[] newBases = new byte[bases.length]; @@ -169,14 +164,7 @@ public class ClippingOp { } private GATKSAMRecord revertSoftClippedBases(GATKSAMRecord read) { - GATKSAMRecord unclipped; - - // shallow copy of the read bases and quals should be fine here because they are immutable in the original read - try { - unclipped = (GATKSAMRecord) read.clone(); - } catch (CloneNotSupportedException e) { - throw new ReviewedGATKException("Where did the clone go?"); - } + GATKSAMRecord unclipped = (GATKSAMRecord) read.clone(); Cigar unclippedCigar = new Cigar(); int matchesCount = 0; @@ -376,12 +364,7 @@ public class ClippingOp { System.arraycopy(read.getReadBases(), copyStart, newBases, 0, newLength); System.arraycopy(read.getBaseQualities(), copyStart, newQuals, 0, newLength); - final GATKSAMRecord hardClippedRead; - try { - hardClippedRead = (GATKSAMRecord) read.clone(); - } catch (CloneNotSupportedException e) { - throw new ReviewedGATKException("Where did the clone go?"); - } + final GATKSAMRecord hardClippedRead = (GATKSAMRecord) read.clone(); hardClippedRead.resetSoftStartAndEnd(); // reset the cached soft start and end because they may have changed now that the read was hard clipped. No need to calculate them now. They'll be lazily calculated on the next call to getSoftStart()/End() hardClippedRead.setBaseQualities(newQuals); diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/duplicates/DupUtils.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/duplicates/DupUtils.java index acd6319a2..3d2740769 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/duplicates/DupUtils.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/duplicates/DupUtils.java @@ -41,11 +41,7 @@ import java.util.List; public class DupUtils { private static GATKSAMRecord tmpCopyRead(GATKSAMRecord read) { - try { - return (GATKSAMRecord)read.clone(); - } catch ( CloneNotSupportedException e ) { - throw new ReviewedGATKException("Unexpected Clone failure!"); - } + return (GATKSAMRecord)read.clone(); } public static GATKSAMRecord combineDuplicates(GenomeLocParser genomeLocParser,List duplicates, int maxQScore) { diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/genotyper/PerReadAlleleLikelihoodMap.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/genotyper/PerReadAlleleLikelihoodMap.java index 49d8962b9..553823e33 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/genotyper/PerReadAlleleLikelihoodMap.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/genotyper/PerReadAlleleLikelihoodMap.java @@ -28,9 +28,12 @@ package org.broadinstitute.gatk.utils.genotyper; import com.google.java.contract.Ensures; import org.broadinstitute.gatk.engine.downsampling.AlleleBiasedDownsamplingUtils; +import org.broadinstitute.gatk.utils.GenomeLoc; import org.broadinstitute.gatk.utils.MathUtils; +import org.broadinstitute.gatk.utils.haplotype.Haplotype; import org.broadinstitute.gatk.utils.pileup.PileupElement; import org.broadinstitute.gatk.utils.pileup.ReadBackedPileup; +import org.broadinstitute.gatk.utils.sam.AlignmentUtils; import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; import htsjdk.variant.variantcontext.Allele; @@ -227,7 +230,6 @@ public class PerReadAlleleLikelihoodMap { double haplotypeLikelihood = 0.0; for( final Map.Entry> entry : likelihoodReadMap.entrySet() ) { // Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+10^x2)-log10(2) - final GATKSAMRecord read = entry.getKey(); final double likelihood_iii = entry.getValue().get(iii_allele); final double likelihood_jjj = entry.getValue().get(jjj_allele); haplotypeLikelihood += MathUtils.approximateLog10SumLog10(likelihood_iii, likelihood_jjj) + MathUtils.LOG_ONE_HALF; @@ -381,4 +383,27 @@ public class PerReadAlleleLikelihoodMap { public Set getAllelesSet() { return Collections.unmodifiableSet(allelesSet); } + + /** + * Loop over all of the reads in this likelihood map and realign them to its most likely haplotype + * @param haplotypes the collection of haplotypes + * @param paddedReferenceLoc the active region + */ + public void realignReadsToMostLikelyHaplotype(final Collection haplotypes, final GenomeLoc paddedReferenceLoc) { + + // we need to remap the Alleles back to the Haplotypes; inefficient but unfortunately this is a requirement currently + final Map alleleToHaplotypeMap = new HashMap<>(haplotypes.size()); + for ( final Haplotype haplotype : haplotypes ) + alleleToHaplotypeMap.put(Allele.create(haplotype.getBases()), haplotype); + + final Map> newLikelihoodReadMap = new LinkedHashMap<>(likelihoodReadMap.size()); + for( final Map.Entry> entry : likelihoodReadMap.entrySet() ) { + final MostLikelyAllele bestAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(entry.getValue()); + final GATKSAMRecord alignedToRef = AlignmentUtils.createReadAlignedToRef(entry.getKey(), alleleToHaplotypeMap.get(bestAllele.getMostLikelyAllele()), paddedReferenceLoc.getStart(), bestAllele.isInformative()); + newLikelihoodReadMap.put(alignedToRef, entry.getValue()); + } + + likelihoodReadMap.clear(); + likelihoodReadMap.putAll(newLikelihoodReadMap); + } } diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/sam/AlignmentUtils.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/sam/AlignmentUtils.java index 618128c4b..289275abd 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/sam/AlignmentUtils.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/sam/AlignmentUtils.java @@ -31,19 +31,21 @@ import htsjdk.samtools.Cigar; import htsjdk.samtools.CigarElement; import htsjdk.samtools.CigarOperator; import htsjdk.samtools.SAMRecord; -import org.apache.log4j.Logger; +import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; import org.broadinstitute.gatk.utils.BaseUtils; import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.broadinstitute.gatk.utils.haplotype.Haplotype; import org.broadinstitute.gatk.utils.pileup.PileupElement; import org.broadinstitute.gatk.utils.recalibration.EventType; +import org.broadinstitute.gatk.utils.smithwaterman.SWPairwiseAlignment; import java.util.*; public final class AlignmentUtils { - private final static Logger logger = Logger.getLogger(AlignmentUtils.class); private final static EnumSet ALIGNED_TO_GENOME_OPERATORS = EnumSet.of(CigarOperator.M, CigarOperator.EQ, CigarOperator.X); private final static EnumSet ALIGNED_TO_GENOME_PLUS_SOFTCLIPS = EnumSet.of(CigarOperator.M, CigarOperator.EQ, CigarOperator.X, CigarOperator.S); + public final static String HAPLOTYPE_TAG = "HC"; // cannot be instantiated private AlignmentUtils() { } @@ -65,6 +67,65 @@ public final class AlignmentUtils { return first == CigarOperator.D || first == CigarOperator.I || last == CigarOperator.D || last == CigarOperator.I; } + /** + * Aligns reads the haplotype, and then projects this alignment of read -> hap onto the reference + * via the alignment of haplotype (via its getCigar) method. + * + * @param originalRead the read we want to write aligned to the reference genome + * @param haplotype the haplotype that the read should be aligned to, before aligning to the reference + * @param referenceStart the start of the reference that haplotype is aligned to. Provides global coordinate frame. + * @param isInformative true if the read is differentially informative for one of the haplotypes + * @return a GATKSAMRecord aligned to reference, or null if no meaningful alignment is possible + */ + public static GATKSAMRecord createReadAlignedToRef(final GATKSAMRecord originalRead, + final Haplotype haplotype, + final int referenceStart, + final boolean isInformative) { + if ( originalRead == null ) throw new IllegalArgumentException("originalRead cannot be null"); + if ( haplotype == null ) throw new IllegalArgumentException("haplotype cannot be null"); + if ( haplotype.getCigar() == null ) throw new IllegalArgumentException("Haplotype cigar not set " + haplotype); + if ( referenceStart < 1 ) throw new IllegalArgumentException("reference start much be >= 1 but got " + referenceStart); + + // compute the smith-waterman alignment of read -> haplotype + final SWPairwiseAlignment swPairwiseAlignment = new SWPairwiseAlignment(haplotype.getBases(), originalRead.getReadBases(), CigarUtils.NEW_SW_PARAMETERS); + if ( swPairwiseAlignment.getAlignmentStart2wrt1() == -1 ) + // sw can fail (reasons not clear) so if it happens just don't realign the read + return originalRead; + final Cigar swCigar = consolidateCigar(swPairwiseAlignment.getCigar()); + + // since we're modifying the read we need to clone it + final GATKSAMRecord read = (GATKSAMRecord)originalRead.clone(); + + // only informative reads are given the haplotype tag to enhance visualization + if ( isInformative ) + read.setAttribute(HAPLOTYPE_TAG, haplotype.hashCode()); + + // compute here the read starts w.r.t. the reference from the SW result and the hap -> ref cigar + final Cigar extendedHaplotypeCigar = haplotype.getConsolidatedPaddedCigar(1000); + final int readStartOnHaplotype = calcFirstBaseMatchingReferenceInCigar(extendedHaplotypeCigar, swPairwiseAlignment.getAlignmentStart2wrt1()); + final int readStartOnReference = referenceStart + haplotype.getAlignmentStartHapwrtRef() + readStartOnHaplotype; + read.setAlignmentStart(readStartOnReference); + read.resetSoftStartAndEnd(); + + // compute the read -> ref alignment by mapping read -> hap -> ref from the + // SW of read -> hap mapped through the given by hap -> ref + final Cigar haplotypeToRef = trimCigarByBases(extendedHaplotypeCigar, swPairwiseAlignment.getAlignmentStart2wrt1(), extendedHaplotypeCigar.getReadLength() - 1); + final Cigar readToRefCigarRaw = applyCigarToCigar(swCigar, haplotypeToRef); + final Cigar readToRefCigarClean = cleanUpCigar(readToRefCigarRaw); + final Cigar readToRefCigar = leftAlignIndel(readToRefCigarClean, haplotype.getBases(), + originalRead.getReadBases(), swPairwiseAlignment.getAlignmentStart2wrt1(), 0, true); + + read.setCigar(readToRefCigar); + + if ( readToRefCigar.getReadLength() != read.getReadLength() ) + throw new IllegalStateException("Cigar " + readToRefCigar + " with read length " + readToRefCigar.getReadLength() + + " != read length " + read.getReadLength() + " for read " + read.format() + "\nhapToRef " + haplotypeToRef + " length " + haplotypeToRef.getReadLength() + "/" + haplotypeToRef.getReferenceLength() + + "\nreadToHap " + swCigar + " length " + swCigar.getReadLength() + "/" + swCigar.getReferenceLength()); + + return read; + } + + /** * Get the byte[] from bases that cover the reference interval refStart -> refEnd given the diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/sam/GATKSAMRecord.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/sam/GATKSAMRecord.java index c0f22201e..18a91b3df 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/sam/GATKSAMRecord.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/sam/GATKSAMRecord.java @@ -49,7 +49,7 @@ import java.util.*; * Changing these values in any way will invalidate the cached value. However, we do not monitor those setter * functions, so modifying a GATKSAMRecord in any way may result in stale cached values. */ -public class GATKSAMRecord extends BAMRecord { +public class GATKSAMRecord extends BAMRecord implements Cloneable { // Base Quality Score Recalibrator specific attribute tags public static final String BQSR_BASE_INSERTION_QUALITIES = "BI"; // base qualities for insertions public static final String BQSR_BASE_DELETION_QUALITIES = "BD"; // base qualities for deletions @@ -593,17 +593,20 @@ public class GATKSAMRecord extends BAMRecord { * This should be safe because callers should never modify a mutable value returned by any of the get() methods anyway. * * @return a shallow copy of the GATKSAMRecord - * @throws CloneNotSupportedException */ @Override - public Object clone() throws CloneNotSupportedException { - final GATKSAMRecord clone = (GATKSAMRecord) super.clone(); - if (temporaryAttributes != null) { - clone.temporaryAttributes = new HashMap<>(); - for (Object attribute : temporaryAttributes.keySet()) - clone.setTemporaryAttribute(attribute, temporaryAttributes.get(attribute)); + public Object clone() { + try { + final GATKSAMRecord clone = (GATKSAMRecord) super.clone(); + if (temporaryAttributes != null) { + clone.temporaryAttributes = new HashMap<>(); + for (Object attribute : temporaryAttributes.keySet()) + clone.setTemporaryAttribute(attribute, temporaryAttributes.get(attribute)); + } + return clone; + } catch (final CloneNotSupportedException e) { + throw new RuntimeException( e ); } - return clone; } /** diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/sam/AlignmentUtilsUnitTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/sam/AlignmentUtilsUnitTest.java index feb065ad6..32815706e 100644 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/sam/AlignmentUtilsUnitTest.java +++ b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/sam/AlignmentUtilsUnitTest.java @@ -28,6 +28,7 @@ package org.broadinstitute.gatk.utils.sam; import htsjdk.samtools.*; import org.apache.commons.lang.ArrayUtils; import org.broadinstitute.gatk.utils.Utils; +import org.broadinstitute.gatk.utils.haplotype.Haplotype; import org.broadinstitute.gatk.utils.pileup.PileupElement; import org.testng.Assert; import org.testng.annotations.BeforeClass; @@ -172,7 +173,6 @@ public class AlignmentUtilsUnitTest { Assert.assertEquals(AlignmentUtils.calcNumDifferentBases(cigar, ref.getBytes(), read.getBytes()), expectedDifferences); } - @DataProvider(name = "NumAlignedBasesCountingSoftClips") public Object[][] makeNumAlignedBasesCountingSoftClips() { List tests = new ArrayList();