From 42d3919ca4c5fc5f05b700897937a87c3ac8017f Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 22 Feb 2013 15:22:43 -0500 Subject: [PATCH] Expanded functionality for writing BAMs from HaplotypeCaller -- The new code includes a new mode to write out a BAM containing reads realigned to the called haplotypes from the HC, which can be easily visualized in IGV. -- Previous functionality maintained, with bug fixes -- Haplotype BAM writing code now lives in utils -- Created a base class that includes most of the functionality of writing reads realigned to haplotypes onto haplotypes. -- Created two subclasses, one that writes all haplotypes (previous functionality) and a CalledHaplotypeBAMWriter that will only write reads aligned to the actually called haplotypes -- Extended PerReadAlleleLikelihoodMap.getMostLikelyAllele to optionally restrict set of alleles to consider best -- Massive increase in unit tests in AlignmentUtils, along with several new powerful functions for manipulating cigars -- Fix bug in SWPairwiseAlignment that produces cigar elements with 0 size, and are now fixed with consolidateCigar in AlignmentUtils -- HaplotypeCaller now tracks the called haplotypes in the GenotypingEngine, and returns this information to the HC for use in visualization. -- Added extensive docs to HaplotypeCaller on how to use this capability -- BUGFIX -- don't modify the read bases in GATKSAMRecord in LikelihoodCalculationEngine in the HC -- Cleaned up SWPairwiseAlignment. Refactored out the big main and supplementary static methods. Added a unit test with a bug TODO to fix what seems to be an edge case bug in SW -- Integration test to make sure we can actually write a BAM for each mode. This test only ensures that the code runs and doesn't exception out. It doesn't actually enforce any MD5s -- HaplotypeBAMWriter also left aligns indels in the reads, as SW can return a random placement of a read against the haplotype. Calls leftAlign to make the alignments more clear, with unit test of real read to cover this case -- Writes out haplotypes for both all haplotype and called haplotype mode -- Haplotype writers now get the active region call, regardless of whether an actual call was made. Only emitting called haplotypes is moved down to CalledHaplotypeBAMWriter --- .../haplotypecaller/GenotypingEngine.java | 63 ++- .../haplotypecaller/HaplotypeCaller.java | 175 ++----- .../LikelihoodCalculationEngine.java | 3 +- .../HaplotypeCallerModesIntegrationTest.java | 85 ++++ .../utils/SWPairwiseAlignmentUnitTest.java | 94 ++++ .../broadinstitute/sting/utils/Haplotype.java | 16 + .../sting/utils/SWPairwiseAlignment.java | 446 ++---------------- .../sting/utils/SWPairwiseAlignmentMain.java | 222 +++++++++ .../org/broadinstitute/sting/utils/Utils.java | 11 + .../genotyper/PerReadAlleleLikelihoodMap.java | 21 +- .../AllHaplotypeBAMWriter.java | 80 ++++ .../CalledHaplotypeBAMWriter.java | 87 ++++ .../HaplotypeBAMWriter.java | 282 +++++++++++ .../sting/utils/sam/AlignmentUtils.java | 391 ++++++++++++++- .../sting/utils/HaplotypeUnitTest.java | 20 + .../sting/utils/UtilsUnitTest.java | 8 + .../HaplotypeBAMWriterUnitTest.java | 287 +++++++++++ .../utils/sam/AlignmentUtilsUnitTest.java | 354 ++++++++++++-- 18 files changed, 2050 insertions(+), 595 deletions(-) create mode 100644 protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerModesIntegrationTest.java create mode 100644 protected/java/test/org/broadinstitute/sting/utils/SWPairwiseAlignmentUnitTest.java create mode 100644 public/java/src/org/broadinstitute/sting/utils/SWPairwiseAlignmentMain.java create mode 100644 public/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/AllHaplotypeBAMWriter.java create mode 100644 public/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/CalledHaplotypeBAMWriter.java create mode 100644 public/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriter.java create mode 100644 public/java/test/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriterUnitTest.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java index bef0cd96c..ae181aa69 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java @@ -79,6 +79,39 @@ public class GenotypingEngine { noCall.add(Allele.NO_CALL); } + /** + * Carries the result of a call to #assignGenotypeLikelihoods + */ + public static class CalledHaplotypes { + private final List calls; + private final Set calledHaplotypes; + + protected CalledHaplotypes(final List calls, final Set calledHaplotypes) { + if ( calls == null ) throw new IllegalArgumentException("calls cannot be null"); + if ( calledHaplotypes == null ) throw new IllegalArgumentException("calledHaplotypes cannot be null"); + if ( Utils.xor(calls.isEmpty(), calledHaplotypes.isEmpty()) ) + throw new IllegalArgumentException("Calls and calledHaplotypes should both be empty or both not but got calls=" + calls + " calledHaplotypes=" + calledHaplotypes); + this.calls = calls; + this.calledHaplotypes = calledHaplotypes; + } + + /** + * Get the list of calls made at this location + * @return a non-null (but potentially empty) list of calls + */ + public List getCalls() { + return calls; + } + + /** + * Get the set of haplotypes that we actually called (i.e., underlying one of the VCs in getCalls(). + * @return a non-null set of haplotypes + */ + public Set getCalledHaplotypes() { + return calledHaplotypes; + } + } + /** * Main entry point of class - given a particular set of haplotypes, samples and reference context, compute * genotype likelihoods and assemble into a list of variant contexts and genomic events ready for calling @@ -93,21 +126,21 @@ public class GenotypingEngine { * @param activeRegionWindow Active window * @param genomeLocParser GenomeLocParser * @param activeAllelesToGenotype Alleles to genotype - * @return List of VC's with genotyped events + * @return A CalledHaplotypes object containing a list of VC's with genotyped events and called haplotypes */ @Requires({"refLoc.containsP(activeRegionWindow)", "haplotypes.size() > 0"}) @Ensures("result != null") // TODO - can this be refactored? this is hard to follow! - public List assignGenotypeLikelihoods( final UnifiedGenotyperEngine UG_engine, - final List haplotypes, - final List samples, - final Map haplotypeReadMap, - final Map> perSampleFilteredReadList, - final byte[] ref, - final GenomeLoc refLoc, - final GenomeLoc activeRegionWindow, - final GenomeLocParser genomeLocParser, - final List activeAllelesToGenotype ) { + public CalledHaplotypes assignGenotypeLikelihoods( final UnifiedGenotyperEngine UG_engine, + final List haplotypes, + final List samples, + final Map haplotypeReadMap, + final Map> perSampleFilteredReadList, + final byte[] ref, + final GenomeLoc refLoc, + final GenomeLoc activeRegionWindow, + final GenomeLocParser genomeLocParser, + final List activeAllelesToGenotype ) { // sanity check input arguments if (UG_engine == null) throw new IllegalArgumentException("UG_Engine input can't be null, got "+UG_engine); @@ -157,6 +190,8 @@ public class GenotypingEngine { } } + final Set calledHaplotypes = new HashSet(); + // Walk along each position in the key set and create each event to be outputted for( final int loc : startPosKeySet ) { if( loc >= activeRegionWindow.getStart() && loc <= activeRegionWindow.getStop() ) { // genotyping an event inside this active region @@ -239,6 +274,10 @@ public class GenotypingEngine { final Map stratifiedReadMap = filterToOnlyOverlappingReads( genomeLocParser, alleleReadMap_annotations, perSampleFilteredReadList, call ); VariantContext annotatedCall = annotationEngine.annotateContext(stratifiedReadMap, call); + // maintain the set of all called haplotypes + for ( final Allele calledAllele : call.getAlleles() ) + calledHaplotypes.addAll(alleleMapper.get(calledAllele)); + if( annotatedCall.getAlleles().size() != mergedVC.getAlleles().size() ) { // some alleles were removed so reverseTrimming might be necessary! annotatedCall = GATKVariantContextUtils.reverseTrimAlleles(annotatedCall); } @@ -247,7 +286,7 @@ public class GenotypingEngine { } } } - return returnCalls; + return new CalledHaplotypes(returnCalls, calledHaplotypes); } /** diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 64c762e97..003b8197f 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -47,7 +47,6 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; import com.google.java.contract.Ensures; -import net.sf.samtools.*; import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection; @@ -72,22 +71,23 @@ import org.broadinstitute.sting.utils.activeregion.ActiveRegion; import org.broadinstitute.sting.utils.activeregion.ActiveRegionReadState; import org.broadinstitute.sting.utils.activeregion.ActivityProfileState; import org.broadinstitute.sting.utils.clipping.ReadClipper; -import org.broadinstitute.sting.utils.help.HelpConstants; -import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; -import org.broadinstitute.variant.vcf.*; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import org.broadinstitute.sting.utils.fragments.FragmentCollection; import org.broadinstitute.sting.utils.fragments.FragmentUtils; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.sting.utils.haplotypeBAMWriter.HaplotypeBAMWriter; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; +import org.broadinstitute.sting.utils.help.HelpConstants; import org.broadinstitute.sting.utils.pairhmm.PairHMM; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.sam.AlignmentUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; import org.broadinstitute.variant.variantcontext.*; import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter; +import org.broadinstitute.variant.vcf.*; import java.io.FileNotFoundException; import java.io.PrintStream; @@ -146,15 +146,39 @@ public class HaplotypeCaller extends ActiveRegionWalker implem protected PrintStream graphWriter = null; /** - * The assembled haplotypes will be written as BAM to this file if requested. Really for debugging purposes only. Note that the output here - * does not include uninformative reads so that not every input read is emitted to the bam. + * The assembled haplotypes will be written as BAM to this file if requested. Really for debugging purposes only. + * Note that the output here does not include uninformative reads so that not every input read is emitted to the bam. + * + * Turning on this mode may result in serious performance cost for the HC. It's really only approprate to + * use in specific areas where you want to better understand why the HC is making specific calls. + * + * The reads are written out containing a HC tag (integer) that encodes which haplotype each read best matches + * according to the haplotype caller's likelihood calculation. The use of this tag is primarily intended + * to allow good coloring of reads in IGV. Simply go to Color Alignments By > Tag and enter HC to more + * easily see which reads go with these haplotype. + * + * Note that the haplotypes (called or all, depending on mode) are emitted as single reads covering the entire + * active region, coming from read HC and a special read group. + * + * Note that only reads that are actually informative about the haplotypes are emitted. By informative we mean + * that there's a meaningful difference in the likelihood of the read coming from one haplotype compared to + * its next best haplotype. + * + * The best way to visualize the output of this mode is with IGV. Tell IGV to color the alignments by tag, + * and give it the HC tag, so you can see which reads support each haplotype. Finally, you can tell IGV + * to group by sample, which will separate the potential haplotypes from the reads. All of this can be seen + * in the following screenshot: https://www.dropbox.com/s/xvy7sbxpf13x5bp/haplotypecaller%20bamout%20for%20docs.png + * */ - @Hidden - @Output(fullName="bamOutput", shortName="bam", doc="File to which assembled haplotypes should be written", required = false) + @Output(fullName="bamOutput", shortName="bamout", doc="File to which assembled haplotypes should be written", required = false) protected StingSAMFileWriter bamWriter = null; - private SAMFileHeader bamHeader = null; - private long uniqueNameCounter = 1; - private final static String readGroupId = "ArtificialHaplotype"; + private HaplotypeBAMWriter haplotypeBAMWriter; + + /** + * The type of BAM output we want to see. + */ + @Output(fullName="bamWriterType", shortName="bamWriterType", doc="How should haplotypes be written to the BAM?", required = false) + public HaplotypeBAMWriter.Type bamWriterType = HaplotypeBAMWriter.Type.CALLED_HAPLOTYPES; /** * The PairHMM implementation to use for genotype likelihood calculations. The various implementations balance a tradeoff of accuracy and runtime. @@ -354,7 +378,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem genotypingEngine = new GenotypingEngine( DEBUG, annotationEngine, USE_FILTERED_READ_MAP_FOR_ANNOTATIONS ); if ( bamWriter != null ) - setupBamWriter(); + haplotypeBAMWriter = HaplotypeBAMWriter.create(bamWriterType, bamWriter, getToolkit().getSAMFileHeader()); } //--------------------------------------------------------------------------------------------------------------- @@ -497,39 +521,25 @@ public class HaplotypeCaller extends ActiveRegionWalker implem final List bestHaplotypes = ( UG_engine.getUAC().GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ? likelihoodCalculationEngine.selectBestHaplotypes( haplotypes, stratifiedReadMap, maxNumHaplotypesInPopulation ) : haplotypes ); - for( final VariantContext call : genotypingEngine.assignGenotypeLikelihoods( UG_engine, - bestHaplotypes, - samplesList, - stratifiedReadMap, - perSampleFilteredReadList, - fullReferenceWithPadding, - paddedReferenceLoc, - activeRegion.getLocation(), - getToolkit().getGenomeLocParser(), - activeAllelesToGenotype ) ) { + final GenotypingEngine.CalledHaplotypes calledHaplotypes = genotypingEngine.assignGenotypeLikelihoods( UG_engine, + bestHaplotypes, + samplesList, + stratifiedReadMap, + perSampleFilteredReadList, + fullReferenceWithPadding, + paddedReferenceLoc, + activeRegion.getLocation(), + getToolkit().getGenomeLocParser(), + activeAllelesToGenotype ); + + for( final VariantContext call : calledHaplotypes.getCalls() ) { // TODO -- uncomment this line once ART-based walkers have a proper RefMetaDataTracker. // annotationEngine.annotateDBs(metaDataTracker, getToolkit().getGenomeLocParser().createGenomeLoc(call), call); vcfWriter.add( call ); } if ( bamWriter != null ) { - // write the haplotypes to the bam - for ( Haplotype haplotype : haplotypes ) - writeHaplotype(haplotype, paddedReferenceLoc, bestHaplotypes.contains(haplotype)); - - // we need to remap the Alleles back to the Haplotypes; inefficient but unfortunately this is a requirement currently - final Map alleleToHaplotypeMap = new HashMap(haplotypes.size()); - for ( final Haplotype haplotype : haplotypes ) - alleleToHaplotypeMap.put(Allele.create(haplotype.getBases()), haplotype); - - // next, output the interesting reads for each sample aligned against the appropriate haplotype - for ( final PerReadAlleleLikelihoodMap readAlleleLikelihoodMap : stratifiedReadMap.values() ) { - for ( Map.Entry> entry : readAlleleLikelihoodMap.getLikelihoodReadMap().entrySet() ) { - final Allele bestAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(entry.getValue()); - if ( bestAllele != Allele.NO_CALL ) - writeReadAgainstHaplotype(entry.getKey(), alleleToHaplotypeMap.get(bestAllele), paddedReferenceLoc.getStart()); - } - } + haplotypeBAMWriter.writeReadsAlignedToHaplotypes(haplotypes, paddedReferenceLoc, bestHaplotypes, calledHaplotypes.getCalledHaplotypes(), stratifiedReadMap); } if( DEBUG ) { System.out.println("----------------------------------------------------------------------------------"); } @@ -624,92 +634,5 @@ public class HaplotypeCaller extends ActiveRegionWalker implem return returnMap; } - private void setupBamWriter() { - // prepare the bam header - bamHeader = new SAMFileHeader(); - bamHeader.setSequenceDictionary(getToolkit().getSAMFileHeader().getSequenceDictionary()); - bamHeader.setSortOrder(SAMFileHeader.SortOrder.coordinate); - // include the original read groups plus a new artificial one for the haplotypes - final List readGroups = new ArrayList(getToolkit().getSAMFileHeader().getReadGroups()); - final SAMReadGroupRecord rg = new SAMReadGroupRecord(readGroupId); - rg.setSample("HC"); - rg.setSequencingCenter("BI"); - readGroups.add(rg); - bamHeader.setReadGroups(readGroups); - - bamWriter.setPresorted(false); - bamWriter.writeHeader(bamHeader); - } - - private void writeHaplotype(final Haplotype haplotype, final GenomeLoc paddedRefLoc, final boolean isAmongBestHaplotypes) { - final GATKSAMRecord record = new GATKSAMRecord(bamHeader); - record.setReadBases(haplotype.getBases()); - record.setAlignmentStart(paddedRefLoc.getStart() + haplotype.getAlignmentStartHapwrtRef()); - record.setBaseQualities(Utils.dupBytes((byte) '!', haplotype.getBases().length)); - record.setCigar(haplotype.getCigar()); - record.setMappingQuality(isAmongBestHaplotypes ? 60 : 0); - record.setReadName("HC" + uniqueNameCounter++); - record.setReadUnmappedFlag(false); - record.setReferenceIndex(paddedRefLoc.getContigIndex()); - record.setAttribute(SAMTag.RG.toString(), readGroupId); - record.setFlags(16); - bamWriter.addAlignment(record); - } - - private void writeReadAgainstHaplotype(final GATKSAMRecord read, final Haplotype haplotype, final int referenceStart) { - - final SWPairwiseAlignment swPairwiseAlignment = new SWPairwiseAlignment(haplotype.getBases(), read.getReadBases(), 5.0, -10.0, -22.0, -1.2); - final int readStartOnHaplotype = swPairwiseAlignment.getAlignmentStart2wrt1(); - final int readStartOnReference = referenceStart + haplotype.getAlignmentStartHapwrtRef() + readStartOnHaplotype; - read.setAlignmentStart(readStartOnReference); - - final Cigar cigar = generateReadCigarFromHaplotype(read, readStartOnHaplotype, haplotype.getCigar()); - read.setCigar(cigar); - - bamWriter.addAlignment(read); - } - - private Cigar generateReadCigarFromHaplotype(final GATKSAMRecord read, final int readStartOnHaplotype, final Cigar haplotypeCigar) { - - int currentReadPos = 0; - int currentHapPos = 0; - final List readCigarElements = new ArrayList(); - - for ( final CigarElement cigarElement : haplotypeCigar.getCigarElements() ) { - - if ( cigarElement.getOperator() == CigarOperator.D ) { - if ( currentReadPos > 0 ) - readCigarElements.add(cigarElement); - } else if ( cigarElement.getOperator() == CigarOperator.M || cigarElement.getOperator() == CigarOperator.I ) { - - final int elementLength = cigarElement.getLength(); - final int nextReadPos = currentReadPos + elementLength; - final int nextHapPos = currentHapPos + elementLength; - - // do we want this element? - if ( currentReadPos > 0 ) { - // do we want the entire element? - if ( nextReadPos < read.getReadLength() ) { - readCigarElements.add(cigarElement); - currentReadPos = nextReadPos; - } - // otherwise, we can finish up and return the cigar - else { - readCigarElements.add(new CigarElement(read.getReadLength() - currentReadPos, cigarElement.getOperator())); - return new Cigar(readCigarElements); - } - } - // do we want part of the element to start? - else if ( currentReadPos == 0 && nextHapPos > readStartOnHaplotype ) { - currentReadPos = Math.min(nextHapPos - readStartOnHaplotype, read.getReadLength()); - readCigarElements.add(new CigarElement(currentReadPos, cigarElement.getOperator())); - } - - currentHapPos = nextHapPos; - } - } - - return new Cigar(readCigarElements); - } } \ No newline at end of file diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java index aeeb95c87..a7d85b969 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java @@ -133,7 +133,8 @@ public class LikelihoodCalculationEngine { final byte[] overallGCP = new byte[read.getReadLength()]; Arrays.fill( overallGCP, constantGCP ); // Is there a way to derive empirical estimates for this from the data? Haplotype previousHaplotypeSeen = null; - final byte[] readQuals = read.getBaseQualities(); + // NOTE -- must clone anything that gets modified here so we don't screw up future uses of the read + final byte[] readQuals = read.getBaseQualities().clone(); final byte[] readInsQuals = read.getBaseInsertionQualities(); final byte[] readDelQuals = read.getBaseDeletionQualities(); for( int kkk = 0; kkk < readQuals.length; kkk++ ) { diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerModesIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerModesIntegrationTest.java new file mode 100644 index 000000000..27b429353 --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerModesIntegrationTest.java @@ -0,0 +1,85 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import org.broadinstitute.sting.WalkerTest; +import org.broadinstitute.sting.utils.haplotypeBAMWriter.HaplotypeBAMWriter; +import org.testng.annotations.Test; + +import java.util.Arrays; +import java.util.Collections; + +public class HaplotypeCallerModesIntegrationTest extends WalkerTest { + // -------------------------------------------------------------------------------------------------------------- + // + // testing that writing a BAM works + // + // I don't really care about the MD5s, so I'm just not providing them here, so they don't have to be + // updated. These tests are basically ensuring that the code doesn't just randomly blow up. + // + // TODO -- what i'd really like to ensure here isn't the MD5 but that the BAMs can be read by the GATK or IGV + // + // -------------------------------------------------------------------------------------------------------------- + + @Test + public void HCTestBamWriterCalledHaplotypes() { + HCTestBamWriter(HaplotypeBAMWriter.Type.CALLED_HAPLOTYPES, ""); // current MD5 = 9a2b6157f14b44b872a77f4e75c56023 + } + + @Test + public void HCTestBamWriterAllHaplotypes() { + HCTestBamWriter(HaplotypeBAMWriter.Type.ALL_POSSIBLE_HAPLOTYPES, ""); // current MD5 = 06d885d82be81b8eef13bbfcd8041189 + } + + public void HCTestBamWriter(final HaplotypeBAMWriter.Type type, final String md5) { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "PCRFree.2x250.Illumina.20_10_11.bam -o /dev/null " + + "-bamout %s -L 20:10,000,000-10,010,000 -bamWriterType " + type, 1, + Arrays.asList(md5)); + executeTest("HC writing bams with mode " + type, spec); + } +} diff --git a/protected/java/test/org/broadinstitute/sting/utils/SWPairwiseAlignmentUnitTest.java b/protected/java/test/org/broadinstitute/sting/utils/SWPairwiseAlignmentUnitTest.java new file mode 100644 index 000000000..6d3c310b7 --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/utils/SWPairwiseAlignmentUnitTest.java @@ -0,0 +1,94 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.utils; + +import org.broadinstitute.sting.BaseTest; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.List; + +public class SWPairwiseAlignmentUnitTest extends BaseTest { + @DataProvider(name = "ComplexReadAlignedToRef") + public Object[][] makeComplexReadAlignedToRef() { + List tests = new ArrayList(); + + final String ref1 = "ACTGACTGACTG"; + tests.add(new Object[]{"AAAGGACTGACTG", ref1, 1, "12M"}); + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "ComplexReadAlignedToRef", enabled = true) + public void testReadAlignedToRefComplexAlignment(final String reference, final String read, final int expectedStart, final String expectedCigar) { + final SWPairwiseAlignment sw = new SWPairwiseAlignment(reference.getBytes(), read.getBytes()); + Assert.assertEquals(sw.getAlignmentStart2wrt1(), expectedStart); + Assert.assertEquals(sw.getCigar().toString(), expectedCigar); + } + + // TODO + // TODO + // TODO this example demonstrates some kind of failure mode of SW that results in the read not being aligned + // TODO to the reference at all. It has something to do with the specific parameters provided to the + // TODO SW code. With the default parameters the result is the one expected. With the specified parameters + // TODO the code fails + // TODO + // TODO + @Test(enabled = false) + public void testOddNoAlignment() { + final String reference = "AAAGACTACTG"; + final String read = "AACGGACACTG"; + final int expectedStart = 0; + final String expectedCigar = "11M"; + final SWPairwiseAlignment sw = new SWPairwiseAlignment(reference.getBytes(), read.getBytes(), 5.0, -10.0, -22.0, -1.2); + sw.printAlignment(reference.getBytes(), read.getBytes()); + Assert.assertEquals(sw.getAlignmentStart2wrt1(), expectedStart); + Assert.assertEquals(sw.getCigar().toString(), expectedCigar); + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java index cce6abbee..415cb73ac 100644 --- a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java @@ -27,9 +27,12 @@ package org.broadinstitute.sting.utils; import com.google.java.contract.Requires; import net.sf.samtools.Cigar; +import net.sf.samtools.CigarElement; +import net.sf.samtools.CigarOperator; import org.apache.commons.lang.ArrayUtils; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.sam.AlignmentUtils; import org.broadinstitute.sting.utils.sam.ReadUtils; import org.broadinstitute.variant.variantcontext.Allele; import org.broadinstitute.variant.variantcontext.VariantContext; @@ -121,6 +124,19 @@ public class Haplotype extends Allele { return cigar; } + /** + * Get the haplotype cigar extended by padSize M at the tail, consolidated into a clean cigar + * + * @param padSize how many additional Ms should be appended to the end of this cigar. Must be >= 0 + * @return a newly allocated Cigar that consolidate(getCigar + padSize + M) + */ + public Cigar getConsolidatedPaddedCigar(final int padSize) { + if ( padSize < 0 ) throw new IllegalArgumentException("padSize must be >= 0 but got " + padSize); + final Cigar extendedHaplotypeCigar = new Cigar(getCigar().getCigarElements()); + if ( padSize > 0 ) extendedHaplotypeCigar.add(new CigarElement(padSize, CigarOperator.M)); + return AlignmentUtils.consolidateCigar(extendedHaplotypeCigar); + } + public void setCigar( final Cigar cigar ) { this.cigar = cigar; } diff --git a/public/java/src/org/broadinstitute/sting/utils/SWPairwiseAlignment.java b/public/java/src/org/broadinstitute/sting/utils/SWPairwiseAlignment.java index 7bd937af9..e501cf40a 100644 --- a/public/java/src/org/broadinstitute/sting/utils/SWPairwiseAlignment.java +++ b/public/java/src/org/broadinstitute/sting/utils/SWPairwiseAlignment.java @@ -1,28 +1,28 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + package org.broadinstitute.sting.utils; import net.sf.samtools.Cigar; @@ -30,17 +30,23 @@ import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.StingException; +import org.broadinstitute.sting.utils.sam.AlignmentUtils; import java.util.*; /** - * Created by IntelliJ IDEA. + * Pairwise discrete smith-waterman alignment + * + * ************************************************************************ + * **** IMPORTANT NOTE: **** + * **** This class assumes that all bytes come from UPPERCASED chars! **** + * ************************************************************************ + * * User: asivache * Date: Mar 23, 2009 * Time: 1:54:54 PM - * To change this template use File | Settings | File Templates. */ -public class SWPairwiseAlignment { +public final class SWPairwiseAlignment { private int alignment_offset; // offset of s2 w/respect to s1 private Cigar alignmentCigar; @@ -54,24 +60,11 @@ public class SWPairwiseAlignment { private static final int DSTATE = 2; private static final int CLIP = 3; - private static boolean cutoff = false; + protected static boolean cutoff = false; private static boolean DO_SOFTCLIP = true; double[] SW; -// private double [] best_gap_v ; -// private int [] gap_size_v ; -// private double [] best_gap_h ; -// private int [] gap_size_h ; - - - // private static double [][] sw = new double[500][500]; - // private static int [][] btrack = new int[500][500]; - - // ************************************************************************ - // **** IMPORTANT NOTE: **** - // **** This class assumes that all bytes come from UPPERCASED chars! **** - // ************************************************************************ public SWPairwiseAlignment(byte[] seq1, byte[] seq2, double match, double mismatch, double open, double extend ) { w_match = match; w_mismatch = mismatch; @@ -80,12 +73,10 @@ public class SWPairwiseAlignment { align(seq1,seq2); } - public SWPairwiseAlignment(byte[] seq1, byte[] seq2) { this(seq1,seq2,1.0,-1.0/3.0,-1.0-1.0/3.0,-1.0/3.0); // match=1, mismatch = -1/3, gap=-(1+k/3) } - public Cigar getCigar() { return alignmentCigar ; } public int getAlignmentStart2wrt1() { return alignment_offset; } @@ -97,13 +88,6 @@ public class SWPairwiseAlignment { SW = sw; int [] btrack = new int[(n+1)*(m+1)]; -// best_gap_v = new double[m+1]; -// Arrays.fill(best_gap_v,-1.0e40); -// gap_size_v = new int[m+1]; -// best_gap_h = new double[n+1]; -// Arrays.fill(best_gap_h,-1.0e40); -// gap_size_h = new int[n+1]; - calculateMatrix(a, b, sw, btrack); calculateCigar(n, m, sw, btrack); // length of the segment (continuous matches, insertions or deletions) } @@ -169,18 +153,6 @@ public class SWPairwiseAlignment { final double step_down = best_gap_v[j] ; final int kd = gap_size_v[j]; -/* - for ( int k = 1, data_offset_k = data_offset_1+1 ; k < i ; k++, data_offset_k -= m ) { - // data_offset_k is linearized offset of element [i-k][j] - // in other words, trial = sw[i-k][j]+gap_penalty: - final double trial = sw[data_offset_k]+wk(k); - if ( step_down < trial ) { - step_down=trial; - kd = k; - } - } -*/ - // optimized "traversal" of all the matrix cells to the left of the current one (i.e. traversing // all 'step right' events that would end in the current cell. The optimized code // does exactly the same thing as the commented out loop below. IMPORTANT: @@ -202,21 +174,6 @@ public class SWPairwiseAlignment { final double step_right = best_gap_h[i]; final int ki = gap_size_h[i]; -/* - for ( int k = 1, data_offset = row_offset+j-1 ; k < j ; k++, data_offset-- ) { - // data_offset is linearized offset of element [i][j-k] - // in other words, step_right=sw[i][j-k]+gap_penalty; - final double trial = sw[data_offset]+wk(k); - if ( step_right < trial ) { - step_right=trial; - ki = k; - } - } - - final int data_offset = row_offset + j; // linearized offset of element [i][j] -*/ - - if ( step_down > step_right ) { if ( step_down > step_diag ) { sw[data_offset] = Math.max(MATRIX_MIN_CUTOFF,step_down); @@ -235,8 +192,6 @@ public class SWPairwiseAlignment { btrack[data_offset] = 0; // 0 = diagonal } } - -// sw[data_offset] = Math.max(0, Math.max(step_diag,Math.max(step_down,step_right))); } // IMPORTANT, IMPORTANT, IMPORTANT: @@ -245,7 +200,6 @@ public class SWPairwiseAlignment { // in the for() statement itself. row_offset_1 = row_offset; } -// print(sw,a,b); } @@ -271,12 +225,10 @@ public class SWPairwiseAlignment { if ( sw[data_offset] > maxscore || sw[data_offset] == maxscore && Math.abs(n-j) < Math.abs(p1 - p2)) { p1 = n; p2 = j ; -// maxscore = sw[n][j]; maxscore = sw[data_offset]; segment_length = m - j ; // end of sequence 2 is overhanging; we will just record it as 'M' segment } } -// System.out.println(" Found max score="+maxscore+" at p1="+p1+ " p2="+p2); List lce = new ArrayList(5); @@ -291,16 +243,12 @@ public class SWPairwiseAlignment { int state = MSTATE; int data_offset = p1*(m+1)+p2; // offset of element [p1][p2] - // System.out.println("Backtracking: starts at "+p1+":"+p2+" ("+sw[data_offset]+")"); do { -// int btr = btrack[p1][p2]; int btr = btrack[data_offset]; int new_state; int step_length = 1; - // System.out.print(" backtrack value: "+btr); - if ( btr > 0 ) { new_state = DSTATE; step_length = btr; @@ -309,25 +257,16 @@ public class SWPairwiseAlignment { step_length = (-btr); } else new_state = MSTATE; // and step_length =1, already set above - // move to next best location in the sw matrix: switch( new_state ) { case MSTATE: data_offset -= (m+2); p1--; p2--; break; // move back along the diag in th esw matrix case ISTATE: data_offset -= step_length; p2 -= step_length; break; // move left case DSTATE: data_offset -= (m+1)*step_length; p1 -= step_length; break; // move up } - // System.out.println("; backtracked to p1="+p1+" p2="+p2); - /* - switch( new_state ) { - case MSTATE: System.out.println(" diag (match) to "+ sw[data_offset]); break; // equivalent to p1--; p2-- - case ISTATE: System.out.println(" left (insertion, "+step_length+") to "+ sw[data_offset]); break; // equivalent to p2-=step_length; - case DSTATE: System.out.println(" up (deletion, "+step_length+") to "+ sw[data_offset]); break; // equivalent to p1 -= step_up - } - */ + // now let's see if the state actually changed: if ( new_state == state ) segment_length+=step_length; else { -// System.out.println(" emitting "+segment_length+makeElement(state,segment_length).getOperator().toString()); // state changed, lets emit previous segment, whatever it was (Insertion Deletion, or (Mis)Match). lce.add(makeElement(state, segment_length)); segment_length = step_length; @@ -354,11 +293,9 @@ public class SWPairwiseAlignment { } Collections.reverse(lce); - alignmentCigar = new Cigar(lce); - + alignmentCigar = AlignmentUtils.consolidateCigar(new Cigar(lce)); } - private CigarElement makeElement(int state, int segment_length) { CigarOperator o = null; switch(state) { @@ -374,33 +311,11 @@ public class SWPairwiseAlignment { return (x == y ? w_match : w_mismatch); } - private double wk(int k) { - return w_open+(k-1)*w_extend; // gap - } - - private void print(double[] s, byte[] a, byte[] b) { - int n = a.length+1; - int m = b.length+1; - System.out.print(" "); - for ( int j = 1 ; j < m ; j++) System.out.printf(" %5c",(char)b[j-1]) ; - System.out.println(); - - for ( int i = 0, row_offset = 0 ; i < n ; i++, row_offset+=m) { - if ( i > 0 ) System.out.print((char)a[i-1]); - else System.out.print(' '); - System.out.print(" "); - for ( int j = 0; j < m ; j++ ) { - System.out.printf(" %5.1f",s[row_offset+j]); - } - System.out.println(); - } - } - - static void printAlignment(SWPairwiseAlignment a, byte[] ref, byte[] read) { - printAlignment(a,ref,read,100); + public void printAlignment(byte[] ref, byte[] read) { + printAlignment(ref,read,100); } - static void printAlignment(SWPairwiseAlignment a, byte[] ref, byte[] read, int width) { + public void printAlignment(byte[] ref, byte[] read, int width) { StringBuilder bread = new StringBuilder(); StringBuilder bref = new StringBuilder(); StringBuilder match = new StringBuilder(); @@ -408,9 +323,9 @@ public class SWPairwiseAlignment { int i = 0; int j = 0; - final int offset = a.getAlignmentStart2wrt1(); + final int offset = getAlignmentStart2wrt1(); - Cigar cigar = a.getCigar(); + Cigar cigar = getCigar(); if ( ! DO_SOFTCLIP ) { @@ -436,7 +351,7 @@ public class SWPairwiseAlignment { } if ( offset > 0 ) { // note: the way this implementation works, cigar will ever start from S *only* if read starts before the ref, i.e. offset = 0 - for ( ; i < a.getAlignmentStart2wrt1() ; i++ ) { + for ( ; i < getAlignmentStart2wrt1() ; i++ ) { bref.append((char)ref[i]); bread.append(' '); match.append(' '); @@ -506,280 +421,5 @@ public class SWPairwiseAlignment { } int end = Math.min(start+width,s.length()); System.out.println(s.substring(start,end)); - } - -// BELOW: main() method for testing; old implementations of the core methods are commented out below; -// uncomment everything through the end of the file if benchmarking of new vs old implementations is needed. - - public static void main(String argv[]) { -// String ref="CACGAGCATATGTGTACATGAATTTGTATTGCACATGTGTTTAATGCGAACACGTGTCATGTGTATGTGTTCACATGCATGTGTGTCT"; -// String read = "GCATATGTTTACATGAATTTGTATTGCACATGTGTTTAATGCGAACACGTGTCATGTGTGTGTTCACATGCATGTG"; - - String ref = null; - String read = null; - - Map> args = processArgs(argv); - - List l = args.get("SEQ"); - args.remove("SEQ"); - if ( l == null ) { - System.err.println("SEQ argument is missing. Two input sequences must be provided"); - System.exit(1); - } - if ( l.size() != 2 ) { - System.err.println("Two input sequences (SEQ arguments) must be provided. Found "+l.size()+" instead"); - System.exit(1); - } - - ref = l.get(0); - read = l.get(1); - - Double m = extractSingleDoubleArg("MATCH",args); - Double mm = extractSingleDoubleArg("MISMATCH",args); - Double open = extractSingleDoubleArg("OPEN",args); - Double ext = extractSingleDoubleArg("EXTEND",args); - - Boolean reverse = extractSingleBooleanArg("REVERSE",args); - if ( reverse != null && reverse.booleanValue() == true ) { - ref = Utils.reverse(ref); - read = Utils.reverse(read); - } - - Boolean print_mat = extractSingleBooleanArg("PRINT_MATRIX",args); - Boolean cut = extractSingleBooleanArg("CUTOFF",args); - if ( cut != null ) SWPairwiseAlignment.cutoff = cut; - - if ( args.size() != 0 ) { - System.err.println("Unknown argument on the command line: "+args.keySet().iterator().next()); - System.exit(1); - } - - double w_match; - double w_mismatch; - double w_open; - double w_extend; - - w_match = (m == null ? 30.0 : m.doubleValue()); - w_mismatch = (mm == null ? -10.0 : mm.doubleValue()); - w_open = (open == null ? -10.0 : open.doubleValue()); - w_extend = (ext == null ? -2.0 : ext.doubleValue()); - - - SWPairwiseAlignment a = new SWPairwiseAlignment(ref.getBytes(),read.getBytes(),w_match,w_mismatch,w_open,w_extend); - - System.out.println("start="+a.getAlignmentStart2wrt1()+", cigar="+a.getCigar()+ - " length1="+ref.length()+" length2="+read.length()); - - - System.out.println(); - printAlignment(a,ref.getBytes(),read.getBytes()); - - System.out.println(); - if ( print_mat != null && print_mat == true ) { - a.print(a.SW,ref.getBytes(),read.getBytes()); - } - } - - - static Pair getArg(String prefix, String argv[], int i) { - String arg = null; - if ( argv[i].startsWith(prefix) ) { - arg = argv[i].substring(prefix.length()); - if( arg.length() == 0 ) { - i++; - if ( i < argv.length ) arg = argv[i]; - else { - System.err.println("No value found after " + prefix + " argument tag"); - System.exit(1); - } - } - i++; - } - return new Pair(arg,i); - } - - static Map> processArgs(String argv[]) { - Map> args = new HashMap>(); - - for ( int i = 0; i < argv.length ; i++ ) { - String arg = argv[i]; - int pos = arg.indexOf('='); - if ( pos < 0 ) { - System.err.println("Argument "+arg+" is not of the form ="); - System.exit(1); - } - String val = arg.substring(pos+1); - if ( val.length() == 0 ) { - // there was a space between '=' and the value - i++; - if ( i < argv.length ) val = argv[i]; - else { - System.err.println("No value found after " + arg + " argument tag"); - System.exit(1); - } - } - arg = arg.substring(0,pos); - - List l = args.get(arg); - if ( l == null ) { - l = new ArrayList(); - args.put(arg,l); - } - l.add(val); - } - return args; - } - - static Double extractSingleDoubleArg(String argname, Map> args) { - List l = args.get(argname); - args.remove(argname); - if ( l == null ) return null; - - if ( l.size() > 1 ) { - System.err.println("Only one "+argname+" argument is allowed"); - System.exit(1); - } - double d=0; - try { - d = Double.parseDouble(l.get(0)); - } catch ( NumberFormatException e) { - System.err.println("Can not parse value provided for "+argname+" argument ("+l.get(0)+")"); - System.exit(1); - } - System.out.println("Argument "+argname+" set to "+d); - return new Double(d); - } - - - static Boolean extractSingleBooleanArg(String argname, Map> args) { - List l = args.get(argname); - args.remove(argname); - if ( l == null ) return null; - - if ( l.size() > 1 ) { - System.err.println("Only one "+argname+" argument is allowed"); - System.exit(1); - } - if ( l.get(0).equals("true") ) return Boolean.valueOf(true); - if ( l.get(0).equals("false") ) return Boolean.valueOf(false); - System.err.println("Can not parse value provided for "+argname+" argument ("+l.get(0)+"); true/false are allowed"); - System.exit(1); - return Boolean.valueOf(false); // This value isn't used because it is preceded by System.exit(1) - } - -/* ############################################## - public SWPairwiseAlignment(byte[] seq1, byte[] seq2, double match, double mismatch, double open, double extend, boolean runOld ) { - w_match = match; - w_mismatch = mismatch; - w_open = open; - w_extend = extend; - if ( runOld ) align_old(seq1,seq2); - else align(seq1,seq2); - } - - public SWPairwiseAlignment(byte[] seq1, byte[] seq2, boolean runOld) { - this(seq1,seq2,1.0,-1.0/3.0,-1.0-1.0/3.0,-1.0/3.0,runOld); // match=1, mismatch = -1/3, gap=-(1+k/3) - } - - public void align_old(final byte[] a, final byte[] b) { - final int n = a.length; - final int m = b.length; - double [] sw = new double[(n+1)*(m+1)]; - int [] btrack = new int[(n+1)*(m+1)]; - calculateMatrix_old(a, b, sw, btrack); - calculateCigar(n, m, sw, btrack); // length of the segment (continuous matches, insertions or deletions) - } - - private void calculateMatrix_old(final byte[] a, final byte[] b, double [] sw, int [] btrack ) { - final int n = a.length+1; - final int m = b.length+1; - - // build smith-waterman matrix and keep backtrack info: - for ( int i = 1, row_offset_1 = 0 ; i < n ; i++ ) { // we do NOT update row_offset_1 here, see comment at the end of this outer loop - byte a_base = a[i-1]; // letter in a at the current pos - - final int row_offset = row_offset_1 + m; - - // On the entrance into the loop, row_offset_1 is the (linear) offset - // of the first element of row (i-1) and row_offset is the linear offset of the - // start of row i - - for ( int j = 1, data_offset_1 = row_offset_1 ; j < m ; j++, data_offset_1++ ) { - - // data_offset_1 is linearized offset of element [i-1][j-1] - - final byte b_base = b[j-1]; // letter in b at the current pos - - // in other words, step_diag = sw[i-1][j-1] + wd(a_base,b_base); - double step_diag = sw[data_offset_1] + wd(a_base,b_base); - int kd = 0; - - double step_down = 0; - - for ( int k = 1, data_offset_k = data_offset_1+1 ; k < i ; k++, data_offset_k -= m ) { - // data_offset_k is linearized offset of element [i-k][j] - // in other words, trial = sw[i-k][j]+gap_penalty: - final double trial = sw[data_offset_k]+wk(k); - if ( step_down < trial ) { - step_down=trial; - kd = k; - } - } - - int ki = 0; - - // optimized "traversal" of all the matrix cells to the left of the current one (i.e. traversing - // all 'step right' events that would end in the current cell. The optimized code - // does exactly the same thing as the commented out loop below. IMPORTANT: - // the optimization works ONLY for linear w(k)=wopen+(k-1)*wextend!!!! - - double step_right = 0; - - for ( int k = 1, data_offset = row_offset+j-1 ; k < j ; k++, data_offset-- ) { - // data_offset is linearized offset of element [i][j-k] - // in other words, step_right=sw[i][j-k]+gap_penalty; - final double trial = sw[data_offset]+wk(k); - if ( step_right < trial ) { - step_right=trial; - ki = k; - } - } - - final int data_offset = row_offset + j; // linearized offset of element [i][j] - - if ( step_down > step_right ) { - if ( step_down > step_diag ) { - sw[data_offset] = Math.max(0,step_down); - btrack[data_offset] = kd ; // positive=vertical - } else { - sw[data_offset] = Math.max(0,step_diag); - btrack[data_offset] = 0; // 0 = diagonal - } - } else { - // step_down <= step_right - if ( step_right > step_diag ) { - sw[data_offset] = Math.max(0,step_right); - btrack[data_offset] = -ki; // negative = horizontal - } else { - sw[data_offset] = Math.max(0,step_diag); - btrack[data_offset] = 0; // 0 = diagonal - } - } - -// sw[data_offset] = Math.max(0, Math.max(step_diag,Math.max(step_down,step_right))); - } - - // IMPORTANT, IMPORTANT, IMPORTANT: - // note that we update this (secondary) outer loop variable here, - // so that we DO NOT need to update it - // in the for() statement itself. - row_offset_1 = row_offset; - } -// print(sw,a,b); - } -##################### -END COMMENTED OUT SECTION -*/ - } diff --git a/public/java/src/org/broadinstitute/sting/utils/SWPairwiseAlignmentMain.java b/public/java/src/org/broadinstitute/sting/utils/SWPairwiseAlignmentMain.java new file mode 100644 index 000000000..a49d7e5e6 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/SWPairwiseAlignmentMain.java @@ -0,0 +1,222 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils; + +import org.broadinstitute.sting.utils.collections.Pair; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +/** + * Simple program to run SW performance test. + * + * // TODO -- should be replaced with Caliper before using again + * + * User: depristo + * Date: 2/28/13 + * Time: 4:54 PM + * To change this template use File | Settings | File Templates. + */ +public class SWPairwiseAlignmentMain { + // BELOW: main() method for testing; old implementations of the core methods are commented out below; +// uncomment everything through the end of the file if benchmarking of new vs old implementations is needed. + + public static void main(String argv[]) { +// String ref="CACGAGCATATGTGTACATGAATTTGTATTGCACATGTGTTTAATGCGAACACGTGTCATGTGTATGTGTTCACATGCATGTGTGTCT"; +// String read = "GCATATGTTTACATGAATTTGTATTGCACATGTGTTTAATGCGAACACGTGTCATGTGTGTGTTCACATGCATGTG"; + + String ref = null; + String read = null; + + Map> args = processArgs(argv); + + List l = args.get("SEQ"); + args.remove("SEQ"); + if ( l == null ) { + System.err.println("SEQ argument is missing. Two input sequences must be provided"); + System.exit(1); + } + if ( l.size() != 2 ) { + System.err.println("Two input sequences (SEQ arguments) must be provided. Found "+l.size()+" instead"); + System.exit(1); + } + + ref = l.get(0); + read = l.get(1); + + Double m = extractSingleDoubleArg("MATCH",args); + Double mm = extractSingleDoubleArg("MISMATCH",args); + Double open = extractSingleDoubleArg("OPEN",args); + Double ext = extractSingleDoubleArg("EXTEND",args); + + Boolean reverse = extractSingleBooleanArg("REVERSE",args); + if ( reverse != null && reverse.booleanValue() == true ) { + ref = Utils.reverse(ref); + read = Utils.reverse(read); + } + + Boolean print_mat = extractSingleBooleanArg("PRINT_MATRIX",args); + Boolean cut = extractSingleBooleanArg("CUTOFF",args); + if ( cut != null ) SWPairwiseAlignment.cutoff = cut; + + if ( args.size() != 0 ) { + System.err.println("Unknown argument on the command line: "+args.keySet().iterator().next()); + System.exit(1); + } + + double w_match; + double w_mismatch; + double w_open; + double w_extend; + + w_match = (m == null ? 30.0 : m.doubleValue()); + w_mismatch = (mm == null ? -10.0 : mm.doubleValue()); + w_open = (open == null ? -10.0 : open.doubleValue()); + w_extend = (ext == null ? -2.0 : ext.doubleValue()); + + + SWPairwiseAlignment a = new SWPairwiseAlignment(ref.getBytes(),read.getBytes(),w_match,w_mismatch,w_open,w_extend); + + System.out.println("start="+a.getAlignmentStart2wrt1()+", cigar="+a.getCigar()+ + " length1="+ref.length()+" length2="+read.length()); + + + System.out.println(); + a.printAlignment(ref.getBytes(),read.getBytes()); + + System.out.println(); + if ( print_mat != null && print_mat == true ) { + print(a.SW,ref.getBytes(),read.getBytes()); + } + } + + private static void print(double[] s, byte[] a, byte[] b) { + int n = a.length+1; + int m = b.length+1; + System.out.print(" "); + for ( int j = 1 ; j < m ; j++) System.out.printf(" %5c",(char)b[j-1]) ; + System.out.println(); + + for ( int i = 0, row_offset = 0 ; i < n ; i++, row_offset+=m) { + if ( i > 0 ) System.out.print((char)a[i-1]); + else System.out.print(' '); + System.out.print(" "); + for ( int j = 0; j < m ; j++ ) { + System.out.printf(" %5.1f",s[row_offset+j]); + } + System.out.println(); + } + } + + + static Pair getArg(String prefix, String argv[], int i) { + String arg = null; + if ( argv[i].startsWith(prefix) ) { + arg = argv[i].substring(prefix.length()); + if( arg.length() == 0 ) { + i++; + if ( i < argv.length ) arg = argv[i]; + else { + System.err.println("No value found after " + prefix + " argument tag"); + System.exit(1); + } + } + i++; + } + return new Pair(arg,i); + } + + static Map> processArgs(String argv[]) { + Map> args = new HashMap>(); + + for ( int i = 0; i < argv.length ; i++ ) { + String arg = argv[i]; + int pos = arg.indexOf('='); + if ( pos < 0 ) { + System.err.println("Argument "+arg+" is not of the form ="); + System.exit(1); + } + String val = arg.substring(pos+1); + if ( val.length() == 0 ) { + // there was a space between '=' and the value + i++; + if ( i < argv.length ) val = argv[i]; + else { + System.err.println("No value found after " + arg + " argument tag"); + System.exit(1); + } + } + arg = arg.substring(0,pos); + + List l = args.get(arg); + if ( l == null ) { + l = new ArrayList(); + args.put(arg,l); + } + l.add(val); + } + return args; + } + + static Double extractSingleDoubleArg(String argname, Map> args) { + List l = args.get(argname); + args.remove(argname); + if ( l == null ) return null; + + if ( l.size() > 1 ) { + System.err.println("Only one "+argname+" argument is allowed"); + System.exit(1); + } + double d=0; + try { + d = Double.parseDouble(l.get(0)); + } catch ( NumberFormatException e) { + System.err.println("Can not parse value provided for "+argname+" argument ("+l.get(0)+")"); + System.exit(1); + } + System.out.println("Argument "+argname+" set to "+d); + return new Double(d); + } + + + static Boolean extractSingleBooleanArg(String argname, Map> args) { + List l = args.get(argname); + args.remove(argname); + if ( l == null ) return null; + + if ( l.size() > 1 ) { + System.err.println("Only one "+argname+" argument is allowed"); + System.exit(1); + } + if ( l.get(0).equals("true") ) return Boolean.valueOf(true); + if ( l.get(0).equals("false") ) return Boolean.valueOf(false); + System.err.println("Can not parse value provided for "+argname+" argument ("+l.get(0)+"); true/false are allowed"); + System.exit(1); + return Boolean.valueOf(false); // This value isn't used because it is preceded by System.exit(1) + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/Utils.java b/public/java/src/org/broadinstitute/sting/utils/Utils.java index d009ba5bc..45a2fa58d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/Utils.java +++ b/public/java/src/org/broadinstitute/sting/utils/Utils.java @@ -54,6 +54,17 @@ public class Utils { public static final float JAVA_DEFAULT_HASH_LOAD_FACTOR = 0.75f; + /** + * Boolean xor operation. Only true if x != y. + * + * @param x a boolean + * @param y a boolean + * @return true if x != y + */ + public static boolean xor(final boolean x, final boolean y) { + return x != y; + } + /** * Calculates the optimum initial size for a hash table given the maximum number * of elements it will need to hold. The optimum size is the smallest size that diff --git a/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java b/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java index cc4fc6129..5e010db67 100644 --- a/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java +++ b/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java @@ -203,12 +203,32 @@ public class PerReadAlleleLikelihoodMap { */ @Ensures("result != null") public static Allele getMostLikelyAllele( final Map alleleMap ) { + return getMostLikelyAllele(alleleMap, null); + } + + /** + * Given a map from alleles to likelihoods, find the allele with the largest likelihood. + * If the difference between the most-likely allele and the next-most-likely allele is < INFORMATIVE_LIKELIHOOD_THRESHOLD + * then the most likely allele is set to "no call" + * + * @param alleleMap - a map from alleles to likelihoods + * @param onlyConsiderTheseAlleles if not null, we will only consider alleles in this set for being one of the best. + * this is useful for the case where you've selected a subset of the alleles that + * the reads have been computed for further analysis. If null totally ignored + * @return - the most likely allele, or NO_CALL if two or more alleles have likelihoods within INFORMATIVE_LIKELIHOOD_THRESHOLD + * of one another. By default empty allele maps will return NO_CALL, and allele maps with a single entry will return the + * corresponding key + */ + public static Allele getMostLikelyAllele( final Map alleleMap, final Set onlyConsiderTheseAlleles ) { if ( alleleMap == null ) throw new IllegalArgumentException("The allele to likelihood map cannot be null"); double maxLike = Double.NEGATIVE_INFINITY; double prevMaxLike = Double.NEGATIVE_INFINITY; Allele mostLikelyAllele = Allele.NO_CALL; for (final Map.Entry el : alleleMap.entrySet()) { + if ( onlyConsiderTheseAlleles != null && ! onlyConsiderTheseAlleles.contains(el.getKey()) ) + continue; + if (el.getValue() > maxLike) { prevMaxLike = maxLike; maxLike = el.getValue(); @@ -220,7 +240,6 @@ public class PerReadAlleleLikelihoodMap { return (maxLike - prevMaxLike > INFORMATIVE_LIKELIHOOD_THRESHOLD ? mostLikelyAllele : Allele.NO_CALL ); } - /** * Debug method to dump contents of object into string for display */ diff --git a/public/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/AllHaplotypeBAMWriter.java b/public/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/AllHaplotypeBAMWriter.java new file mode 100644 index 000000000..46ffd43b6 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/AllHaplotypeBAMWriter.java @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.haplotypeBAMWriter; + +import net.sf.samtools.*; +import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.Haplotype; +import org.broadinstitute.sting.utils.SWPairwiseAlignment; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.variant.variantcontext.Allele; + +import java.util.*; + +/** + * A haplotype bam writer that writes out all haplotypes as reads and then + * the alignment of reach read to its best match among the best haplotypes. + * + * Primarily useful for people working on the HaplotypeCaller method itself + * + * User: depristo + * Date: 2/22/13 + * Time: 1:50 PM + */ +class AllHaplotypeBAMWriter extends HaplotypeBAMWriter { + public AllHaplotypeBAMWriter(final SAMFileWriter bamWriter) { + super(bamWriter); + } + + /** + * {@inheritDoc} + */ + @Override + public void writeReadsAlignedToHaplotypes(final List haplotypes, + final GenomeLoc paddedReferenceLoc, + final List bestHaplotypes, + final Set calledHaplotypes, + final Map stratifiedReadMap) { + writeHaplotypesAsReads(haplotypes, new HashSet(bestHaplotypes), paddedReferenceLoc); + + // we need to remap the Alleles back to the Haplotypes; inefficient but unfortunately this is a requirement currently + final Map alleleToHaplotypeMap = new HashMap(haplotypes.size()); + for ( final Haplotype haplotype : haplotypes ) + alleleToHaplotypeMap.put(Allele.create(haplotype.getBases()), haplotype); + + // next, output the interesting reads for each sample aligned against the appropriate haplotype + for ( final PerReadAlleleLikelihoodMap readAlleleLikelihoodMap : stratifiedReadMap.values() ) { + for ( Map.Entry> entry : readAlleleLikelihoodMap.getLikelihoodReadMap().entrySet() ) { + final Allele bestAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(entry.getValue()); + if ( bestAllele != Allele.NO_CALL ) + writeReadAgainstHaplotype(entry.getKey(), alleleToHaplotypeMap.get(bestAllele), paddedReferenceLoc.getStart()); + } + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/CalledHaplotypeBAMWriter.java b/public/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/CalledHaplotypeBAMWriter.java new file mode 100644 index 000000000..a33ed809a --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/CalledHaplotypeBAMWriter.java @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.haplotypeBAMWriter; + +import net.sf.samtools.SAMFileWriter; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.Haplotype; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.variant.variantcontext.Allele; + +import java.util.*; + +/** + * Writes a BAM containing just the reads in stratifiedReadMap aligned to their + * most likely haplotype among all of the called haplotypes. + * + * Primarily useful for users of the HaplotypeCaller who want to better understand the + * support of their calls w.r.t. the reads. + * + * User: depristo + * Date: 2/22/13 + * Time: 1:50 PM + */ +class CalledHaplotypeBAMWriter extends HaplotypeBAMWriter { + public CalledHaplotypeBAMWriter(final SAMFileWriter bamWriter) { + super(bamWriter); + } + + /** + * {@inheritDoc} + */ + @Override + public void writeReadsAlignedToHaplotypes(final List haplotypes, + final GenomeLoc paddedReferenceLoc, + final List bestHaplotypes, + final Set calledHaplotypes, + final Map stratifiedReadMap) { + if ( calledHaplotypes.isEmpty() ) // only write out called haplotypes + return; + + writeHaplotypesAsReads(calledHaplotypes, calledHaplotypes, paddedReferenceLoc); + + // we need to remap the Alleles back to the Haplotypes; inefficient but unfortunately this is a requirement currently + final Map alleleToHaplotypeMap = new HashMap(haplotypes.size()); + for ( final Haplotype haplotype : calledHaplotypes ) { + alleleToHaplotypeMap.put(Allele.create(haplotype.getBases()), haplotype); + } + + // the set of all alleles that were actually called + final Set allelesOfCalledHaplotypes = alleleToHaplotypeMap.keySet(); + + // next, output the interesting reads for each sample aligned against one of the called haplotypes + for ( final PerReadAlleleLikelihoodMap readAlleleLikelihoodMap : stratifiedReadMap.values() ) { + for ( Map.Entry> entry : readAlleleLikelihoodMap.getLikelihoodReadMap().entrySet() ) { + if ( entry.getKey().getMappingQuality() > 0 ) { + final Allele bestAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(entry.getValue(), allelesOfCalledHaplotypes); + if ( bestAllele != Allele.NO_CALL ) + writeReadAgainstHaplotype(entry.getKey(), alleleToHaplotypeMap.get(bestAllele), paddedReferenceLoc.getStart()); + } + } + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriter.java b/public/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriter.java new file mode 100644 index 000000000..c0d3b38fa --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriter.java @@ -0,0 +1,282 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.utils.haplotypeBAMWriter; + +import net.sf.samtools.*; +import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.Haplotype; +import org.broadinstitute.sting.utils.SWPairwiseAlignment; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.sting.utils.sam.AlignmentUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; + +import java.util.*; + +/** + * A BAMWriter that aligns reads to haplotypes and emits their best alignments to a BAM file + * + * User: depristo + * Date: 2/22/13 + * Time: 2:59 PM + */ +public abstract class HaplotypeBAMWriter { + /** + * Allows us to write out unique names for our synthetic haplotype reads + */ + private long uniqueNameCounter = 1; + + protected final static String READ_GROUP_ID = "ArtificialHaplotype"; + protected final static String HAPLOTYPE_TAG = "HC"; + + final SAMFileWriter bamWriter; + final SAMFileHeader bamHeader; + + /** + * Possible modes for writing haplotypes to BAMs + */ + public static enum Type { + /** + * A mode that's for method developers. Writes out all of the possible + * haplotypes considered, as well as reads aligned to each + */ + ALL_POSSIBLE_HAPLOTYPES, + + /** + * A mode for users. Writes out the reads aligned only to the called + * haplotypes. Useful to understand why the caller is calling what it is + */ + CALLED_HAPLOTYPES + } + + /** + * Create a new HaplotypeBAMWriter of type writing SAMRecords to writer + * + * @param type the type of the writer we want to create + * @param stingSAMWriter the destination, must not be null + * @param header the header of the input BAMs used to make calls, must not be null + * @return a new HaplotypeBAMWriter + */ + public static HaplotypeBAMWriter create(final Type type, final StingSAMFileWriter stingSAMWriter, final SAMFileHeader header) { + if ( header == null ) throw new IllegalArgumentException("header cannot be null"); + if ( stingSAMWriter == null ) throw new IllegalArgumentException("writer cannot be null"); + if ( type == null ) throw new IllegalArgumentException("type cannot be null"); + + // prepare the bam header + final SAMFileHeader bamHeader = new SAMFileHeader(); + bamHeader.setSequenceDictionary(header.getSequenceDictionary()); + bamHeader.setSortOrder(SAMFileHeader.SortOrder.coordinate); + + // include the original read groups plus a new artificial one for the haplotypes + final List readGroups = new ArrayList(header.getReadGroups()); + final SAMReadGroupRecord rg = new SAMReadGroupRecord(READ_GROUP_ID); + rg.setSample("HC"); + rg.setSequencingCenter("BI"); + readGroups.add(rg); + bamHeader.setReadGroups(readGroups); + + // TODO -- this will be a performance problem at high-scale + stingSAMWriter.setPresorted(false); + stingSAMWriter.writeHeader(bamHeader); + return create(type, stingSAMWriter); + } + + /** + * Create a new HaplotypeBAMWriter of type writing SAMRecords to writer + * + * Note that writer must have its presorted bit set to false, as reads + * may come in out of order during writing + * + * @param type the type of the writer we want to create + * @param writer the destination, must not be null + * @return a new HaplotypeBAMWriter + */ + public static HaplotypeBAMWriter create(final Type type, final SAMFileWriter writer) { + if ( writer == null ) throw new IllegalArgumentException("writer cannot be null"); + if ( type == null ) throw new IllegalArgumentException("type cannot be null"); + + switch ( type ) { + case ALL_POSSIBLE_HAPLOTYPES: return new AllHaplotypeBAMWriter(writer); + case CALLED_HAPLOTYPES: return new CalledHaplotypeBAMWriter(writer); + default: throw new IllegalArgumentException("Unknown type " + type); + } + } + + /** + * Create a new HaplotypeBAMWriter writing its output to bamWriter + * + * Assumes that the header has been fully initialized with a single + * read group READ_GROUP_ID + * + * @param bamWriter our output destination + */ + protected HaplotypeBAMWriter(SAMFileWriter bamWriter) { + this.bamWriter = bamWriter; + this.bamHeader = bamWriter.getFileHeader(); + } + + /** + * Write out a BAM representing for the haplotype caller at this site + * + * @param haplotypes a list of all possible haplotypes at this loc + * @param paddedReferenceLoc the span of the based reference here + * @param bestHaplotypes a list of the best (a subset of all) haplotypes that actually went forward into genotyping + * @param calledHaplotypes a list of the haplotypes at where actually called as non-reference + * @param stratifiedReadMap a map from sample -> likelihoods for each read for each of the best haplotypes + */ + public abstract void writeReadsAlignedToHaplotypes(final List haplotypes, + final GenomeLoc paddedReferenceLoc, + final List bestHaplotypes, + final Set calledHaplotypes, + final Map stratifiedReadMap); + + /** + * Write out read aligned to haplotype to the BAM file + * + * Aligns reads the haplotype, and then projects this alignment of read -> hap onto the reference + * via the alignment of haplotype (via its getCigar) method. + * + * @param originalRead the read we want to write aligned to the reference genome + * @param haplotype the haplotype that the read should be aligned to, before aligning to the reference + * @param referenceStart the start of the reference that haplotype is aligned to. Provides global coordinate frame. + */ + protected void writeReadAgainstHaplotype(final GATKSAMRecord originalRead, + final Haplotype haplotype, + final int referenceStart) { + final GATKSAMRecord alignedToRef = createReadAlignedToRef(originalRead, haplotype, referenceStart); + if ( alignedToRef != null ) + bamWriter.addAlignment(alignedToRef); + } + + /** + * Aligns reads the haplotype, and then projects this alignment of read -> hap onto the reference + * via the alignment of haplotype (via its getCigar) method. + * + * @param originalRead the read we want to write aligned to the reference genome + * @param haplotype the haplotype that the read should be aligned to, before aligning to the reference + * @param referenceStart the start of the reference that haplotype is aligned to. Provides global coordinate frame. + * @return a GATKSAMRecord aligned to reference, or null if no meaningful alignment is possible + */ + protected GATKSAMRecord createReadAlignedToRef(final GATKSAMRecord originalRead, + final Haplotype haplotype, + final int referenceStart) { + if ( originalRead == null ) throw new IllegalArgumentException("originalRead cannot be null"); + if ( haplotype == null ) throw new IllegalArgumentException("haplotype cannot be null"); + if ( haplotype.getCigar() == null ) throw new IllegalArgumentException("Haplotype cigar not set " + haplotype); + if ( referenceStart < 1 ) throw new IllegalArgumentException("reference start much be >= 1 but got " + referenceStart); + + try { + // compute the smith-waterman alignment of read -> haplotype + final SWPairwiseAlignment swPairwiseAlignment = new SWPairwiseAlignment(haplotype.getBases(), originalRead.getReadBases(), 5.0, -10.0, -22.0, -1.2); + //swPairwiseAlignment.printAlignment(haplotype.getBases(), originalRead.getReadBases()); + if ( swPairwiseAlignment.getAlignmentStart2wrt1() == -1 ) + // sw can fail (reasons not clear) so if it happens just don't write the read + return null; + final Cigar swCigar = AlignmentUtils.consolidateCigar(swPairwiseAlignment.getCigar()); + + // since we're modifying the read we need to clone it + final GATKSAMRecord read = (GATKSAMRecord)originalRead.clone(); + + addHaplotypeTag(read, haplotype); + + // compute here the read starts w.r.t. the reference from the SW result and the hap -> ref cigar + final Cigar extendedHaplotypeCigar = haplotype.getConsolidatedPaddedCigar(1000); + final int readStartOnHaplotype = AlignmentUtils.calcFirstBaseMatchingReferenceInCigar(extendedHaplotypeCigar, swPairwiseAlignment.getAlignmentStart2wrt1()); + final int readStartOnReference = referenceStart + haplotype.getAlignmentStartHapwrtRef() + readStartOnHaplotype; + read.setAlignmentStart(readStartOnReference); + + // compute the read -> ref alignment by mapping read -> hap -> ref from the + // SW of read -> hap mapped through the given by hap -> ref + final Cigar haplotypeToRef = AlignmentUtils.trimCigarByBases(extendedHaplotypeCigar, swPairwiseAlignment.getAlignmentStart2wrt1(), extendedHaplotypeCigar.getReadLength() - 1); + final Cigar readToRefCigarRaw = AlignmentUtils.applyCigarToCigar(swCigar, haplotypeToRef); + final Cigar readToRefCigarClean = AlignmentUtils.cleanUpCigar(readToRefCigarRaw); + final Cigar readToRefCigar = AlignmentUtils.leftAlignIndel(readToRefCigarClean, haplotype.getBases(), + originalRead.getReadBases(), swPairwiseAlignment.getAlignmentStart2wrt1(), 0, true); + + read.setCigar(readToRefCigar); + + if ( readToRefCigar.getReadLength() != read.getReadLength() ) + throw new IllegalStateException("Cigar " + readToRefCigar + " with read length " + readToRefCigar.getReadLength() + + " != read length " + read.getReadLength() + " for read " + read.format() + "\nhapToRef " + haplotypeToRef + " length " + haplotypeToRef.getReadLength() + "/" + haplotypeToRef.getReferenceLength() + + "\nreadToHap " + swCigar + " length " + swCigar.getReadLength() + "/" + swCigar.getReferenceLength()); + + return read; + } catch ( CloneNotSupportedException e ) { + throw new IllegalStateException("GATKSAMRecords should support clone but this one does not " + originalRead); + } + } + + /** + * Add a haplotype tag to the read based on haplotype + * + * @param read the read to add the tag to + * @param haplotype the haplotype that gives rises to read + */ + private void addHaplotypeTag(final GATKSAMRecord read, final Haplotype haplotype) { + // add a tag to the read that indicates which haplotype it best aligned to. It's a uniquish integer + read.setAttribute(HAPLOTYPE_TAG, haplotype.hashCode()); + } + + /** + * Write out haplotypes as reads to the BAM, marking specifically those that are among the best haplotypes + * + * @param haplotypes a collection of haplotypes to write to the BAM + * @param bestHaplotypes a subset of haplotypes that contains those that are best "either good or called" + * @param paddedReferenceLoc the genome loc of the padded reference + */ + protected void writeHaplotypesAsReads(final Collection haplotypes, + final Set bestHaplotypes, + final GenomeLoc paddedReferenceLoc) { + for ( final Haplotype haplotype : haplotypes ) + writeHaplotype(haplotype, paddedReferenceLoc, bestHaplotypes.contains(haplotype)); + } + + /** + * Write out a representation of this haplotype as a read + * + * @param haplotype a haplotype to write out. Cannot be null + * @param paddedRefLoc the reference location. Cannot be null + * @param isAmongBestHaplotypes true if among the best haplotypes, false if it was just one possible but not so good + */ + private void writeHaplotype(final Haplotype haplotype, + final GenomeLoc paddedRefLoc, + final boolean isAmongBestHaplotypes) { + final GATKSAMRecord record = new GATKSAMRecord(bamHeader); + record.setReadBases(haplotype.getBases()); + record.setAlignmentStart(paddedRefLoc.getStart() + haplotype.getAlignmentStartHapwrtRef()); + record.setBaseQualities(Utils.dupBytes((byte) '!', haplotype.getBases().length)); + record.setCigar(AlignmentUtils.consolidateCigar(haplotype.getCigar())); + record.setMappingQuality(isAmongBestHaplotypes ? 60 : 0); + record.setReadName("HC" + uniqueNameCounter++); + addHaplotypeTag(record, haplotype); + record.setReadUnmappedFlag(false); + record.setReferenceIndex(paddedRefLoc.getContigIndex()); + record.setAttribute(SAMTag.RG.toString(), READ_GROUP_ID); + record.setFlags(16); + bamWriter.addAlignment(record); + } +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java index d34e2996c..d59d0ef63 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java @@ -31,18 +31,17 @@ import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import net.sf.samtools.SAMRecord; +import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.recalibration.EventType; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.EnumSet; -import java.util.List; +import java.util.*; public final class AlignmentUtils { + private final static Logger logger = Logger.getLogger(AlignmentUtils.class); private final static EnumSet ALIGNED_TO_GENOME_OPERATORS = EnumSet.of(CigarOperator.M, CigarOperator.EQ, CigarOperator.X); private final static EnumSet ALIGNED_TO_GENOME_PLUS_SOFTCLIPS = EnumSet.of(CigarOperator.M, CigarOperator.EQ, CigarOperator.X, CigarOperator.S); @@ -58,6 +57,9 @@ public final class AlignmentUtils { return getMismatchCount(r, refSeq, refIndex).mismatchQualities; } + /** + * @see #getMismatchCount(GATKSAMRecord, byte[], int, int, int) with startOnRead == 0 and nReadBases == read.getReadLength() + */ public static MismatchCount getMismatchCount(GATKSAMRecord r, byte[] refSeq, int refIndex) { return getMismatchCount(r, refSeq, refIndex, 0, r.getReadLength()); } @@ -70,7 +72,10 @@ public final class AlignmentUtils { * * @param r the sam record to check against * @param refSeq the byte array representing the reference sequence - * @param refIndex the index in the reference byte array of the read's first base (the reference index is matching the alignment start, there may be tons of soft-clipped bases before/after that so it's wrong to compare with getReadLength() here.) + * @param refIndex the index in the reference byte array of the read's first base (the reference index + * is matching the alignment start, there may be tons of soft-clipped bases before/after + * that so it's wrong to compare with getReadLength() here.). Note that refIndex is + * zero based, not 1 based * @param startOnRead the index in the read's bases from which we start counting * @param nReadBases the number of bases after (but including) startOnRead that we check * @return non-null object representing the mismatch count @@ -440,6 +445,9 @@ public final class AlignmentUtils { * Need a well-formed, consolidated Cigar string so that the left aligning code works properly. * For example, 1M1M1M1D2M1M --> 3M1D3M * If the given cigar is empty then the returned cigar will also be empty + * + * Note that this routine collapses cigar elements of size 0, so 2M0M => 2M + * * @param c the cigar to consolidate * @return a non-null cigar with consecutive matching operators merged into single operators. */ @@ -450,13 +458,25 @@ public final class AlignmentUtils { final Cigar returnCigar = new Cigar(); int sumLength = 0; - for( int iii = 0; iii < c.numCigarElements(); iii++ ) { - sumLength += c.getCigarElement(iii).getLength(); - if( iii == c.numCigarElements() - 1 || !c.getCigarElement(iii).getOperator().equals(c.getCigarElement(iii+1).getOperator())) { // at the end so finish the current element - returnCigar.add(new CigarElement(sumLength, c.getCigarElement(iii).getOperator())); + CigarElement lastElement = null; + + for( final CigarElement cur : c.getCigarElements() ) { + if ( cur.getLength() == 0 ) + continue; // don't add elements of 0 length + + if ( lastElement != null && lastElement.getOperator() != cur.getOperator() ) { + returnCigar.add(new CigarElement(sumLength, lastElement.getOperator())); sumLength = 0; } + + sumLength += cur.getLength(); + lastElement = cur; } + + if( sumLength > 0 ) { + returnCigar.add(new CigarElement(sumLength, lastElement.getOperator())); + } + return returnCigar; } @@ -616,7 +636,7 @@ public final class AlignmentUtils { */ @Requires("c != null") @Ensures("result != null") - private static Cigar cleanUpCigar(final Cigar c) { + public static Cigar cleanUpCigar(final Cigar c) { final List elements = new ArrayList(c.numCigarElements() - 1); for (final CigarElement ce : c.getCigarElements()) { @@ -730,4 +750,355 @@ public final class AlignmentUtils { return alt; } + + + /** + * Trim cigar down to one that starts at start reference on the left and extends to end on the reference + * + * @param cigar a non-null Cigar to trim down + * @param start Where should we start keeping bases on the reference? The first position is 0 + * @param end Where should we stop keeping bases on the reference? The maximum value is cigar.getReferenceLength() + * @return a new Cigar with reference length == start - end + 1 + */ + public static Cigar trimCigarByReference(final Cigar cigar, final int start, final int end) { + if ( start < 0 ) throw new IllegalArgumentException("Start must be >= 0 but got " + start); + if ( end < start ) throw new IllegalArgumentException("End " + end + " is < start start " + start); + if ( end > cigar.getReferenceLength() ) throw new IllegalArgumentException("End is beyond the cigar's reference length " + end + " for cigar " + cigar ); + + final Cigar result = trimCigar(cigar, start, end, true); + + if ( result.getReferenceLength() != end - start + 1) + throw new IllegalStateException("trimCigarByReference failure: start " + start + " end " + end + " for " + cigar + " resulted in cigar with wrong size " + result); + return result; + } + + /** + * Trim cigar down to one that starts at start base in the cigar and extends to (inclusive) end base + * + * @param cigar a non-null Cigar to trim down + * @param start Where should we start keeping bases in the cigar? The first position is 0 + * @param end Where should we stop keeping bases in the cigar? The maximum value is cigar.getReadLength() + * @return a new Cigar containing == start - end + 1 reads + */ + public static Cigar trimCigarByBases(final Cigar cigar, final int start, final int end) { + if ( start < 0 ) throw new IllegalArgumentException("Start must be >= 0 but got " + start); + if ( end < start ) throw new IllegalArgumentException("End " + end + " is < start start " + start); + if ( end > cigar.getReadLength() ) throw new IllegalArgumentException("End is beyond the cigar's read length " + end + " for cigar " + cigar ); + + final Cigar result = trimCigar(cigar, start, end, false); + + final int expectedSize = end - start + 1; + if ( result.getReadLength() != expectedSize) + throw new IllegalStateException("trimCigarByBases failure: start " + start + " end " + end + " for " + cigar + " resulted in cigar with wrong size " + result + " with size " + result.getReadLength() + " expected " + expectedSize + " for input cigar " + cigar); + return result; + } + + + /** + * Workhorse for trimCigarByBases and trimCigarByReference + * + * @param cigar a non-null Cigar to trim down + * @param start Where should we start keeping bases in the cigar? The first position is 0 + * @param end Where should we stop keeping bases in the cigar? The maximum value is cigar.getReadLength() + * @param byReference should start and end be intrepreted as position in the reference or the read to trim to/from? + * @return a non-null cigar + */ + @Requires({"cigar != null", "start >= 0", "start <= end"}) + @Ensures("result != null") + private static Cigar trimCigar(final Cigar cigar, final int start, final int end, final boolean byReference) { + final List newElements = new LinkedList(); + + int pos = 0; + for ( final CigarElement elt : cigar.getCigarElements() ) { + if ( pos > end ) break; + + switch ( elt.getOperator() ) { + case D: + if ( ! byReference ) { + if ( pos >= start ) + newElements.add(elt); + break; + } + // otherwise fall through to the next case + case EQ: case M: case X: + pos = addCigarElements(newElements, pos, start, end, elt); + break; + case S: case I: + if ( byReference ) { + if ( pos >= start ) + newElements.add(elt); + } else { + pos = addCigarElements(newElements, pos, start, end, elt); + } + break; + default: + throw new IllegalStateException("Cannot handle " + elt); + } + } + + return AlignmentUtils.consolidateCigar(new Cigar(newElements)); + } + + /** + * Helper function for trimCigar that adds cigar elements (of total length X) of elt.op to dest for + * X bases that fall between start and end, where the last position of the base is pos. + * + * The primary use of this function is to create a new cigar element list that contains only + * elements that occur between start and end bases in an initial cigar. + * + * Note that this function may return multiple cigar elements (1M1M etc) that are best consolidated + * after the fact into a single simpler representation. + * + * @param dest we will append our cigar elements to this list + * @param pos the position (0 indexed) where elt started + * @param start only include bases that occur >= this position + * @param end only include bases that occur <= this position + * @param elt the element we are slicing down + * @return the position after we've traversed all elt.length bases of elt + */ + protected static int addCigarElements(final List dest, int pos, final int start, final int end, final CigarElement elt) { + final int length = Math.min(pos + elt.getLength() - 1, end) - Math.max(pos, start) + 1; + if ( length > 0 ) + dest.add(new CigarElement(length, elt.getOperator())); + return pos + elt.getLength(); + } + + /** + * Get the offset (base 0) of the first reference aligned base in Cigar that occurs after readStartByBaseOfCigar base of the cigar + * + * The main purpose of this routine is to find a good start position for a read given it's cigar. The real + * challenge is that the starting base might be inside an insertion, in which case the read actually starts + * at the next M/EQ/X operator. + * + * @param cigar a non-null cigar + * @param readStartByBaseOfCigar finds the first base after this (0 indexed) that aligns to the reference genome (M, EQ, X) + * @throws IllegalStateException if no such base can be found + * @return an offset into cigar + */ + public static int calcFirstBaseMatchingReferenceInCigar(final Cigar cigar, int readStartByBaseOfCigar) { + if ( cigar == null ) throw new IllegalArgumentException("cigar cannot be null"); + if ( readStartByBaseOfCigar >= cigar.getReadLength() ) throw new IllegalArgumentException("readStartByBaseOfCigar " + readStartByBaseOfCigar + " must be <= readLength " + cigar.getReadLength()); + + int hapOffset = 0, refOffset = 0; + for ( final CigarElement ce : cigar.getCigarElements() ) { + for ( int i = 0; i < ce.getLength(); i++ ) { + switch ( ce.getOperator() ) { + case M:case EQ:case X: + if ( hapOffset >= readStartByBaseOfCigar ) + return refOffset; + hapOffset++; + refOffset++; + break; + case I: case S: + hapOffset++; + break; + case D: + refOffset++; + break; + default: + throw new IllegalStateException("calcFirstBaseMatchingReferenceInCigar does not support cigar " + ce.getOperator() + " in cigar " + cigar); + } + } + } + + throw new IllegalStateException("Never found appropriate matching state for cigar " + cigar + " given start of " + readStartByBaseOfCigar); + } + + /** + * Generate a new Cigar that maps the operations of the first cigar through those in a second + * + * For example, if first is 5M and the second is 2M1I2M then the result is 2M1I2M. + * However, if first is 1M2D3M and second is 2M1I3M this results in a cigar X + * + * ref : AC-GTA + * hap : ACxGTA - 2M1I3M + * read : A--GTA - 1M2D3M + * result: A--GTA => 1M1D3M + * + * ref : ACxG-TA + * hap : AC-G-TA - 2M1D3M + * read : AC-GxTA - 3M1I2M + * result: AC-GxTA => 2M1D1M1I2M + * + * ref : ACGTA + * hap : ACGTA - 5M + * read : A-GTA - 1M1I3M + * result: A-GTA => 1M1I3M + * + * ref : ACGTAC + * hap : AC---C - 2M3D1M + * read : AC---C - 3M + * result: AG---C => 2M3D + * + * The constraint here is that both cigars should imply that the result have the same number of + * reference bases (i.e.g, cigar.getReferenceLength() are equals). + * + * @param firstToSecond the cigar mapping hap1 -> hap2 + * @param secondToThird the cigar mapping hap2 -> hap3 + * @return A cigar mapping hap1 -> hap3 + */ + public static Cigar applyCigarToCigar(final Cigar firstToSecond, final Cigar secondToThird) { + final boolean DEBUG = false; + + final List newElements = new LinkedList(); + final int nElements12 = firstToSecond.getCigarElements().size(); + final int nElements23 = secondToThird.getCigarElements().size(); + + int cigar12I = 0, cigar23I = 0; + int elt12I = 0, elt23I = 0; + + while ( cigar12I < nElements12 && cigar23I < nElements23 ) { + final CigarElement elt12 = firstToSecond.getCigarElement(cigar12I); + final CigarElement elt23 = secondToThird.getCigarElement(cigar23I); + + final CigarPairTransform transform = getTransformer(elt12.getOperator(), elt23.getOperator()); + + if ( DEBUG ) + System.out.printf("Transform %s => %s with elt1 = %d %s @ %d elt2 = %d %s @ %d with transform %s%n", + firstToSecond, secondToThird, cigar12I, elt12.getOperator(), elt12I, cigar23I, elt23.getOperator(), elt23I, transform); + + if ( transform.op13 != null ) // skip no ops + newElements.add(new CigarElement(1, transform.op13)); + + elt12I += transform.advance12; + elt23I += transform.advance23; + + // if have exhausted our current element, advance to the next one + if ( elt12I == elt12.getLength() ) { cigar12I++; elt12I = 0; } + if ( elt23I == elt23.getLength() ) { cigar23I++; elt23I = 0; } + } + + return AlignmentUtils.consolidateCigar(new Cigar(newElements)); + } + + private static CigarPairTransform getTransformer(final CigarOperator op12, final CigarOperator op23) { + for ( final CigarPairTransform transform : cigarPairTransformers) { + if ( transform.op12.contains(op12) && transform.op23.contains(op23) ) + return transform; + } + + throw new IllegalStateException("No transformer for operators " + op12 + " and " + op23); + } + + /** + * transformations that project one alignment state through another + * + * Think about this as a state machine, where we have: + * + * bases3 : xxx A zzz + * bases2 : xxx B zzz + * bases1 : xxx C zzz + * + * where A, B and C are alignment states of a three way alignment. We want to capture + * the transition from operation mapping 1 -> 2 and an operation mapping 2 -> 3 and its + * associated mapping from 1 -> 3 and the advancement of the cigar states of 1->2 and 2->3. + * + * Imagine that A, B, and C are all equivalent (so that op12 = M and op23 = M). This implies + * a mapping of 1->3 of M, and in this case the next states to consider in the 3 way alignment + * are the subsequent states in 1 and 2 (so that advance12 and advance23 are both 1). + * + * Obviously not all of the states and their associated transitions are so simple. Suppose instead + * that op12 = I, and op23 = M. What does this look like: + * + * bases3 : xxx - A zzz + * bases2 : xxx - B zzz + * bases1 : xxx I C zzz + * + * It means that op13 must be an insertion (as we have an extra base in 1 thats not present in 2 and + * so not present in 3). We advance the cigar in 1 by 1 (as we've consumed one base in 1 for the I) + * but we haven't yet found the base corresponding to the M of op23. So we don't advance23. + */ + private static class CigarPairTransform { + private final EnumSet op12, op23; + private final CigarOperator op13; + private final int advance12, advance23; + + private CigarPairTransform(CigarOperator op12, CigarOperator op23, CigarOperator op13, int advance12, int advance23) { + this.op12 = getCigarSet(op12); + this.op23 = getCigarSet(op23); + this.op13 = op13; + this.advance12 = advance12; + this.advance23 = advance23; + } + + private static EnumSet getCigarSet(final CigarOperator masterOp) { + switch ( masterOp ) { + case M: return EnumSet.of(CigarOperator.M, CigarOperator.EQ, CigarOperator.X); + case I: return EnumSet.of(CigarOperator.I, CigarOperator.S); + case D: return EnumSet.of(CigarOperator.D); + default: throw new IllegalStateException("Unexpected state " + masterOp); + } + } + + @Override + public String toString() { + return "CigarPairTransform{" + + "op12=" + op12 + + ", op23=" + op23 + + ", op13=" + op13 + + ", advance12=" + advance12 + + ", advance23=" + advance23 + + '}'; + } + } + + + private final static List cigarPairTransformers = Arrays.asList( + // + // op12 is a match + // + // 3: xxx B yyy + // ^^^^^^^^^^^^ + // 2: xxx M yyy + // 1: xxx M yyy + new CigarPairTransform(CigarOperator.M, CigarOperator.M, CigarOperator.M, 1, 1), + // 3: xxx I yyy + // ^^^^^^^^^^^^ + // 2: xxx I yyy + // 1: xxx M yyy + new CigarPairTransform(CigarOperator.M, CigarOperator.I, CigarOperator.I, 1, 1), + // 3: xxx D yyy + // ^^^^^^^^^^^^ + // 2: xxx D yyy + // 1: xxx M yyy + new CigarPairTransform(CigarOperator.M, CigarOperator.D, CigarOperator.D, 0, 1), + + // + // op12 is a deletion + // + // 3: xxx D M yyy + // ^^^^^^^^^^^^ + // 2: xxx M yyy + // 1: xxx D yyy + new CigarPairTransform(CigarOperator.D, CigarOperator.M, CigarOperator.D, 1, 1), + // 3: xxx D1 D2 yyy + // ^^^^^^^^^^^^ + // 2: xxx D2 yyy + // 1: xxx D1 yyy + new CigarPairTransform(CigarOperator.D, CigarOperator.D, CigarOperator.D, 1, 0), + // 3: xxx X yyy => no-op, we skip emitting anything here + // ^^^^^^^^^^^^ + // 2: xxx I yyy + // 1: xxx D yyy + new CigarPairTransform(CigarOperator.D, CigarOperator.I, null, 1, 1), + + // + // op12 is a insertion + // + // 3: xxx I M yyy + // ^^^^^^^^^^^^ + // 2: xxx M yyy + // 1: xxx I yyy + new CigarPairTransform(CigarOperator.I, CigarOperator.M, CigarOperator.I, 1, 0), + // 3: xxx I D yyy + // ^^^^^^^^^^^^ + // 2: xxx D yyy + // 1: xxx I yyy + new CigarPairTransform(CigarOperator.I, CigarOperator.D, CigarOperator.I, 1, 0), + // 3: xxx I1 I2 yyy + // ^^^^^^^^^^^^ + // 2: xxx I2 yyy + // 1: xxx I1 yyy + new CigarPairTransform(CigarOperator.I, CigarOperator.I, CigarOperator.I, 1, 0) + ); } diff --git a/public/java/test/org/broadinstitute/sting/utils/HaplotypeUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/HaplotypeUnitTest.java index 1b16266a9..0e4ec2b63 100644 --- a/public/java/test/org/broadinstitute/sting/utils/HaplotypeUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/HaplotypeUnitTest.java @@ -26,9 +26,11 @@ package org.broadinstitute.sting.utils; +import net.sf.picard.util.CigarUtil; import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; +import net.sf.samtools.TextCigarCodec; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.variant.variantcontext.Allele; import org.broadinstitute.variant.variantcontext.VariantContext; @@ -163,4 +165,22 @@ public class HaplotypeUnitTest extends BaseTest { final Haplotype h1expected = new Haplotype(newHap.getBytes()); Assert.assertEquals(h1, h1expected); } + + private Haplotype makeHCForCigar(final String bases, final String cigar) { + final Haplotype h = new Haplotype(bases.getBytes()); + h.setCigar(TextCigarCodec.getSingleton().decode(cigar)); + return h; + } + + @Test + public void testConsolidateCigar() throws Exception { + Assert.assertEquals(makeHCForCigar("AGCT", "4M").getConsolidatedPaddedCigar(0).toString(), "4M"); + Assert.assertEquals(makeHCForCigar("AGCT", "4M").getConsolidatedPaddedCigar(1).toString(), "5M"); + Assert.assertEquals(makeHCForCigar("AGCT", "1M1I1I1M").getConsolidatedPaddedCigar(0).toString(), "1M2I1M"); + Assert.assertEquals(makeHCForCigar("AGCT", "1M1I1I1M").getConsolidatedPaddedCigar(1).toString(), "1M2I2M"); + Assert.assertEquals(makeHCForCigar("AGCT", "1M1I1I1M").getConsolidatedPaddedCigar(2).toString(), "1M2I3M"); + Assert.assertEquals(makeHCForCigar("AGCT", "1M1I1I1I").getConsolidatedPaddedCigar(0).toString(), "1M3I"); + Assert.assertEquals(makeHCForCigar("AGCT", "1M1I1I1I").getConsolidatedPaddedCigar(1).toString(), "1M3I1M"); + Assert.assertEquals(makeHCForCigar("AGCT", "1M1I1I1I").getConsolidatedPaddedCigar(2).toString(), "1M3I2M"); + } } diff --git a/public/java/test/org/broadinstitute/sting/utils/UtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/UtilsUnitTest.java index 29c643153..705db6f85 100644 --- a/public/java/test/org/broadinstitute/sting/utils/UtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/UtilsUnitTest.java @@ -75,6 +75,14 @@ public class UtilsUnitTest extends BaseTest { Assert.assertEquals(duped.charAt(0), 'b', "dupString character was incorrect"); } + @Test + public void testXor() { + Assert.assertEquals(Utils.xor(false, false), false, "xor F F failed"); + Assert.assertEquals(Utils.xor(false, true), true, "xor F T failed"); + Assert.assertEquals(Utils.xor(true, false), true, "xor T F failed"); + Assert.assertEquals(Utils.xor(true, true), false, "xor T T failed"); + } + @Test public void testDupStringMultiChar() { String duped = Utils.dupString('c',5); diff --git a/public/java/test/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriterUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriterUnitTest.java new file mode 100644 index 000000000..43969c7a0 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriterUnitTest.java @@ -0,0 +1,287 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.utils.haplotypeBAMWriter; + +import net.sf.samtools.*; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.Haplotype; +import org.broadinstitute.sting.utils.SWPairwiseAlignment; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.sam.AlignmentUtils; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +public class HaplotypeBAMWriterUnitTest extends BaseTest { + private final static boolean DEBUG = false; + final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000); + + private GATKSAMRecord makeRead(final String baseString) { + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "myRead", 0, 1, 10); + final byte[] bases = baseString.getBytes(); + read.setReadBases(bases.clone()); + read.setBaseQualities(Utils.dupBytes((byte)30, read.getReadLength())); + return read; + } + + private Haplotype makeHaplotype(final String bases, final String cigar) { + final Haplotype hap = new Haplotype(bases.getBytes()); + hap.setCigar(TextCigarCodec.getSingleton().decode(cigar)); + return hap; + } + + private static class MockBAMWriter implements SAMFileWriter { + @Override + public void addAlignment(SAMRecord alignment) { + //To change body of implemented methods use File | Settings | File Templates. + } + + @Override + public SAMFileHeader getFileHeader() { + return null; //To change body of implemented methods use File | Settings | File Templates. + } + + @Override + public void close() { + //To change body of implemented methods use File | Settings | File Templates. + } + } + + @Test + public void testCreate() throws Exception { + final SAMFileWriter writer = new MockBAMWriter(); + Assert.assertTrue(HaplotypeBAMWriter.create(HaplotypeBAMWriter.Type.CALLED_HAPLOTYPES, writer) instanceof CalledHaplotypeBAMWriter); + Assert.assertTrue(HaplotypeBAMWriter.create(HaplotypeBAMWriter.Type.ALL_POSSIBLE_HAPLOTYPES, writer) instanceof AllHaplotypeBAMWriter); + } + + + ////////////////////////////////////////// + // Test HaplotypeBAMWriter.createReadAlignedToRef() // + ////////////////////////////////////////// + + @DataProvider(name = "ReadAlignedToRefData") + public Object[][] makeReadAlignedToRefData() { + List tests = new ArrayList(); + + final String hapBases = "ACTGAAGGTTCC"; + final Haplotype allM = makeHaplotype(hapBases, hapBases.length() + "M"); + + // make sure we get back a cigar of the right length + for ( int i = -1; i < hapBases.length(); i++ ) { + final GATKSAMRecord read = makeRead(hapBases); + if ( i != -1 ) read.getReadBases()[i] = (byte)'A'; + tests.add(new Object[]{read, allM, 10, 10, allM.getCigar().toString()}); + } + + // make sure insertions at the front are correctly handled + for ( int padFront = 1; padFront < 10; padFront++ ) { + final GATKSAMRecord read = makeRead(Utils.dupString("N", padFront) + hapBases); + tests.add(new Object[]{read, allM, 10, 10, padFront + "I" + allM.getCigar().toString()}); + } + + // make sure insertions at the back are correctly handled + for ( int padBack = 1; padBack < 10; padBack++ ) { + final GATKSAMRecord read = makeRead(hapBases + Utils.dupString("N", padBack)); + tests.add(new Object[]{read, allM, 10, 10, allM.getCigar().toString() + padBack + "I"}); + } + + // make sure refStart and hapStart are respected + for ( int refStart = 1; refStart < 10; refStart++ ) { + for ( int hapStart = refStart; hapStart < 10 + refStart; hapStart++ ) { + final Haplotype hap = new Haplotype(allM.getBases()); + hap.setCigar(allM.getCigar()); + hap.setAlignmentStartHapwrtRef(hapStart); + + final GATKSAMRecord read = makeRead(new String(hap.getBases())); + tests.add(new Object[]{read, hap, refStart, refStart + hapStart, allM.getCigar().toString()}); + } + } + + // test that reads without a good alignment to hap get excluded + { + final GATKSAMRecord read = makeRead("NNNNN"); + tests.add(new Object[]{read, allM, 10, -1, null}); + } + + // example case of bad alignment because SW doesn't necessarily left-align indels + { + final String hap = "ACTGTGGGTTCCTCTTATTTTATTTCTACATCAATGTTCATATTTAACTTATTATTTTATCTTATTTTTAAATTTCTTTTATGTTGAGCCTTGATGAAAGCCATAGGTTCTCTCATATAATTGTATGTGTATGTATGTATATGTACATAATATATACATATATGTATATGTATGTGTATGTACATAATATATACGTATATGTATGTGTATGTACATAATATATACGTATATGTATGTGTATGTACATAATATATACGTATATGTATGTGTATGTACATAATATATACGTATATGTATGTGTATGTACATAATATATACGTATATGTATGTGTATGTGTATTACATAATATATACATATATGTATATATTATGTATATGTACATAATATATACATATATG"; + final String hapCigar = "399M"; + final String readBases = "ATGTACATAATATATACATATATGTATATGTATGTACATAATATATACGTATATGTATGTGTATGTACATAATATATACGTATATGTATGTGTATGTACATAATATATACGTATATGTATGTGTATGTACATAATATATACGTATATGTATGTGTATGTACATAATATATACGTATATGTATGTGTATGTGTATTACATAATATATACATATATGTATATATTATGTATATGTACATAATAT"; + final GATKSAMRecord read = makeRead(readBases); + final int refStart = 10130100; + final int hapStart = 500; + final String badCigar = "31M6D211M"; + final String goodCigar = "28M6D214M"; + final Haplotype badHap = new Haplotype(hap.getBytes()); + badHap.setCigar(TextCigarCodec.getSingleton().decode(hapCigar)); + badHap.setAlignmentStartHapwrtRef(hapStart); + + final int expectedPos = 10130740; + tests.add(new Object[]{read, badHap, refStart, expectedPos, goodCigar}); + } + + return tests.toArray(new Object[][]{}); + } + + + + @Test(dataProvider = "ReadAlignedToRefData", enabled = true) + public void testReadAlignedToRef(final GATKSAMRecord read, final Haplotype haplotype, final int refStart, final int expectedReadStart, final String expectedReadCigar) throws Exception { + final HaplotypeBAMWriter writer = new CalledHaplotypeBAMWriter(new MockBAMWriter()); + final GATKSAMRecord originalReadCopy = (GATKSAMRecord)read.clone(); + + if ( expectedReadCigar == null ) { + Assert.assertNull(writer.createReadAlignedToRef(read, haplotype, refStart)); + } else { + final Cigar expectedCigar = TextCigarCodec.getSingleton().decode(expectedReadCigar); + final GATKSAMRecord alignedRead = writer.createReadAlignedToRef(read, haplotype, refStart); + + Assert.assertEquals(alignedRead.getReadName(), originalReadCopy.getReadName()); + Assert.assertEquals(alignedRead.getAlignmentStart(), expectedReadStart); + Assert.assertEquals(alignedRead.getReadBases(), originalReadCopy.getReadBases()); + Assert.assertEquals(alignedRead.getBaseQualities(), originalReadCopy.getBaseQualities()); + Assert.assertEquals(alignedRead.getAlignmentStart(), expectedReadStart); + Assert.assertEquals(alignedRead.getCigar(), expectedCigar); + Assert.assertNotNull(alignedRead.getAttribute("HC")); + } + + Assert.assertEquals(read, originalReadCopy, "createReadAlignedToRef seems be modifying the original read!"); + } + + private static class Mutation implements Comparable { + int pos, len; + CigarOperator operator; + + private Mutation(int pos, int len, CigarOperator operator) { + this.pos = pos; + this.len = len; + this.operator = operator; + } + public int getNMismatches() { return len; } + + @Override + public int compareTo(Mutation o) { + return Integer.valueOf(pos).compareTo(o.pos); + } + + private String apply(final String seq) { + switch ( operator ) { + case M: + final byte[] bases = seq.getBytes(); + if ( pos < seq.length() ) + bases[pos] = (byte)(bases[pos] == 'A' ? 'C' : 'A'); + return new String(bases); + case I: { + final String prefix = seq.substring(0, pos); + final String postfix = seq.substring(pos, seq.length()); + return prefix + "GTCAGTTA".substring(0, len) + postfix; + } case D: { + final String prefix = seq.substring(0, pos); + final String postfix = seq.substring(pos + len, seq.length()); + return prefix + postfix; + }default: + throw new IllegalStateException("Unexpected operator " + operator); + } + } + } + + private static class MutatedSequence { + int numMismatches; + String seq; + + private MutatedSequence(int numMismatches, String seq) { + this.numMismatches = numMismatches; + this.seq = seq; + } + } + + private MutatedSequence mutateSequence(final String hapIn, final List mutations) { + Collections.sort(mutations); + int mismatches = 0; + String hap = hapIn; + for ( final Mutation mut : mutations ) { + hap = mut.apply(hap); + mismatches += mut.getNMismatches(); + } + return new MutatedSequence(mismatches, hap); + } + + @DataProvider(name = "ComplexReadAlignedToRef") + public Object[][] makeComplexReadAlignedToRef() { + List tests = new ArrayList(); + + final List allMutations = Arrays.asList( + new Mutation(1, 1, CigarOperator.M), + new Mutation(2, 1, CigarOperator.M), + new Mutation(3, 1, CigarOperator.I), + new Mutation(7, 1, CigarOperator.D) + ); + + int i = 0; + final String referenceBases = "ACTGACTGACTG"; + final String paddedReference = "NNNN" + referenceBases + "NNNN"; + for ( final List mutations : Utils.makePermutations(allMutations, 3, false) ) { + final MutatedSequence hap = mutateSequence(referenceBases, mutations); + final Haplotype haplotype = new Haplotype(hap.seq.getBytes()); + final SWPairwiseAlignment align = new SWPairwiseAlignment(paddedReference.getBytes(), hap.seq.getBytes()); + haplotype.setAlignmentStartHapwrtRef(align.getAlignmentStart2wrt1()); + haplotype.setCigar(align.getCigar()); + + for ( final List readMutations : Utils.makePermutations(allMutations, 3, false) ) { + final MutatedSequence readBases = mutateSequence(hap.seq, readMutations); + final GATKSAMRecord read = makeRead(readBases.seq); + tests.add(new Object[]{i++, read, paddedReference, haplotype, hap.numMismatches + readBases.numMismatches}); + } + } + + // for convenient testing of a single failing case + //tests.add(new Object[]{makeRead("ACCGGGACTGACTG"), reference, makeHaplotype("AAAGGACTGACTG", "1M1I11M"), 2}); + + return tests.toArray(new Object[][]{}); + } + + + @Test(dataProvider = "ComplexReadAlignedToRef", enabled = !DEBUG) + public void testReadAlignedToRefComplexAlignment(final int testIndex, final GATKSAMRecord read, final String reference, final Haplotype haplotype, final int expectedMaxMismatches) throws Exception { + final HaplotypeBAMWriter writer = new CalledHaplotypeBAMWriter(new MockBAMWriter()); + final GATKSAMRecord alignedRead = writer.createReadAlignedToRef(read, haplotype, 1); + if ( alignedRead != null ) { + final int mismatches = AlignmentUtils.getMismatchCount(alignedRead, reference.getBytes(), alignedRead.getAlignmentStart() - 1).numMismatches; + Assert.assertTrue(mismatches <= expectedMaxMismatches, + "Alignment of read to ref looks broken. Expected at most " + expectedMaxMismatches + " but saw " + mismatches + + " for readBases " + new String(read.getReadBases()) + " with cigar " + read.getCigar() + " reference " + reference + " haplotype " + + haplotype + " with cigar " + haplotype.getCigar() + " aligned read cigar " + alignedRead.getCigarString() + " @ " + alignedRead.getAlignmentStart()); + } + } +} diff --git a/public/java/test/org/broadinstitute/sting/utils/sam/AlignmentUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/sam/AlignmentUtilsUnitTest.java index f845e6670..ae01c6c63 100644 --- a/public/java/test/org/broadinstitute/sting/utils/sam/AlignmentUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/sam/AlignmentUtilsUnitTest.java @@ -37,6 +37,7 @@ import org.testng.annotations.Test; import java.util.*; public class AlignmentUtilsUnitTest { + private final static boolean DEBUG = false; private SAMFileHeader header; /** Basic aligned and mapped read. */ @@ -85,7 +86,7 @@ public class AlignmentUtilsUnitTest { new Object[] {readUnknownStart, false} }; } - @Test(dataProvider = "genomeLocUnmappedReadTests") + @Test(enabled = !DEBUG, dataProvider = "genomeLocUnmappedReadTests") public void testIsReadGenomeLocUnmapped(SAMRecord read, boolean expected) { Assert.assertEquals(AlignmentUtils.isReadGenomeLocUnmapped(read), expected); } @@ -103,7 +104,7 @@ public class AlignmentUtilsUnitTest { new Object[] {readUnknownStart, true} }; } - @Test(dataProvider = "unmappedReadTests") + @Test(enabled = !DEBUG, dataProvider = "unmappedReadTests") public void testIsReadUnmapped(SAMRecord read, boolean expected) { Assert.assertEquals(AlignmentUtils.isReadUnmapped(read), expected); } @@ -160,7 +161,7 @@ public class AlignmentUtilsUnitTest { return tests.toArray(new Object[][]{}); } - @Test(dataProvider = "NumAlignedBasesCountingSoftClips") + @Test(enabled = !DEBUG, dataProvider = "NumAlignedBasesCountingSoftClips") public void testNumAlignedBasesCountingSoftClips(final Cigar cigar, final int expected) { final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "myRead", 0, 1, cigar == null ? 10 : cigar.getReadLength()); read.setCigar(cigar); @@ -180,7 +181,7 @@ public class AlignmentUtilsUnitTest { return tests.toArray(new Object[][]{}); } - @Test(dataProvider = "CigarHasZeroElement") + @Test(enabled = !DEBUG, dataProvider = "CigarHasZeroElement") public void testCigarHasZeroSize(final Cigar cigar, final boolean hasZero) { Assert.assertEquals(AlignmentUtils.cigarHasZeroSizeElement(cigar), hasZero, "Cigar " + cigar.toString() + " failed cigarHasZeroSizeElement"); } @@ -200,7 +201,7 @@ public class AlignmentUtilsUnitTest { return tests.toArray(new Object[][]{}); } - @Test(dataProvider = "NumHardClipped") + @Test(enabled = !DEBUG, dataProvider = "NumHardClipped") public void testNumHardClipped(final Cigar cigar, final int expected) { final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "myRead", 0, 1, cigar == null ? 10 : cigar.getReadLength()); read.setCigar(cigar); @@ -227,49 +228,54 @@ public class AlignmentUtilsUnitTest { return tests.toArray(new Object[][]{}); } - @Test(dataProvider = "NumAlignedBlocks") + @Test(enabled = !DEBUG, dataProvider = "NumAlignedBlocks") public void testNumAlignedBlocks(final Cigar cigar, final int expected) { final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "myRead", 0, 1, cigar == null ? 10 : cigar.getReadLength()); read.setCigar(cigar); Assert.assertEquals(AlignmentUtils.getNumAlignmentBlocks(read), expected, "Cigar " + cigar + " failed NumAlignedBlocks"); } - @Test - public void testConsolidateCigar() { - { - //1M1M1M1D2M1M --> 3M1D3M - List list = new ArrayList(); - list.add( new CigarElement(1, CigarOperator.M)); - list.add( new CigarElement(1, CigarOperator.M)); - list.add( new CigarElement(1, CigarOperator.M)); - list.add( new CigarElement(1, CigarOperator.D)); - list.add( new CigarElement(2, CigarOperator.M)); - list.add( new CigarElement(1, CigarOperator.M)); - Cigar unconsolidatedCigar = new Cigar(list); + @DataProvider(name = "ConsolidateCigarData") + public Object[][] makeConsolidateCigarData() { + List tests = new ArrayList(); - list.clear(); - list.add( new CigarElement(3, CigarOperator.M)); - list.add( new CigarElement(1, CigarOperator.D)); - list.add( new CigarElement(3, CigarOperator.M)); - Cigar consolidatedCigar = new Cigar(list); + // this functionality can be adapted to provide input data for whatever you might want in your data + tests.add(new Object[]{"1M1M", "2M"}); + tests.add(new Object[]{"2M", "2M"}); + tests.add(new Object[]{"2M0M", "2M"}); + tests.add(new Object[]{"0M2M", "2M"}); + tests.add(new Object[]{"0M2M0M0I0M1M", "3M"}); + tests.add(new Object[]{"2M0M1M", "3M"}); + tests.add(new Object[]{"1M1M1M1D2M1M", "3M1D3M"}); + tests.add(new Object[]{"6M6M6M", "18M"}); - Assert.assertEquals(consolidatedCigar.toString(), AlignmentUtils.consolidateCigar(unconsolidatedCigar).toString()); + final List elements = new LinkedList(); + int i = 1; + for ( final CigarOperator op : CigarOperator.values() ) { + elements.add(new CigarElement(i++, op)); + } + for ( final List ops : Utils.makePermutations(elements, 3, false) ) { + final String expected = new Cigar(ops).toString(); + final List cutElements = new LinkedList(); + for ( final CigarElement elt : ops ) { + for ( int j = 0; j < elt.getLength(); j++ ) { + cutElements.add(new CigarElement(1, elt.getOperator())); + } + } + + final String actual = new Cigar(cutElements).toString(); + tests.add(new Object[]{actual, expected}); } - { - //6M6M6M --> 18M - List list = new ArrayList(); - list.add( new CigarElement(6, CigarOperator.M)); - list.add( new CigarElement(6, CigarOperator.M)); - list.add( new CigarElement(6, CigarOperator.M)); - Cigar unconsolidatedCigar = new Cigar(list); + return tests.toArray(new Object[][]{}); + } - list.clear(); - list.add( new CigarElement(18, CigarOperator.M)); - Cigar consolidatedCigar = new Cigar(list); - - Assert.assertEquals(consolidatedCigar.toString(), AlignmentUtils.consolidateCigar(unconsolidatedCigar).toString()); - } + @Test(enabled = !DEBUG, dataProvider = "ConsolidateCigarData") + public void testConsolidateCigarWithData(final String testCigarString, final String expectedCigarString) { + final Cigar testCigar = TextCigarCodec.getSingleton().decode(testCigarString); + final Cigar expectedCigar = TextCigarCodec.getSingleton().decode(expectedCigarString); + final Cigar actualCigar = AlignmentUtils.consolidateCigar(testCigar); + Assert.assertEquals(actualCigar, expectedCigar); } @DataProvider(name = "SoftClipsDataProvider") @@ -304,7 +310,7 @@ public class AlignmentUtilsUnitTest { return array; } - @Test(dataProvider = "SoftClipsDataProvider") + @Test(enabled = !DEBUG, dataProvider = "SoftClipsDataProvider") public void testSoftClipsData(final byte[] qualsOfSoftClipsOnLeft, final int middleSize, final String middleOp, final byte[] qualOfSoftClipsOnRight, final int qualThreshold, final int numExpected) { final int readLength = (middleOp.equals("D") ? 0 : middleSize) + qualOfSoftClipsOnRight.length + qualsOfSoftClipsOnLeft.length; final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "myRead", 0, 1, readLength); @@ -391,7 +397,7 @@ public class AlignmentUtilsUnitTest { return tests.toArray(new Object[][]{}); } - @Test(dataProvider = "MismatchCountDataProvider") + @Test(enabled = !DEBUG, dataProvider = "MismatchCountDataProvider") public void testMismatchCountData(final GATKSAMRecord read, final int refIndex, final int startOnRead, final int basesToRead, final boolean isMismatch) { final byte[] reference = Utils.dupBytes((byte)'A', 100); final int actual = AlignmentUtils.getMismatchCount(read, reference, refIndex, startOnRead, basesToRead).numMismatches; @@ -476,7 +482,7 @@ public class AlignmentUtilsUnitTest { return tests.toArray(new Object[][]{}); } - @Test(dataProvider = "AlignmentByteArrayOffsetDataProvider") + @Test(enabled = !DEBUG, dataProvider = "AlignmentByteArrayOffsetDataProvider") public void testAlignmentByteArrayOffsetData(final Cigar cigar, final int offset, final int expectedResult, final boolean isDeletion, final int lengthOfSoftClip) { final int actual = AlignmentUtils.calcAlignmentByteArrayOffset(cigar, isDeletion ? -1 : offset, isDeletion, 20, 20 + offset - lengthOfSoftClip); Assert.assertEquals(actual, expectedResult, "Wrong alignment offset detected for cigar " + cigar.toString()); @@ -514,7 +520,7 @@ public class AlignmentUtilsUnitTest { return tests.toArray(new Object[][]{}); } - @Test(dataProvider = "ReadToAlignmentByteArrayDataProvider") + @Test(enabled = !DEBUG, dataProvider = "ReadToAlignmentByteArrayDataProvider") public void testReadToAlignmentByteArrayData(final Cigar cigar, final int expectedLength, final char middleOp, final int startOfIndelBases, final int lengthOfDeletion) { final byte[] read = Utils.dupBytes((byte)'A', cigar.getReadLength()); final byte[] alignment = AlignmentUtils.readToAlignmentByteArray(cigar, read); @@ -645,9 +651,273 @@ public class AlignmentUtilsUnitTest { return readString; } - @Test(dataProvider = "LeftAlignIndelDataProvider", enabled = true) + @Test(enabled = !DEBUG, dataProvider = "LeftAlignIndelDataProvider") public void testLeftAlignIndelData(final Cigar originalCigar, final Cigar expectedCigar, final byte[] reference, final byte[] read, final int repeatLength) { final Cigar actualCigar = AlignmentUtils.leftAlignIndel(originalCigar, reference, read, 0, 0, true); Assert.assertTrue(expectedCigar.equals(actualCigar), "Wrong left alignment detected for cigar " + originalCigar.toString() + " to " + actualCigar.toString() + " but expected " + expectedCigar.toString() + " with repeat length " + repeatLength); } + + ////////////////////////////////////////// + // Test AlignmentUtils.trimCigarByReference() // + ////////////////////////////////////////// + + @DataProvider(name = "TrimCigarData") + public Object[][] makeTrimCigarData() { + List tests = new ArrayList(); + + for ( final CigarOperator op : Arrays.asList(CigarOperator.D, CigarOperator.EQ, CigarOperator.X, CigarOperator.M) ) { + for ( int myLength = 1; myLength < 6; myLength++ ) { + for ( int start = 0; start < myLength - 1; start++ ) { + for ( int end = start; end < myLength; end++ ) { + final int length = end - start + 1; + + final List padOps = Arrays.asList(CigarOperator.D, CigarOperator.M); + for ( final CigarOperator padOp: padOps) { + for ( int leftPad = 0; leftPad < 2; leftPad++ ) { + for ( int rightPad = 0; rightPad < 2; rightPad++ ) { + tests.add(new Object[]{ + (leftPad > 0 ? leftPad + padOp.toString() : "") + myLength + op.toString() + (rightPad > 0 ? rightPad + padOp.toString() : ""), + start + leftPad, + end + leftPad, + length + op.toString()}); + } + } + } + } + } + } + } + + for ( final int leftPad : Arrays.asList(0, 1, 2, 5) ) { + for ( final int rightPad : Arrays.asList(0, 1, 2, 5) ) { + final int length = leftPad + rightPad; + if ( length > 0 ) { + for ( final int insSize : Arrays.asList(1, 10) ) { + for ( int start = 0; start <= leftPad; start++ ) { + for ( int stop = leftPad; stop < length; stop++ ) { + final int leftPadRemaining = leftPad - start; + final int rightPadRemaining = stop - leftPad + 1; + final String insC = insSize + "I"; + tests.add(new Object[]{ + leftPad + "M" + insC + rightPad + "M", + start, + stop, + (leftPadRemaining > 0 ? leftPadRemaining + "M" : "") + insC + (rightPadRemaining > 0 ? rightPadRemaining + "M" : "") + }); + } + } + } + } + } + } + + tests.add(new Object[]{"3M2D4M", 0, 8, "3M2D4M"}); + tests.add(new Object[]{"3M2D4M", 2, 8, "1M2D4M"}); + tests.add(new Object[]{"3M2D4M", 2, 6, "1M2D2M"}); + tests.add(new Object[]{"3M2D4M", 3, 6, "2D2M"}); + tests.add(new Object[]{"3M2D4M", 4, 6, "1D2M"}); + tests.add(new Object[]{"3M2D4M", 5, 6, "2M"}); + tests.add(new Object[]{"3M2D4M", 6, 6, "1M"}); + + tests.add(new Object[]{"2M3I4M", 0, 5, "2M3I4M"}); + tests.add(new Object[]{"2M3I4M", 1, 5, "1M3I4M"}); + tests.add(new Object[]{"2M3I4M", 1, 4, "1M3I3M"}); + tests.add(new Object[]{"2M3I4M", 2, 4, "3I3M"}); + tests.add(new Object[]{"2M3I4M", 2, 3, "3I2M"}); + tests.add(new Object[]{"2M3I4M", 2, 2, "3I1M"}); + tests.add(new Object[]{"2M3I4M", 3, 4, "2M"}); + tests.add(new Object[]{"2M3I4M", 3, 3, "1M"}); + tests.add(new Object[]{"2M3I4M", 4, 4, "1M"}); + + // this doesn't work -- but I'm not sure it should + // tests.add(new Object[]{"2M3I4M", 2, 1, "3I"}); + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "TrimCigarData", enabled = ! DEBUG) + public void testTrimCigar(final String cigarString, final int start, final int length, final String expectedCigarString) { + final Cigar cigar = TextCigarCodec.getSingleton().decode(cigarString); + final Cigar expectedCigar = TextCigarCodec.getSingleton().decode(expectedCigarString); + final Cigar actualCigar = AlignmentUtils.trimCigarByReference(cigar, start, length); + Assert.assertEquals(actualCigar, expectedCigar); + } + + @DataProvider(name = "TrimCigarByBasesData") + public Object[][] makeTrimCigarByBasesData() { + List tests = new ArrayList(); + + tests.add(new Object[]{"2M3I4M", 0, 8, "2M3I4M"}); + tests.add(new Object[]{"2M3I4M", 1, 8, "1M3I4M"}); + tests.add(new Object[]{"2M3I4M", 2, 8, "3I4M"}); + tests.add(new Object[]{"2M3I4M", 3, 8, "2I4M"}); + tests.add(new Object[]{"2M3I4M", 4, 8, "1I4M"}); + tests.add(new Object[]{"2M3I4M", 4, 7, "1I3M"}); + tests.add(new Object[]{"2M3I4M", 4, 6, "1I2M"}); + tests.add(new Object[]{"2M3I4M", 4, 5, "1I1M"}); + tests.add(new Object[]{"2M3I4M", 4, 4, "1I"}); + tests.add(new Object[]{"2M3I4M", 5, 5, "1M"}); + + tests.add(new Object[]{"2M2D2I", 0, 3, "2M2D2I"}); + tests.add(new Object[]{"2M2D2I", 1, 3, "1M2D2I"}); + tests.add(new Object[]{"2M2D2I", 2, 3, "2D2I"}); + tests.add(new Object[]{"2M2D2I", 3, 3, "1I"}); + tests.add(new Object[]{"2M2D2I", 2, 2, "2D1I"}); + tests.add(new Object[]{"2M2D2I", 1, 2, "1M2D1I"}); + tests.add(new Object[]{"2M2D2I", 1, 1, "1M"}); + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "TrimCigarByBasesData", enabled = !DEBUG) + public void testTrimCigarByBase(final String cigarString, final int start, final int length, final String expectedCigarString) { + final Cigar cigar = TextCigarCodec.getSingleton().decode(cigarString); + final Cigar expectedCigar = TextCigarCodec.getSingleton().decode(expectedCigarString); + final Cigar actualCigar = AlignmentUtils.trimCigarByBases(cigar, start, length); + Assert.assertEquals(actualCigar, expectedCigar); + } + + ////////////////////////////////////////// + // Test AlignmentUtils.applyCigarToCigar() // + ////////////////////////////////////////// + + @DataProvider(name = "ApplyCigarToCigarData") + public Object[][] makeApplyCigarToCigarData() { + List tests = new ArrayList(); + + for ( int i = 1; i < 5; i++ ) + tests.add(new Object[]{i + "M", i + "M", i + "M"}); + +// * ref : ACGTAC +// * hap : AC---C - 2M3D1M +// * read : AC---C - 3M +// * result: AG---C => 2M3D + tests.add(new Object[]{"3M", "2M3D1M", "2M3D1M"}); + +// * ref : ACxG-TA +// * hap : AC-G-TA - 2M1D3M +// * read : AC-GxTA - 3M1I2M +// * result: AC-GxTA => 2M1D1M1I2M + tests.add(new Object[]{"3M1I2M", "2M1D3M", "2M1D1M1I2M"}); + +// * ref : A-CGTA +// * hap : A-CGTA - 5M +// * read : AxCGTA - 1M1I4M +// * result: AxCGTA => 1M1I4M + tests.add(new Object[]{"1M1I4M", "5M", "1M1I4M"}); + +// * ref : ACGTA +// * hap : ACGTA - 5M +// * read : A--TA - 1M2D2M +// * result: A--TA => 1M2D2M + tests.add(new Object[]{"1M2D2M", "5M", "1M2D2M"}); + +// * ref : AC-GTA +// * hap : ACxGTA - 2M1I3M +// * read : A--GTA - 1M2D3M +// * result: A--GTA => 1M1D3M + tests.add(new Object[]{"108M14D24M2M18I29M92M1000M", "2M1I3M", "2M1I3M"}); + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "ApplyCigarToCigarData", enabled = !DEBUG) + public void testApplyCigarToCigar(final String firstToSecondString, final String secondToThirdString, final String expectedCigarString) { + final Cigar firstToSecond = TextCigarCodec.getSingleton().decode(firstToSecondString); + final Cigar secondToThird = TextCigarCodec.getSingleton().decode(secondToThirdString); + final Cigar expectedCigar = TextCigarCodec.getSingleton().decode(expectedCigarString); + final Cigar actualCigar = AlignmentUtils.applyCigarToCigar(firstToSecond, secondToThird); + Assert.assertEquals(actualCigar, expectedCigar); + } + + ////////////////////////////////////////// + // Test AlignmentUtils.applyCigarToCigar() // + ////////////////////////////////////////// + + @DataProvider(name = "ReadOffsetFromCigarData") + public Object[][] makeReadOffsetFromCigarData() { + List tests = new ArrayList(); + + final int SIZE = 10; + for ( int i = 0; i < SIZE; i++ ) { + tests.add(new Object[]{SIZE + "M", i, i}); + } + + // 0123ii45 + // ref : ACGT--AC + // hap : AC--xxAC (2M2D2I2M) + // ref.pos: 01 45 + tests.add(new Object[]{"2M2D2I2M", 0, 0}); + tests.add(new Object[]{"2M2D2I2M", 1, 1}); + tests.add(new Object[]{"2M2D2I2M", 2, 4}); + tests.add(new Object[]{"2M2D2I2M", 3, 4}); + tests.add(new Object[]{"2M2D2I2M", 4, 4}); + tests.add(new Object[]{"2M2D2I2M", 5, 5}); + + // 10132723 - 10132075 - 500 = 148 + // what's the offset of the first match after the I? + // 108M + 14D + 24M + 2M = 148 + // What's the offset of the first base that is after the I? + // 108M + 24M + 2M + 18I = 134M + 18I = 152 - 1 = 151 + tests.add(new Object[]{"108M14D24M2M18I29M92M", 0, 0}); + tests.add(new Object[]{"108M14D24M2M18I29M92M", 107, 107}); + tests.add(new Object[]{"108M14D24M2M18I29M92M", 108, 108 + 14}); // first base after the deletion + + tests.add(new Object[]{"108M14D24M2M18I29M92M", 132, 132+14}); // 2 before insertion + tests.add(new Object[]{"108M14D24M2M18I29M92M", 133, 133+14}); // last base before insertion + + // entering into the insertion + for ( int i = 0; i < 18; i++ ) { + tests.add(new Object[]{"108M14D24M2M18I29M92M", 134+i, 148}); // inside insertion + } + tests.add(new Object[]{"108M14D24M2M18I29M92M", 134+18, 148}); // first base after insertion matches at same as insertion + tests.add(new Object[]{"108M14D24M2M18I29M92M", 134+18+1, 149}); + tests.add(new Object[]{"108M14D24M2M18I29M92M", 134+18+2, 150}); + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "ReadOffsetFromCigarData", enabled = !DEBUG) + public void testReadOffsetFromCigar(final String cigarString, final int startOnCigar, final int expectedOffset) { + final Cigar cigar = TextCigarCodec.getSingleton().decode(cigarString); + final int actualOffset = AlignmentUtils.calcFirstBaseMatchingReferenceInCigar(cigar, startOnCigar); + Assert.assertEquals(actualOffset, expectedOffset); + } + + ////////////////////////////////////////// + // Test AlignmentUtils.addCigarElements() // + ////////////////////////////////////////// + + @DataProvider(name = "AddCigarElementsData") + public Object[][] makeAddCigarElementsData() { + List tests = new ArrayList(); + + final int SIZE = 10; + for ( final CigarOperator op : Arrays.asList(CigarOperator.I, CigarOperator.M, CigarOperator.S, CigarOperator.EQ, CigarOperator.X)) { + for ( int start = 0; start < SIZE; start++ ) { + for ( int end = start; end < SIZE * 2; end ++ ) { + for ( int pos = 0; pos < SIZE * 3; pos++ ) { + int length = 0; + for ( int i = 0; i < SIZE; i++ ) length += (i+pos) >= start && (i+pos) <= end ? 1 : 0; + tests.add(new Object[]{SIZE + op.toString(), pos, start, end, length > 0 ? length + op.toString() : "*"}); + } + } + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "AddCigarElementsData", enabled = !DEBUG) + public void testAddCigarElements(final String cigarString, final int pos, final int start, final int end, final String expectedCigarString) { + final Cigar cigar = TextCigarCodec.getSingleton().decode(cigarString); + final CigarElement elt = cigar.getCigarElement(0); + final Cigar expectedCigar = TextCigarCodec.getSingleton().decode(expectedCigarString); + + final List elts = new LinkedList(); + final int actualEndPos = AlignmentUtils.addCigarElements(elts, pos, start, end, elt); + + Assert.assertEquals(actualEndPos, pos + elt.getLength()); + Assert.assertEquals(AlignmentUtils.consolidateCigar(new Cigar(elts)), expectedCigar); + } }