From fddca032bbbbf4759eb1403c8e7e5a9ae49fa220 Mon Sep 17 00:00:00 2001 From: ebanks Date: Wed, 27 Jan 2010 21:36:42 +0000 Subject: [PATCH] Initial commit of v2.0 of the cleaner. DO NOT USE. (this means you, Chris) Cleaned up SW code and started moving over everything to use byte[] instead of String or char[]. Added a wrapper class for SAMFileWriter that allows for adding reads out of order. Not even close to done, but I need to commit now to sync up with Andrey. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2712 348d0f76-0448-11de-a6fe-93d51630548a --- .../gatk/walkers/indels/IndelRealigner.java | 1037 +++++++++++++++++ .../walkers/indels/IntervalCleanerWalker.java | 7 +- .../walkers/indels/SortingSAMFileWriter.java | 118 ++ .../sting/utils/AlignmentUtils.java | 17 +- .../sting/utils/SWPairwiseAlignment.java | 597 +--------- 5 files changed, 1189 insertions(+), 587 deletions(-) create mode 100755 java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java create mode 100755 java/src/org/broadinstitute/sting/gatk/walkers/indels/SortingSAMFileWriter.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java b/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java new file mode 100755 index 000000000..953ec070a --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java @@ -0,0 +1,1037 @@ +package org.broadinstitute.sting.gatk.walkers.indels; + +import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.gatk.walkers.ReadWalker; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.utils.cmdLine.Argument; + +import net.sf.samtools.*; +import net.sf.samtools.util.StringUtil; + +import java.util.*; +import java.io.File; +import java.io.FileWriter; + +/** + * Performs local realignment of reads based on misalignments due to the presence of indels. + * Unlike most mappers, this walker uses the full alignment context to determine whether an + * appropriate alternate reference (i.e. indel) exists and updates SAMRecords accordingly. + */ +public class IndelRealigner extends ReadWalker { + + @Argument(fullName="intervals", shortName="intervals", doc="intervals file output from RealignerTargetCreator", required=true) + protected String intervalsFile = null; + + @Argument(fullName="LODThresholdForCleaning", shortName="LOD", doc="LOD threshold above which the cleaner will clean", required=false) + protected double LOD_THRESHOLD = 5.0; + @Argument(fullName="EntropyThreshold", shortName="entropy", doc="percentage of mismatches at a locus to be considered having high entropy", required=false) + protected double MISMATCH_THRESHOLD = 0.15; + + @Argument(fullName="OutputBam", shortName="O", required=false, doc="Output bam") + protected SAMFileWriter baseWriter = null; + + + @Argument(fullName="OutputIndels", shortName="indels", required=false, doc="Output file (text) for the indels found") + String OUT_INDELS = null; + @Argument(fullName="OutputCleanedReadsOnly", shortName="cleanedOnly", doc="print out cleaned reads only (otherwise, all reads within the intervals)", required=false) + boolean cleanedReadsOnly = false; + @Argument(fullName="statisticsFile", shortName="stats", doc="print out statistics (what does or doesn't get cleaned)", required=false) + String OUT_STATS = null; + @Argument(fullName="SNPsFile", shortName="snps", doc="print out whether mismatching columns do or don't get cleaned out", required=false) + String OUT_SNPS = null; + @Argument(fullName="maxConsensuses", shortName="maxConsensuses", doc="max alternate consensuses to try (necessary to improve performance in deep coverage)", required=false) + int MAX_CONSENSUSES = 30; + @Argument(fullName="maxReadsForConsensuses", shortName="greedy", doc="max reads used for finding the alternate consensuses (necessary to improve performance in deep coverage)", required=false) + int MAX_READS_FOR_CONSENSUSES = 120; + + @Argument(fullName="maxReadsForRealignment", shortName="maxReads", doc="max reads allowed at an interval for realignment", required=false) + int MAX_READS = 20000; + + + // the intervals input by the user + private Iterator intervals = null; + + // the current interval in the list + private GenomeLoc currentInterval = null; + + // the reads that fall into the current interval + private ReadBin readsToClean = new ReadBin(); + + // the wrapper around the SAM writer + private SortingSAMFileWriter writer = null; + + // random number generator + private Random generator; + + public static final int MAX_QUAL = 99; + public static final long RANDOM_SEED = 1252863495; + + // fraction of mismatches that need to no longer mismatch for a column to be considered cleaned + private static final double MISMATCH_COLUMN_CLEANED_FRACTION = 0.75; + + private static final double SW_MATCH = 30.0; // 1.0; + private static final double SW_MISMATCH = -10.0; //-1.0/3.0; + private static final double SW_GAP = -10.0; //-1.0-1.0/3.0; + private static final double SW_GAP_EXTEND = -2.0; //-1.0/.0; + + // other output files + private FileWriter indelOutput = null; + private FileWriter statsOutput = null; + private FileWriter snpsOutput = null; + + public void initialize() { + + if ( LOD_THRESHOLD < 0.0 ) + throw new RuntimeException("LOD threshold cannot be a negative number"); + if ( MISMATCH_THRESHOLD <= 0.0 || MISMATCH_THRESHOLD > 1.0 ) + throw new RuntimeException("Entropy threshold must be a fraction between 0 and 1"); + + List locs = GenomeAnalysisEngine.parseIntervalRegion(Arrays.asList(intervalsFile)); + intervals = GenomeLocSortedSet.createSetFromList(locs).iterator(); + currentInterval = intervals.hasNext() ? intervals.next() : null; + + if ( baseWriter != null ) + writer = new SortingSAMFileWriter(baseWriter, 50); + + generator = new Random(RANDOM_SEED); + + if ( OUT_INDELS != null ) { + try { + indelOutput = new FileWriter(new File(OUT_INDELS)); + } catch (Exception e) { + logger.warn("Failed to create output file "+ OUT_INDELS+". Indel output will be suppressed"); + err.println(e.getMessage()); + indelOutput = null; + } + } + if ( OUT_STATS != null ) { + try { + statsOutput = new FileWriter(new File(OUT_STATS)); + } catch (Exception e) { + logger.warn("Failed to create output file "+ OUT_STATS+". Cleaning stats output will be suppressed"); + err.println(e.getMessage()); + statsOutput = null; + } + } + if ( OUT_SNPS != null ) { + try { + snpsOutput = new FileWriter(new File(OUT_SNPS)); + } catch (Exception e) { + logger.warn("Failed to create output file "+ OUT_SNPS+". Cleaning snps output will be suppressed"); + err.println(e.getMessage()); + snpsOutput = null; + } + } + } + + private void emit(SAMRecord read) { + if ( writer != null ) + writer.addAlignment(read); + } + + private void emit(List reads) { + if ( writer != null ) + writer.addAlignments(reads); + } + + public Integer map(char[] ref, SAMRecord read) { + if ( currentInterval == null ) { + emit(read); + return 0; + } + + GenomeLoc readLoc = GenomeLocParser.createGenomeLoc(read); + + if ( readLoc.isBefore(currentInterval) ) { + emit(read); + return 0; + } + else if ( readLoc.overlapsP(currentInterval) ) { + if ( read.getReadUnmappedFlag() || + read.getDuplicateReadFlag() || + read.getNotPrimaryAlignmentFlag() || + read.getMappingQuality() == 0 || + read.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START || + Utils.is454Read(read) ) { + emit(read); + } + + readsToClean.add(read, ref); + if ( readsToClean.size() >= MAX_READS ) { + emit(readsToClean.getReads()); + readsToClean.clear(); + currentInterval = intervals.hasNext() ? intervals.next() : null; + } + } + else { // the read is past the current interval + clean(readsToClean); + emit(readsToClean.getReads()); + readsToClean.clear(); + currentInterval = intervals.hasNext() ? intervals.next() : null; + + // call back into map now that the state has been updated + map(ref, read); + } + + return 0; + } + + public Integer reduceInit() { + return 0; + } + + public Integer reduce(Integer value, Integer sum) { + return sum + value; + } + + public void onTraversalDone(Integer result) { + if ( readsToClean.size() > 0 ) { + clean(readsToClean); + emit(readsToClean.getReads()); + } + + writer.close(); + + if ( OUT_INDELS != null ) { + try { + indelOutput.close(); + } catch (Exception e) { + logger.error("Failed to close "+OUT_INDELS+" gracefully. Data may be corrupt."); + } + } + if ( OUT_STATS != null ) { + try { + statsOutput.close(); + } catch (Exception e) { + logger.error("Failed to close "+OUT_STATS+" gracefully. Data may be corrupt."); + } + } + if ( OUT_SNPS != null ) { + try { + snpsOutput.close(); + } catch (Exception e) { + logger.error("Failed to close "+OUT_SNPS+" gracefully. Data may be corrupt."); + } + } + } + + + private static int mismatchQualitySumIgnoreCigar(AlignedRead aRead, byte[] refSeq, int refIndex) { + byte[] readSeq = aRead.getRead().getReadBases(); + byte[] quals = aRead.getRead().getBaseQualities(); + int sum = 0; + for (int readIndex = 0 ; readIndex < readSeq.length ; refIndex++, readIndex++ ) { + if ( refIndex >= refSeq.length ) + sum += MAX_QUAL; + else { + byte refChr = refSeq[refIndex]; + byte readChr = readSeq[readIndex]; + if ( BaseUtils.simpleBaseToBaseIndex(readChr) == -1 || + BaseUtils.simpleBaseToBaseIndex(refChr) == -1 ) + continue; // do not count Ns/Xs/etc ? + if ( Character.toUpperCase(readChr) != Character.toUpperCase(refChr) ) + sum += (int)quals[readIndex]; + } + } + return sum; + } + + private static boolean readIsClipped(SAMRecord read) { + final Cigar c = read.getCigar(); + final int n = c.numCigarElements(); + if ( c.getCigarElement(n-1).getOperator() == CigarOperator.S || + c.getCigarElement(0).getOperator() == CigarOperator.S) return true; + return false; + } + + private void clean(ReadBin readsToClean) { + + List reads = readsToClean.getReads(); + byte[] reference = readsToClean.getRereference(); + long leftmostIndex = readsToClean.getLocation().getStart(); + + ArrayList refReads = new ArrayList(); // reads that perfectly match ref + ArrayList altReads = new ArrayList(); // reads that don't perfectly match + LinkedList altAlignmentsToTest = new LinkedList(); // should we try to make an alt consensus from the read? + ArrayList leftMovedIndels = new ArrayList(); + Set altConsenses = new LinkedHashSet(); // list of alt consenses + int totalMismatchSum = 0; + + + // decide which reads potentially need to be cleaned + for ( SAMRecord read : reads ) { + + // if ( debugOn ) { + // System.out.println(read.getReadName()+" "+read.getCigarString()+" "+read.getAlignmentStart()+"-"+read.getAlignmentEnd()); + // System.out.println(reference.substring((int)(read.getAlignmentStart()-leftmostIndex),(int)(read.getAlignmentEnd()-leftmostIndex))); + // System.out.println(read.getReadString()); + // } + + // we currently can not deal with clipped reads correctly (or screwy record) + if ( read.getCigar().numCigarElements() == 0 || readIsClipped(read) ) { + refReads.add(read); + continue; + } + + AlignedRead aRead = new AlignedRead(read); + + // first, move existing indels (for 1 indel reads only) to leftmost position within identical sequence + int numBlocks = AlignmentUtils.getNumAlignmentBlocks(read); + if ( numBlocks == 2 ) { + Cigar newCigar = indelRealignment(read.getCigar(), reference, read.getReadBases(), read.getAlignmentStart()-(int)leftmostIndex, 0); + if ( aRead.setCigar(newCigar) ) { + leftMovedIndels.add(aRead); + } + } + + int mismatchScore = mismatchQualitySumIgnoreCigar(aRead, reference, read.getAlignmentStart()-(int)leftmostIndex); + // if ( debugOn ) System.out.println("mismatchScore="+mismatchScore); + + // if this doesn't match perfectly to the reference, let's try to clean it + if ( mismatchScore > 0 ) { + altReads.add(aRead); + if ( !read.getDuplicateReadFlag() ) + totalMismatchSum += mismatchScore; + aRead.setMismatchScoreToReference(mismatchScore); + // if it has an indel, let's see if that's the best consensus + if ( numBlocks == 2 ) { + Consensus c = createAlternateConsensus(aRead.getAlignmentStart() - (int)leftmostIndex, aRead.getCigar(), reference, aRead.getRead().getReadBases()); + if ( c==null) {} //System.out.println("ERROR: Failed to create alt consensus for read "+aRead.getRead().getReadName()); + else altConsenses.add(c); + } + else { + // if ( debugOn ) System.out.println("Going to test..."); + altAlignmentsToTest.add(aRead); + } + } + // otherwise, we can emit it as is + else { + // if ( debugOn ) System.out.println("Emitting as is..."); + refReads.add(read); + } + } + + // choose alternate consensuses + if ( altAlignmentsToTest.size() <= MAX_READS_FOR_CONSENSUSES ) { + for ( AlignedRead aRead : altAlignmentsToTest ) { + // do a pairwise alignment against the reference + SWPairwiseAlignment swConsensus = new SWPairwiseAlignment(reference, aRead.getRead().getReadBases(), SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND); + Consensus c = createAlternateConsensus(swConsensus.getAlignmentStart2wrt1(), swConsensus.getCigar(), reference, aRead.getRead().getReadBases()); + if ( c != null) { + // if ( debugOn ) System.out.println("NEW consensus generated by SW: "+c.str ) ; + altConsenses.add(c); + } else { + // if ( debugOn ) System.out.println("FAILED to create Alt consensus from SW"); + } + } + } else { + // choose alternate consenses randomly + int readsSeen = 0; + while ( readsSeen++ < MAX_READS_FOR_CONSENSUSES && altConsenses.size() <= MAX_CONSENSUSES) { + int index = generator.nextInt(altAlignmentsToTest.size()); + AlignedRead aRead = altAlignmentsToTest.remove(index); + // do a pairwise alignment against the reference + SWPairwiseAlignment swConsensus = new SWPairwiseAlignment(reference, aRead.getRead().getReadBases(), SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND); + Consensus c = createAlternateConsensus(swConsensus.getAlignmentStart2wrt1(), swConsensus.getCigar(), reference, aRead.getRead().getReadBases()); + if ( c != null) + altConsenses.add(c); + } + } + + Consensus bestConsensus = null; + Iterator iter = altConsenses.iterator(); + + // if ( debugOn ) System.out.println("------\nChecking consenses...\n--------\n"); + + while ( iter.hasNext() ) { + Consensus consensus = iter.next(); + + // if ( debugOn ) System.out.println("Consensus: "+consensus.str); + + for ( int j = 0; j < altReads.size(); j++ ) { + AlignedRead toTest = altReads.get(j); + Pair altAlignment = findBestOffset(consensus.str, toTest); + + // the mismatch score is the min of its alignment vs. the reference and vs. the alternate + int myScore = altAlignment.second; + if ( myScore >= toTest.getMismatchScoreToReference() ) + myScore = toTest.getMismatchScoreToReference(); + // keep track of reads that align better to the alternate consensus. + // By pushing alignments with equal scores to the alternate, it means we'll over-call (het -> hom non ref) but are less likely to under-call (het -> ref, het non ref -> het) + else + consensus.readIndexes.add(new Pair(j, altAlignment.first)); + + //logger.debug(consensus.str + " vs. " + toTest.getRead().getReadString() + " => " + myScore + " - " + altAlignment.first); + if ( !toTest.getRead().getDuplicateReadFlag() ) + consensus.mismatchSum += myScore; + } + + //logger.debug(consensus.str + " " + consensus.mismatchSum); + if ( bestConsensus == null || bestConsensus.mismatchSum > consensus.mismatchSum) { + bestConsensus = consensus; + //logger.debug(consensus.str + " " + consensus.mismatchSum); + } + } + + // if the best alternate consensus has a smaller sum of quality score mismatches (more than + // the LOD threshold), and it didn't just move around the mismatching columns, then clean! + double improvement = (bestConsensus == null ? -1 : ((double)(totalMismatchSum - bestConsensus.mismatchSum))/10.0); + if ( improvement >= LOD_THRESHOLD ) { + + bestConsensus.cigar = indelRealignment(bestConsensus.cigar, reference, bestConsensus.str, bestConsensus.positionOnReference, bestConsensus.positionOnReference); + + // start cleaning the appropriate reads + for ( Pair indexPair : bestConsensus.readIndexes ) { + AlignedRead aRead = altReads.get(indexPair.first); + updateRead(bestConsensus.cigar, bestConsensus.positionOnReference, indexPair.second, aRead, (int)leftmostIndex); + } + if ( !alternateReducesEntropy(altReads, reference, leftmostIndex) ) { + if ( statsOutput != null ) { + try { + statsOutput.write(readsToClean.getLocation().toString()); + statsOutput.write("\tFAIL (bad indel)\t"); // if improvement > LOD_THRESHOLD *BUT* entropy is not reduced (SNPs still exist) + statsOutput.write(Double.toString(improvement)); + statsOutput.write("\n"); + statsOutput.flush(); + } catch (Exception e) {} + } + } else { + //logger.debug("CLEAN: " + AlignmentUtils.cigarToString(bestConsensus.cigar) + " " + bestConsensus.str ); + if ( indelOutput != null && bestConsensus.cigar.numCigarElements() > 1 ) { + // NOTE: indels are printed out in the format specified for the low-coverage pilot1 + // indel calls (tab-delimited): chr position size type sequence + StringBuilder str = new StringBuilder(); + str.append(reads.get(0).getReferenceName()); + int position = bestConsensus.positionOnReference + bestConsensus.cigar.getCigarElement(0).getLength(); + str.append("\t" + (leftmostIndex + position - 1)); + CigarElement ce = bestConsensus.cigar.getCigarElement(1); + str.append("\t" + ce.getLength() + "\t" + ce.getOperator() + "\t"); + int length = ce.getLength(); + if ( ce.getOperator() == CigarOperator.D ) { + for ( int i = 0; i < length; i++) + str.append(reference[position+i]); + } else { + for ( int i = 0; i < length; i++) + str.append(bestConsensus.str[position+i]); + } + str.append("\t" + (((double)(totalMismatchSum - bestConsensus.mismatchSum))/10.0) + "\n"); + try { + indelOutput.write(str.toString()); + indelOutput.flush(); + } catch (Exception e) {} + } + if ( statsOutput != null ) { + try { + statsOutput.write(readsToClean.getLocation().toString()); + statsOutput.write("\tCLEAN"); // if improvement > LOD_THRESHOLD *AND* entropy is reduced + if ( bestConsensus.cigar.numCigarElements() > 1 ) + statsOutput.write(" (found indel)"); + statsOutput.write("\t"); + statsOutput.write(Double.toString(improvement)); + statsOutput.write("\n"); + statsOutput.flush(); + } catch (Exception e) {} + } + + // We need to update the mapping quality score of the cleaned reads; + // however we don't have enough info to use the proper MAQ scoring system. + // For now, we'll use a heuristic: + // the mapping quality score is improved by the LOD difference in mismatching + // bases between the reference and alternate consensus (divided by 10) + + // finish cleaning the appropriate reads + for ( Pair indexPair : bestConsensus.readIndexes ) { + AlignedRead aRead = altReads.get(indexPair.first); + if ( aRead.finalizeUpdate() ) { + aRead.getRead().setMappingQuality(Math.min(aRead.getRead().getMappingQuality() + (int)(improvement/10.0), 255)); + aRead.getRead().setAttribute("NM", AlignmentUtils.numMismatches(aRead.getRead(), reference, aRead.getRead().getAlignmentStart()-(int)leftmostIndex)); + } + } + } + + // END IF ( improvement >= LOD_THRESHOLD ) + + } else if ( statsOutput != null ) { + try { + statsOutput.write(readsToClean.getLocation().toString()); + statsOutput.write("\tFAIL\t"); // if improvement < LOD_THRESHOLD + statsOutput.write(Double.toString(improvement)); + statsOutput.write("\n"); + statsOutput.flush(); + } catch (Exception e) {} + } + } + + private Consensus createAlternateConsensus(int indexOnRef, Cigar c, byte[] reference, byte[] readStr) { + if ( indexOnRef < 0 ) + return null; + + // create the new consensus + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < indexOnRef; i++) + sb.append(reference[i]); + //logger.debug("CIGAR = " + AlignmentUtils.cigarToString(c)); + + int indelCount = 0; + int altIdx = 0; + int refIdx = indexOnRef; + boolean ok_flag = true; + for ( int i = 0 ; i < c.numCigarElements() ; i++ ) { + CigarElement ce = c.getCigarElement(i); + int elementLength = ce.getLength(); + switch( ce.getOperator() ) { + case D: + indelCount++; + refIdx += elementLength; + break; + case M: + if ( reference.length < refIdx + elementLength ) + ok_flag = false; + else { + for (int j = 0; j < elementLength; j++) + sb.append(reference[refIdx+j]); + } + refIdx += elementLength; + altIdx += elementLength; + break; + case I: + for (int j = 0; j < elementLength; j++) + sb.append((char)readStr[altIdx + j]); + altIdx += elementLength; + indelCount++; + break; + } + } + // make sure that there is at most only a single indel and it aligns appropriately! + if ( !ok_flag || indelCount != 1 || reference.length < refIdx ) + return null; + + for (int i = refIdx; i < reference.length; i++) + sb.append(reference[i]); + byte[] altConsensus = StringUtil.stringToBytes(sb.toString()); // alternative consensus sequence we just built from the cuurent read + + // if ( debugOn ) System.out.println("Alt consensus generated: "+altConsensus); + + return new Consensus(altConsensus, c, indexOnRef); + } + + private Pair findBestOffset(byte[] ref, AlignedRead read) { + int attempts = ref.length - read.getReadLength() + 1; + int bestScore = mismatchQualitySumIgnoreCigar(read, ref, 0); + int bestIndex = 0; + for ( int i = 1; i < attempts; i++ ) { + // we can't get better than 0! + if ( bestScore == 0 ) + return new Pair(bestIndex, 0); + int score = mismatchQualitySumIgnoreCigar(read, ref, i); + if ( score < bestScore ) { + bestScore = score; + bestIndex = i; + } + } + return new Pair(bestIndex, bestScore); + } + + + private void updateRead(Cigar altCigar, int altPosOnRef, int myPosOnAlt, AlignedRead aRead, int leftmostIndex) { + Cigar readCigar = new Cigar(); + + // special case: there is no indel + if ( altCigar.getCigarElements().size() == 1 ) { + aRead.setAlignmentStart(leftmostIndex + myPosOnAlt); + readCigar.add(new CigarElement(aRead.getReadLength(), CigarOperator.M)); + aRead.setCigar(readCigar); + return; + } + + CigarElement altCE1 = altCigar.getCigarElement(0); + CigarElement altCE2 = altCigar.getCigarElement(1); + + // the easiest thing to do is to take each case separately + int endOfFirstBlock = altPosOnRef + altCE1.getLength(); + boolean sawAlignmentStart = false; + + // for reads starting before the indel + if ( myPosOnAlt < endOfFirstBlock) { + aRead.setAlignmentStart(leftmostIndex + myPosOnAlt); + sawAlignmentStart = true; + + // for reads ending before the indel + if ( myPosOnAlt + aRead.getReadLength() <= endOfFirstBlock) { + readCigar.add(new CigarElement(aRead.getReadLength(), CigarOperator.M)); + aRead.setCigar(readCigar); + return; + } + readCigar.add(new CigarElement(endOfFirstBlock - myPosOnAlt, CigarOperator.M)); + } + + int indelOffsetOnRef = 0, indelOffsetOnRead = 0; + // forward along the indel + if ( altCE2.getOperator() == CigarOperator.I ) { + // for reads that end in an insertion + if ( myPosOnAlt + aRead.getReadLength() < endOfFirstBlock + altCE2.getLength() ) { + readCigar.add(new CigarElement(myPosOnAlt + aRead.getReadLength() - endOfFirstBlock, CigarOperator.I)); + aRead.setCigar(readCigar); + return; + } + + // for reads that start in an insertion + if ( !sawAlignmentStart && myPosOnAlt < endOfFirstBlock + altCE2.getLength() ) { + aRead.setAlignmentStart(leftmostIndex + endOfFirstBlock); + readCigar.add(new CigarElement(altCE2.getLength() - (myPosOnAlt - endOfFirstBlock), CigarOperator.I)); + indelOffsetOnRead = myPosOnAlt - endOfFirstBlock; + sawAlignmentStart = true; + } else if ( sawAlignmentStart ) { + readCigar.add(altCE2); + indelOffsetOnRead = altCE2.getLength(); + } + } else if ( altCE2.getOperator() == CigarOperator.D ) { + if ( sawAlignmentStart ) + readCigar.add(altCE2); + indelOffsetOnRef = altCE2.getLength(); + } else { + throw new RuntimeException("Operator of middle block is not I or D: " + altCE2.getOperator()); + } + + // for reads that start after the indel + if ( !sawAlignmentStart ) { + aRead.setAlignmentStart(leftmostIndex + myPosOnAlt + indelOffsetOnRef - indelOffsetOnRead); + readCigar.add(new CigarElement(aRead.getReadLength(), CigarOperator.M)); + aRead.setCigar(readCigar); + return; + } + + int readRemaining = aRead.getReadLength(); + for ( CigarElement ce : readCigar.getCigarElements() ) { + if ( ce.getOperator() != CigarOperator.D ) + readRemaining -= ce.getLength(); + } + if ( readRemaining > 0 ) + readCigar.add(new CigarElement(readRemaining, CigarOperator.M)); + aRead.setCigar(readCigar); + } + + private boolean alternateReducesEntropy(List reads, byte[] reference, long leftmostIndex) { + int[] originalMismatchBases = new int[reference.length]; + int[] cleanedMismatchBases = new int[reference.length]; + int[] totalOriginalBases = new int[reference.length]; + int[] totalCleanedBases = new int[reference.length]; + + // set to 1 to prevent dividing by zero + for ( int i=0; i < reference.length; i++ ) + originalMismatchBases[i] = totalOriginalBases[i] = cleanedMismatchBases[i] = totalCleanedBases[i] = 1; + + for (int i=0; i < reads.size(); i++) { + AlignedRead read = reads.get(i); + if ( read.getRead().getAlignmentBlocks().size() > 1 ) + continue; + + int refIdx = read.getOriginalAlignmentStart() - (int)leftmostIndex; + byte[] readStr = read.getRead().getReadBases(); + byte[] quals = read.getRead().getBaseQualities(); + + for (int j=0; j < readStr.length; j++, refIdx++ ) { + if ( refIdx < 0 || refIdx >= reference.length ) { + //System.out.println( "Read: "+read.getRead().getReadName() + "; length = " + readStr.length() ); + //System.out.println( "Ref left: "+ leftmostIndex +"; ref length=" + reference.length() + "; read alignment start: "+read.getOriginalAlignmentStart() ); + break; + } + totalOriginalBases[refIdx] += quals[j]; + if ( readStr[j] != reference[refIdx] ) + originalMismatchBases[refIdx] += quals[j]; + } + + // reset and now do the calculation based on the cleaning + refIdx = read.getAlignmentStart() - (int)leftmostIndex; + int altIdx = 0; + Cigar c = read.getCigar(); + for (int j = 0 ; j < c.numCigarElements() ; j++) { + CigarElement ce = c.getCigarElement(j); + int elementLength = ce.getLength(); + switch ( ce.getOperator() ) { + case M: + for (int k = 0 ; k < elementLength ; k++, refIdx++, altIdx++ ) { + if ( refIdx >= reference.length ) + break; + totalCleanedBases[refIdx] += quals[altIdx]; + if ( readStr[altIdx] != reference[refIdx] ) + cleanedMismatchBases[refIdx] += quals[altIdx]; + } + break; + case I: + altIdx += elementLength; + break; + case D: + refIdx += elementLength; + break; + } + + } + } + + int originalMismatchColumns = 0, cleanedMismatchColumns = 0; + StringBuilder sb = new StringBuilder(); + for ( int i=0; i < reference.length; i++ ) { + if ( cleanedMismatchBases[i] == originalMismatchBases[i] ) + continue; + boolean didMismatch = false, stillMismatches = false; + if ( originalMismatchBases[i] > totalOriginalBases[i] * MISMATCH_THRESHOLD ) { + didMismatch = true; + originalMismatchColumns++; + if ( (cleanedMismatchBases[i] / totalCleanedBases[i]) > (originalMismatchBases[i] / totalOriginalBases[i]) * (1.0 - MISMATCH_COLUMN_CLEANED_FRACTION) ) { + stillMismatches = true; + cleanedMismatchColumns++; + } + } else if ( cleanedMismatchBases[i] > totalCleanedBases[i] * MISMATCH_THRESHOLD ) { + cleanedMismatchColumns++; + } + if ( snpsOutput != null ) { + if ( didMismatch ) { + sb.append(reads.get(0).getRead().getReferenceName() + ":"); + sb.append(((int)leftmostIndex + i)); + if ( stillMismatches ) + sb.append(" SAME_SNP\n"); + else + sb.append(" NOT_SNP\n"); + } + } + } + + //logger.debug("Original mismatch columns = " + originalMismatchColumns + "; cleaned mismatch columns = " + cleanedMismatchColumns); + + boolean reduces = (originalMismatchColumns == 0 || cleanedMismatchColumns < originalMismatchColumns); + if ( reduces && snpsOutput != null ) { + try { + snpsOutput.write(sb.toString()); + snpsOutput.flush(); + } catch (Exception e) {} + } + return reduces; + } + + /** Takes the alignment of the read sequence readSeq to the reference sequence refSeq + * starting at 0-based position refIndex on the refSeq and specified by its cigar. + * The last argument readIndex specifies 0-based position on the read where the alignment described by the + * cigar starts. Usually cigars specify alignments of the whole read to the ref, so that readIndex is normally 0. + * Use non-zero readIndex only when the alignment cigar represents alignment of a part of the read. The refIndex in this case + * should be the position where the alignment of that part of the read starts at. In other words, both refIndex and readIndex are + * always the positions where the cigar starts on the ref and on the read, respectively. + * + * If the alignment has an indel, then this method attempts moving this indel left across a stretch of repetitive bases. For instance, if the original cigar + * specifies that (any) one AT is deleted from a repeat sequence TATATATA, the output cigar will always mark the leftmost AT + * as deleted. If there is no indel in the original cigar, or the indel position is determined unambiguously (i.e. inserted/deleted sequence + * is not repeated), the original cigar is returned. + * @param cigar structure of the original alignment + * @param refSeq reference sequence the read is aligned to + * @param readSeq read sequence + * @param refIndex 0-based alignment start position on ref + * @param readIndex 0-based alignment start position on read + * @return a cigar, in which indel is guaranteed to be placed at the leftmost possible position across a repeat (if any) + */ + private Cigar indelRealignment(Cigar cigar, byte[] refSeq, byte[] readSeq, int refIndex, int readIndex) { + if ( cigar.numCigarElements() < 2 ) return cigar; // no indels, nothing to do + + CigarElement ce1 = cigar.getCigarElement(0); + CigarElement ce2 = cigar.getCigarElement(1); + + // we currently can not handle clipped reads; alternatively, if the alignment starts from insertion, there + // is no place on the read to move that insertion further left; so we are done: + if ( ce1.getOperator() != CigarOperator.M ) return cigar; + + int difference = 0; // we can move indel 'difference' bases left + final int indel_length = ce2.getLength(); + + String indelString; // inserted or deleted sequence + int period = 0; // period of the inserted/deleted sequence + int indelIndexOnRef = refIndex+ce1.getLength() ; // position of the indel on the REF (first deleted base or first base after insertion) + int indelIndexOnRead = readIndex+ce1.getLength(); // position of the indel on the READ (first insterted base, of first base after deletion) + + if ( ce2.getOperator() == CigarOperator.D ) + indelString = new String(refSeq, indelIndexOnRef, ce2.getLength()).toUpperCase(); // deleted bases + else if ( ce2.getOperator() == CigarOperator.I ) + indelString = new String(readSeq, indelIndexOnRead, ce2.getLength()).toUpperCase(); // get the inserted bases + else + // we can get here if there is soft clipping done at the beginning of the read + // for now, we'll just punt the issue and not try to realign these + return cigar; + + // now we have to check all WHOLE periods of the indel sequence: + // for instance, if + // REF: AGCTATATATAGCC + // READ: GCTAT***TAGCC + // the deleted sequence ATA does have period of 2, but deletion obviously can not be + // shifted left by 2 bases (length 3 does not contain whole number of periods of 2); + // however if 4 bases are deleted: + // REF: AGCTATATATAGCC + // READ: GCTA****TAGCC + // the length 4 is a multiple of the period of 2, and indeed deletion site can be moved left by 2 bases! + // Also, we will always have to check the length of the indel sequence itself (trivial period). If the smallest + // period is 1 (which means that indel sequence is a homo-nucleotide sequence), we obviously do not have to check + // any other periods. + + // NOTE: we treat both insertions and deletions in the same way below: we always check if the indel sequence + // repeats itsels on the REF (never on the read!), even for insertions: if we see TA inserted and REF has, e.g., CATATA prior to the insertion + // position, we will move insertion left, to the position right after CA. This way, while moving the indel across the repeat + // on the ref, we can theoretically move it across a non-repeat on the read if the latter has a mismtach. + + while ( period < indel_length ) { // we will always get at least trivial period = indelStringLength + + period = BaseUtils.sequencePeriod(indelString, period+1); + + if ( indel_length % period != 0 ) continue; // if indel sequence length is not a multiple of the period, it's not gonna work + + int newIndex = indelIndexOnRef; + + while ( newIndex >= period ) { // let's see if there is a repeat, i.e. if we could also say that same bases at lower position are deleted + + // lets check if bases [newIndex-period,newIndex) immediately preceding the indel on the ref + // are the same as the currently checked period of the inserted sequence: + + boolean match = true; + + for ( int testRefPos = newIndex - period, indelPos = 0 ; testRefPos < newIndex; testRefPos++, indelPos++) { + char indelChr = indelString.charAt(indelPos); + if ( Character.toUpperCase(refSeq[testRefPos]) != indelChr || BaseUtils.simpleBaseToBaseIndex(indelChr) == -1 ) { + match = false; + break; + } + } + if ( match ) + newIndex -= period; // yes, they are the same, we can move indel farther left by at least period bases, go check if we can do more... + else break; // oops, no match, can not push indel farther left + } + + final int newDifference = indelIndexOnRef - newIndex; + if ( newDifference > difference ) difference = newDifference; // deletion should be moved 'difference' bases left + + if ( period == 1 ) break; // we do not have to check all periods of homonucleotide sequences, we already + // got maximum possible shift after checking period=1 above. + } + + // if ( ce2.getLength() >= 2 ) + // System.out.println("-----------------------------------\n FROM:\n"+AlignmentUtils.alignmentToString(cigar,readSeq,refSeq,refIndex, (readIsConsensusSequence?refIndex:0))); + + + if ( difference > 0 ) { + + // The following if() statement: this should've never happened, unless the alignment is really screwed up. + // A real life example: + // + // ref: TTTTTTTTTTTTTTTTTT******TTTTTACTTATAGAAGAAAT... + // read: GTCTTTTTTTTTTTTTTTTTTTTTTTACTTATAGAAGAAAT... + // + // i.e. the alignment claims 6 T's to be inserted. The alignment is clearly malformed/non-conforming since we could + // have just 3 T's inserted (so that the beginning of the read maps right onto the beginning of the + // reference fragment shown): that would leave us with same 2 mismatches at the beginning of the read + // (G and C) but lower gap penalty. Note that this has nothing to do with the alignment being "right" or "wrong" + // with respect to where on the DNA the read actually came from. It is the assumptions of *how* the alignments are + // built and represented that are broken here. While it is unclear how the alignment shown above could be generated + // in the first place, we are not in the business of fixing incorrect alignments in this method; all we are + // trying to do is to left-adjust correct ones. So if something like that happens, we refuse to change the cigar + // and bail out. + if ( ce1.getLength()-difference < 0 ) return cigar; + + Cigar newCigar = new Cigar(); + // do not add leading M cigar element if its length is zero (i.e. if we managed to left-shift the + // insertion all the way to the read start): + if ( ce1.getLength() - difference > 0 ) + newCigar.add(new CigarElement(ce1.getLength()-difference, CigarOperator.M)); + newCigar.add(ce2); // add the indel, now it's left shifted since we decreased the number of preceding matching bases + + if ( cigar.numCigarElements() > 2 ) { + // if we got something following the indel element: + + if ( cigar.getCigarElement(2).getOperator() == CigarOperator.M ) { + // if indel was followed by matching bases (that's the most common situation), + // increase the length of the matching section after the indel by the amount of left shift + // (matching bases that were on the left are now *after* the indel; we have also checked at the beginning + // that the first cigar element was also M): + newCigar.add(new CigarElement(cigar.getCigarElement(2).getLength()+difference, CigarOperator.M)); + } else { + // if the element after the indel was not M, we have to add just the matching bases that were on the left + // and now appear after the indel after we performed the shift. Then add the original element that followed the indel. + newCigar.add(new CigarElement(difference, CigarOperator.M)); + newCigar.add(new CigarElement(cigar.getCigarElement(2).getLength(),cigar.getCigarElement(2).getOperator())); + } + // now add remaining (unchanged) cigar elements, if any: + for ( int i = 3 ; i < cigar.numCigarElements() ; i++ ) { + newCigar.add(new CigarElement(cigar.getCigarElement(i).getLength(),cigar.getCigarElement(i).getOperator())); + } + } + + //logger.debug("Realigning indel: " + AlignmentUtils.cigarToString(cigar) + " to " + AlignmentUtils.cigarToString(newCigar)); + cigar = newCigar; + + } + return cigar; + } + + private class AlignedRead { + private SAMRecord read; + private Cigar newCigar = null; + private int newStart = -1; + private int mismatchScoreToReference; + private boolean updated = false; + + public AlignedRead(SAMRecord read) { + this.read = read; + mismatchScoreToReference = 0; + } + + public SAMRecord getRead() { + return read; + } + + public int getReadLength() { + return read.getReadLength(); + } + + public Cigar getCigar() { + return (newCigar != null ? newCigar : read.getCigar()); + } + + // tentatively sets the new Cigar, but it needs to be confirmed later + // returns true if the new cigar is a valid change (i.e. not same as original and doesn't remove indel) + public boolean setCigar(Cigar cigar) { + if ( getCigar().equals(cigar) ) + return false; + + String str = AlignmentUtils.cigarToString(cigar); + if ( !str.contains("D") && !str.contains("I") ) + return false; + + newCigar = cigar; + return true; + } + + // tentatively sets the new start, but it needs to be confirmed later + public void setAlignmentStart(int start) { + newStart = start; + } + + public int getAlignmentStart() { + return (newStart != -1 ? newStart : read.getAlignmentStart()); + } + + public int getOriginalAlignmentStart() { + return read.getAlignmentStart(); + } + + // finalizes the changes made. + // returns true if this record actually changes, false otherwise + public boolean finalizeUpdate() { + // if we haven't made any changes, don't do anything + if ( newCigar == null ) + return false; + if ( newStart == -1 ) + newStart = read.getAlignmentStart(); + + // if it's a paired end read, we need to update the insert size + if ( read.getReadPairedFlag() ) { + int insertSize = read.getInferredInsertSize(); + if ( insertSize > 0 ) { + read.setCigar(newCigar); + read.setInferredInsertSize(insertSize + read.getAlignmentStart() - newStart); + read.setAlignmentStart(newStart); + } else { + // note that the correct order of actions is crucial here + int oldEnd = read.getAlignmentEnd(); + read.setCigar(newCigar); + read.setAlignmentStart(newStart); + read.setInferredInsertSize(insertSize + oldEnd - read.getAlignmentEnd()); + } + } else { + read.setCigar(newCigar); + read.setAlignmentStart(newStart); + } + updated = true; + return true; + } + + public boolean wasUpdated() { + return updated; + } + + public void setMismatchScoreToReference(int score) { + mismatchScoreToReference = score; + } + + public int getMismatchScoreToReference() { + return mismatchScoreToReference; + } + } + + private class Consensus { + public byte[] str; + public int mismatchSum; + public int positionOnReference; + public Cigar cigar; + public ArrayList> readIndexes; + + public Consensus(byte[] str, Cigar cigar, int positionOnReference) { + this.str = str; + this.cigar = cigar; + this.positionOnReference = positionOnReference; + mismatchSum = 0; + readIndexes = new ArrayList>(); + } + + public boolean equals(Object o) { + return ( this == o || (o instanceof Consensus && this.str.equals(((Consensus)o).str)) ); + } + + public boolean equals(Consensus c) { + return ( this == c || this.str.equals(c.str) ); + } + } + + private class ReadBin { + + private ArrayList reads = new ArrayList(); + private char[] reference = null; + private GenomeLoc loc = null; + + public ReadBin() { } + + public void add(SAMRecord read, char[] ref) { + reads.add(read); + + // set up the reference + if ( reference == null ) { + reference = ref; + loc = GenomeLocParser.createGenomeLoc(read); + } else { + long lastPosWithRefBase = loc.getStart() + reference.length -1; + int neededBases = (int)(read.getAlignmentEnd() - lastPosWithRefBase); + char[] newReference = new char[reference.length + neededBases]; + System.arraycopy(reference, 0, newReference, 0, reference.length); + System.arraycopy(ref, ref.length-neededBases, newReference, reference.length, neededBases); + reference = newReference; + } + } + + public List getReads() { return reads; } + + public byte[] getRereference() { + // upper case it + for ( int i = 0; i < reference.length; i++ ) + reference[i] = Character.toUpperCase(reference[i]); + + // convert it to a byte array + byte[] refArray = new byte[reference.length]; + StringUtil.charsToBytes(reference, 0, reference.length, refArray, 0); + return refArray; + } + + public GenomeLoc getLocation() { return loc; } + + public int size() { return reads.size(); } + + public void clear() { + reads.clear(); + reference = null; + loc = null; + } + + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/indels/IntervalCleanerWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/indels/IntervalCleanerWalker.java index 96159f9b6..0b63c5bb1 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/indels/IntervalCleanerWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/indels/IntervalCleanerWalker.java @@ -10,6 +10,7 @@ import org.broadinstitute.sting.gatk.filters.ZeroMappingQualityReadFilter; import org.broadinstitute.sting.utils.cmdLine.Argument; import net.sf.samtools.*; +import net.sf.samtools.util.StringUtil; import java.util.*; import java.io.File; @@ -279,7 +280,7 @@ public class IntervalCleanerWalker extends LocusWindowWalker if ( altAlignmentsToTest.size() <= MAX_READS_FOR_CONSENSUSES ) { for ( AlignedRead aRead : altAlignmentsToTest ) { // do a pairwise alignment against the reference - SWPairwiseAlignment swConsensus = new SWPairwiseAlignment(reference, aRead.getRead().getReadString(), SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND); + SWPairwiseAlignment swConsensus = new SWPairwiseAlignment(StringUtil.stringToBytes(reference), aRead.getRead().getReadBases(), SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND); Consensus c = createAlternateConsensus(swConsensus.getAlignmentStart2wrt1(), swConsensus.getCigar(), reference, aRead.getRead().getReadBases()); if ( c != null) { // if ( debugOn ) System.out.println("NEW consensus generated by SW: "+c.str ) ; @@ -295,7 +296,7 @@ public class IntervalCleanerWalker extends LocusWindowWalker int index = generator.nextInt(altAlignmentsToTest.size()); AlignedRead aRead = altAlignmentsToTest.remove(index); // do a pairwise alignment against the reference - SWPairwiseAlignment swConsensus = new SWPairwiseAlignment(reference, aRead.getRead().getReadString(), SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND); + SWPairwiseAlignment swConsensus = new SWPairwiseAlignment(StringUtil.stringToBytes(reference), aRead.getRead().getReadBases(), SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND); Consensus c = createAlternateConsensus(swConsensus.getAlignmentStart2wrt1(), swConsensus.getCigar(), reference, aRead.getRead().getReadBases()); if ( c != null) altConsenses.add(c); @@ -404,7 +405,7 @@ public class IntervalCleanerWalker extends LocusWindowWalker AlignedRead aRead = altReads.get(indexPair.first); if ( aRead.finalizeUpdate() ) { aRead.getRead().setMappingQuality(Math.min(aRead.getRead().getMappingQuality() + (int)(improvement/10.0), 255)); - aRead.getRead().setAttribute("NM", AlignmentUtils.numMismatches(aRead.getRead(), reference, aRead.getRead().getAlignmentStart()-(int)leftmostIndex)); + aRead.getRead().setAttribute("NM", AlignmentUtils.numMismatches(aRead.getRead(), StringUtil.stringToBytes(reference), aRead.getRead().getAlignmentStart()-(int)leftmostIndex)); } } } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/indels/SortingSAMFileWriter.java b/java/src/org/broadinstitute/sting/gatk/walkers/indels/SortingSAMFileWriter.java new file mode 100755 index 000000000..72bd87afe --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/walkers/indels/SortingSAMFileWriter.java @@ -0,0 +1,118 @@ +package org.broadinstitute.sting.gatk.walkers.indels; + +import net.sf.samtools.*; + +import java.util.TreeSet; +import java.util.Iterator; +import java.util.List; + +/** + * @author ebanks + * SortingSAMFileWriter + * + * this class extends the samtools SAMFileWriter class and caches reads for N loci so that reads + * can be emitted out of order (provided they are within the N-locus window) + * + */ +public class SortingSAMFileWriter implements SAMFileWriter { + + // the base writer from Picard + private SAMFileWriter baseWriter; + + // the window over which we agree to accumulate reads + private int window; + + // the reads we are accumulating + private TreeSet cachedReads = new TreeSet(new SAMRecordCoordinateComparator()); + + + /** + * Constructor + * + * @param baseWriter the real SAMFileWriter + * @param window the window over which we agree to store reads + */ + public SortingSAMFileWriter(SAMFileWriter baseWriter, int window) { + this.baseWriter = baseWriter; + this.window = window; + } + + /** + * Add a read to the writer for emission + * + * @param read the read to emit + */ + public void addAlignment(SAMRecord read) { + // at a new contig, clear the cache + if ( cachedReads.size() > 0 && cachedReads.first().getReferenceIndex() < read.getReferenceIndex() ) + clearCache(); + + long currentPos = read.getAlignmentStart(); + + Iterator iter = cachedReads.iterator(); + while ( iter.hasNext() ) { + SAMRecord cachedRead = iter.next(); + if ( currentPos - cachedRead.getAlignmentStart() >= window ) { + baseWriter.addAlignment(cachedRead); + iter.remove(); + } else { + break; + } + } + + cachedReads.add(read); + } + + /** + * Add a list of reads to the writer for emission; the reads do NOT need to be sorted + * + * @param reads the reads to emit + */ + public void addAlignments(List reads) { + if ( reads.size() == 0 ) + return; + + // at a new contig, clear the cache + if ( cachedReads.size() > 0 && cachedReads.first().getReferenceIndex() < reads.get(0).getReferenceIndex() ) + clearCache(); + + cachedReads.addAll(reads); + + // get the last read in the cache + SAMRecord last = cachedReads.last(); + + long currentPos = last.getAlignmentStart(); + + Iterator iter = cachedReads.iterator(); + while ( iter.hasNext() ) { + SAMRecord cachedRead = iter.next(); + if ( currentPos - cachedRead.getAlignmentStart() >= window ) { + baseWriter.addAlignment(cachedRead); + iter.remove(); + } else { + break; + } + } + } + + /** + * get the SAM file header + */ + public SAMFileHeader getFileHeader() { + return baseWriter.getFileHeader(); + } + + /** + * close this writer by clearing the cache + */ + public void close() { + clearCache(); + } + + private void clearCache() { + Iterator iter = cachedReads.iterator(); + while ( iter.hasNext() ) + baseWriter.addAlignment(iter.next()); + cachedReads.clear(); + } +} diff --git a/java/src/org/broadinstitute/sting/utils/AlignmentUtils.java b/java/src/org/broadinstitute/sting/utils/AlignmentUtils.java index d8d263b9e..2499f58eb 100644 --- a/java/src/org/broadinstitute/sting/utils/AlignmentUtils.java +++ b/java/src/org/broadinstitute/sting/utils/AlignmentUtils.java @@ -4,6 +4,7 @@ import net.sf.samtools.CigarOperator; import net.sf.samtools.SAMRecord; import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; +import net.sf.samtools.util.StringUtil; import net.sf.picard.reference.ReferenceSequence; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.utils.pileup.*; @@ -113,11 +114,11 @@ public class AlignmentUtils { /** See {@link #numMismatches(SAMRecord, ReferenceSequence)}. This method implements same functionality * for reference sequence specified as conventional java string (of bases). By default, it is assumed that * the alignment starts at (1-based) position r.getAlignmentStart() on the reference refSeq. - * See {@link #numMismatches(SAMRecord, String, int)} if this is not the case. + * See {@link #numMismatches(SAMRecord, byte[], int)} if this is not the case. */ public static int numMismatches(SAMRecord r, String refSeq ) { if ( r.getReadUnmappedFlag() ) return 1000000; - return numMismatches(r, refSeq, r.getAlignmentStart()-1); + return numMismatches(r, StringUtil.stringToBytes(refSeq), r.getAlignmentStart()-1); } /** Returns number of mismatches in the alignment r to the reference sequence @@ -125,6 +126,8 @@ public class AlignmentUtils { * specified reference sequence; in other words, refIndex is used in place of alignment's own * getAlignmentStart() coordinate and the latter is never used. However, the structure of the alignment r * (i.e. it's cigar string with all the insertions/deletions it may specify) is fully respected. + * + * THIS CODE ASSUMES THAT ALL BYTES COME FROM UPPERCASED CHARS. * * @param r alignment * @param refSeq chunk of reference sequence that subsumes the alignment completely (if alignment runs out of @@ -132,7 +135,7 @@ public class AlignmentUtils { * @param refIndex zero-based position, at which the alignment starts on the specified reference string. * @return the number of mismatches */ - public static int numMismatches(SAMRecord r, String refSeq, int refIndex) { + public static int numMismatches(SAMRecord r, byte[] refSeq, int refIndex) { int readIdx = 0; int mismatches = 0; byte[] readSeq = r.getReadBases(); @@ -142,15 +145,15 @@ public class AlignmentUtils { switch ( ce.getOperator() ) { case M: for (int j = 0 ; j < ce.getLength() ; j++, refIndex++, readIdx++ ) { - if ( refIndex >= refSeq.length() ) + if ( refIndex >= refSeq.length ) continue; - char refChr = refSeq.charAt(refIndex); - char readChr = (char)readSeq[readIdx]; + byte refChr = refSeq[refIndex]; + byte readChr = readSeq[readIdx]; // Note: we need to count X/N's as mismatches because that's what SAM requires //if ( BaseUtils.simpleBaseToBaseIndex(readChr) == -1 || // BaseUtils.simpleBaseToBaseIndex(refChr) == -1 ) // continue; // do not count Ns/Xs/etc ? - if ( Character.toUpperCase(readChr) != Character.toUpperCase(refChr) ) + if ( readChr != refChr ) mismatches++; } break; diff --git a/java/src/org/broadinstitute/sting/utils/SWPairwiseAlignment.java b/java/src/org/broadinstitute/sting/utils/SWPairwiseAlignment.java index 2f31b4cc7..69ad52469 100755 --- a/java/src/org/broadinstitute/sting/utils/SWPairwiseAlignment.java +++ b/java/src/org/broadinstitute/sting/utils/SWPairwiseAlignment.java @@ -16,10 +16,6 @@ import java.util.Collections; * To change this template use File | Settings | File Templates. */ public class SWPairwiseAlignment { - private String s1; - private String s2; - private int i1; - private int i2; private int alignment_offset; // offset of s2 w/respect to s1 private Cigar alignmentCigar; @@ -28,428 +24,48 @@ public class SWPairwiseAlignment { private double w_open; private double w_extend; - private int best_mm; // mismatch count - private static final int IMPOSSIBLE = 1000000000; private static final int MSTATE = 0; private static final int ISTATE = 1; private static final int DSTATE = 2; - public SWPairwiseAlignment(String seq1, String seq2, int id1, int id2, double match, double mismatch, double open, double extend ) { - s1 = seq1; - s2 = seq2; - i1 = id1; - i2 = id2; + // ************************************************************************ + // **** IMPORTANT NOTE: **** + // **** This class assumes that all bytes come from UPPERCASED chars! **** + // ************************************************************************ + public SWPairwiseAlignment(byte[] seq1, byte[] seq2, double match, double mismatch, double open, double extend ) { w_match = match; w_mismatch = mismatch; w_open = open; w_extend = extend; - best_mm = IMPOSSIBLE; - //next_mm = IMPOSSIBLE; - align4(s1,s2); + align(seq1,seq2); } - public SWPairwiseAlignment(String seq1, String seq2, int id1, int id2) { - this(seq1,seq2,id1,id2,1.0,-1.0/3.0,-1.0-1.0/3.0,-1.0/3.0); // match=1, mismatch = -1/3, gap=-(1+k/3) - } - - /** Initializes the alignment with pair of sequences (that will be immediately aligned) and - * sets their external ids to -1. Such un-annotated pairwise alignment can not be added to MultipleAlignment. - * - */ - public SWPairwiseAlignment(String seq1, String seq2) { - this(seq1,seq2,-1,-1); - } - - public SWPairwiseAlignment(String seq1, String seq2, double match, double mismatch, double open, double extend) { - this(seq1,seq2,-1,-1,match,mismatch,open, extend); + public SWPairwiseAlignment(byte[] seq1, byte[] seq2) { + this(seq1,seq2,1.0,-1.0/3.0,-1.0-1.0/3.0,-1.0/3.0); // match=1, mismatch = -1/3, gap=-(1+k/3) } public Cigar getCigar() { return alignmentCigar ; } public int getAlignmentStart2wrt1() { return alignment_offset; } - public void align(String a, String b) { - int n = a.length(); - int m = b.length(); - int [][] sw = new int[n+1][m+1]; - - // build smith-waterman matrix: - for ( int i = 1 ; i < n+1 ; i++ ) { - char a_base = Character.toUpperCase(a.charAt(i-1)); // letter in a at the current pos - for ( int j = 1 ; j < m+1 ; j++ ) { - char b_base = Character.toUpperCase(b.charAt(j-1)); // letter in b at the current pos - int step_diag = sw[i-1][j-1] + w(a_base,b_base); - int step_down = sw[i-1][j]+w(a_base,'-'); - int step_right = sw[i][j-1]+w('-',b_base); - - sw[i][j] = Math.max(0, Math.max(step_diag,Math.max(step_down,step_right))); - } - } - -// print(sw,a,b); - - PrimitivePair.Int p = new PrimitivePair.Int(); - int maxscore = 0; - int segment_length = 0; // length of the segment (continuous matches, insertions or deletions) - - // look for largest score. we use >= combined with the traversal direction - // to ensure that if two scores are equal, the one closer to diagonal gets picked - for ( int i = 1 ; i < n+1 ; i++ ) { - if ( sw[i][m] >= maxscore ) { p.first = i; p.second = m ; maxscore = sw[i][m]; } - } - for ( int j = 1 ; j < m+1 ; j++ ) { - if ( sw[n][j] > maxscore || - sw[n][j] == maxscore && Math.abs(n-j) < Math.abs(p.first-p.second)) { - p.first = n; - p.second = j ; - maxscore = sw[n][j]; - segment_length = m - j; // end of sequence 2 is overhanging; we will just record it as 'M' segment - } - } - -// System.out.println("\ni="+p.first+"; j="+p.second); - - // p holds the position we start backtracking from; we will be assembling a cigar in the backwards order - - - // we will be placing all insertions and deletions into sequence b, so the state are named w/regard - // to that sequence - - int state = MSTATE; - - int [] scores = new int[3]; - - List lce = new ArrayList(5); - - do { - scores[ISTATE] = sw[p.first][p.second-1]; // moving left: same base on a, prev base on b = insertion on b - scores[DSTATE] = sw[p.first-1][p.second]; // moving up: same base on b, prev base on a = deletion on b - scores[MSTATE] = sw[p.first-1][p.second-1]; // moving diagonal : mathc/mismatch - - int new_state = findMaxInd(scores,MSTATE); - - // move to next best location in the sw matrix: - switch( new_state ) { - case MSTATE: p.first--; p.second--; break; - case ISTATE: p.second--; break; - case DSTATE: p.first--; break; - } - - // now let's see if the state actually changed: - if ( new_state == state ) segment_length++; - else { - // state changed, lets emit previous segment, whatever it was (Insertion Deletion, or (Mis)Match). - CigarOperator o=null; - switch(state) { - case MSTATE: o = CigarOperator.M; break; - case ISTATE: o = CigarOperator.I; break; - case DSTATE: o = CigarOperator.D; break; - } - CigarElement e = new CigarElement(segment_length,o); - lce.add(e); - segment_length = 1; - state = new_state; - } - } while ( scores[state] != 0 ); - - // post-process the last segment we are still keeping - CigarOperator o=null; - - switch(state) { - case MSTATE: o = CigarOperator.M; break; - case ISTATE: o = CigarOperator.I; break; - case DSTATE: o = CigarOperator.D; break; - } - - alignment_offset = p.first - p.second; - segment_length+=p.second; - CigarElement e = new CigarElement(segment_length,o); - lce.add(e); - Collections.reverse(lce); - alignmentCigar = new Cigar(lce); - } - - /** Allows for separate gap opening end extension penalties, no direct backtracking. - * - * @param a - * @param b - */ - public void align2(String a, String b) { - int n = a.length(); - int m = b.length(); - double [][] sw = new double[n+1][m+1]; - - // build smith-waterman matrix: - for ( int i = 1 ; i < n+1 ; i++ ) { - char a_base = Character.toUpperCase(a.charAt(i-1)); // letter in a at the current pos - for ( int j = 1 ; j < m+1 ; j++ ) { - char b_base = Character.toUpperCase(b.charAt(j-1)); // letter in b at the current pos - double step_diag = sw[i-1][j-1] + wd(a_base,b_base); - double step_down = 0.0 ; - for ( int k = 1 ; k < i ; k++ ) step_down = Math.max(step_down,sw[i-k][j]+wk(a_base,'-',k)); - - double step_right = 0; - for ( int k = 1 ; k < j ; k++ ) step_right = Math.max(step_right,sw[i][j-k]+wk('-',b_base,k)); - - sw[i][j] = Math.max(0, Math.max(step_diag,Math.max(step_down,step_right))); - } - } - -// print(sw,a,b); - - PrimitivePair.Int p = new PrimitivePair.Int(); - double maxscore = 0.0; - int segment_length = 0; // length of the segment (continuous matches, insertions or deletions) - - // look for largest score. we use >= combined with the traversal direction - // to ensure that if two scores are equal, the one closer to diagonal gets picked - for ( int i = 1 ; i < n+1 ; i++ ) { - if ( sw[i][m] >= maxscore ) { p.first = i; p.second = m ; maxscore = sw[i][m]; } - } - for ( int j = 1 ; j < m+1 ; j++ ) { - if ( sw[n][j] > maxscore || - sw[n][j] == maxscore && Math.abs(n-j) < Math.abs(p.first-p.second)) { - p.first = n; - p.second = j ; - maxscore = sw[n][j]; - segment_length = m - j; // end of sequence 2 is overhanging; we will just record it as 'M' segment - } - } - -// System.out.println("\ni="+p.first+"; j="+p.second); - - // p holds the position we start backtracking from; we will be assembling a cigar in the backwards order - - - // we will be placing all insertions and deletions into sequence b, so the state are named w/regard - // to that sequence - - int state = MSTATE; - - double [] scores = new double[3]; - - List lce = new ArrayList(5); - - do { - // moving left: same base on a, prev base on b = insertion on b: - scores[ISTATE] = sw[p.first][p.second-1] ; - scores[DSTATE] = sw[p.first - 1][p.second]; - scores[MSTATE] = sw[p.first-1][p.second-1]; // moving diagonal : match/mismatch - - // System.out.println("i = " + p.first + " ; j = " + p.second); - // System.out.println("s(M)="+scores[MSTATE]+"; s(D)="+scores[DSTATE]+"; s(I)=" + scores[ISTATE]); - int new_state = findMaxInd(scores,MSTATE); - - // move to next best location in the sw matrix: - switch( new_state ) { - case MSTATE: p.first--; p.second--; break; - case ISTATE: p.second--; break; - case DSTATE: p.first--; break; - } - - // now let's see if the state actually changed: - if ( new_state == state ) segment_length++; - else { - // state changed, lets emit previous segment, whatever it was (Insertion Deletion, or (Mis)Match). - CigarOperator o=null; - switch(state) { - case MSTATE: o = CigarOperator.M; break; - case ISTATE: o = CigarOperator.I; break; - case DSTATE: o = CigarOperator.D; break; - } - CigarElement e = new CigarElement(segment_length,o); - lce.add(e); - segment_length = 1; - state = new_state; - } - } while ( scores[state] != 0 ); - - // post-process the last segment we are still keeping - CigarOperator o=null; - switch(state) { - case MSTATE: o = CigarOperator.M; break; - case ISTATE: o = CigarOperator.I; break; - case DSTATE: o = CigarOperator.D; break; - } - alignment_offset = p.first - p.second; - segment_length+=p.second; - CigarElement e = new CigarElement(segment_length,o); - lce.add(e); - Collections.reverse(lce); - alignmentCigar = new Cigar(lce); - } - - - - - /** Allows for separate gap opening and extension penalties, with backtracking. - * - * @param a - * @param b - */ -public void align3(String a, String b) { - int n = a.length(); - int m = b.length(); - double [][] sw = new double[n+1][m+1]; - - int [][] btrack = new int[n+1][m+1]; - - // build smith-waterman matrix and keep backtrack info: - for ( int i = 1 ; i < n+1 ; i++ ) { - char a_base = Character.toUpperCase(a.charAt(i-1)); // letter in a at the current pos - for ( int j = 1 ; j < m+1 ; j++ ) { - char b_base = Character.toUpperCase(b.charAt(j-1)); // letter in b at the current pos - double step_diag = sw[i-1][j-1] + wd(a_base,b_base); - double step_down = 0.0 ; - int kd = 0; - for ( int k = 1 ; k < i ; k++ ) { - if ( step_down < sw[i-k][j]+wk(a_base,'-',k) ) { - step_down=sw[i-k][j]+wk(a_base,'-',k); - kd = k; - } - } - - double step_right = 0; - int ki = 0; - for ( int k = 1 ; k < j ; k++ ) { - if ( step_right < sw[i][j-k]+wk('-',b_base,k) ) { - step_right=sw[i][j-k]+wk('-',b_base,k); - ki = k; - } - } - - if ( step_down > step_right ) { - if ( step_down > step_diag ) { - sw[i][j] = Math.max(0,step_down); - btrack[i][j] = kd; // positive=vertical - } - else { - sw[i][j] = Math.max(0,step_diag); - btrack[i][j] = 0; // 0 = diagonal - } - } else { - // step_down < step_right - if ( step_right > step_diag ) { - sw[i][j] = Math.max(0,step_right); - btrack[i][j] = -ki; // negative = horizontal - } else { - sw[i][j] = Math.max(0,step_diag); - btrack[i][j] = 0; // 0 = diagonal - } - } - sw[i][j] = Math.max(0, Math.max(step_diag,Math.max(step_down,step_right))); - } - } - -// print(sw,a,b); - - PrimitivePair.Int p = new PrimitivePair.Int(); - double maxscore = 0.0; - int segment_length = 0; // length of the segment (continuous matches, insertions or deletions) - - // look for largest score. we use >= combined with the traversal direction - // to ensure that if two scores are equal, the one closer to diagonal gets picked - for ( int i = 1 ; i < n+1 ; i++ ) { - if ( sw[i][m] >= maxscore ) { p.first = i; p.second = m ; maxscore = sw[i][m]; } - } - for ( int j = 1 ; j < m+1 ; j++ ) { - if ( sw[n][j] > maxscore || - sw[n][j] == maxscore && Math.abs(n-j) < Math.abs(p.first-p.second)) { - p.first = n; - p.second = j ; - maxscore = sw[n][j]; - segment_length = m - j ; // end of sequence 2 is overhanging; we will just record it as 'M' segment - } - } - - System.out.println("\ni="+p.first+"; j="+p.second); - - // p holds the position we start backtracking from; we will be assembling a cigar in the backwards order - - - // we will be placing all insertions and deletions into sequence b, so the state are named w/regard - // to that sequence - - int state = MSTATE; - - double [] scores = new double[3]; - - List lce = new ArrayList(5); - - do { - - int btr = btrack[p.first][p.second]; - int step_left = ( btr < 0 ? -btr : 1); - int step_up = ( btr > 0 ? btr : 1 ); - - // moving left: same base on a, prev base on b = insertion on b: - scores[ISTATE] = sw[p.first][p.second-step_left] ; - scores[DSTATE] = sw[p.first - step_up][p.second]; - scores[MSTATE] = sw[p.first-1][p.second-1]; // moving diagonal : match/mismatch - - // System.out.println("i = " + p.first + " ; j = " + p.second); - // System.out.println("s(M)="+scores[MSTATE]+"; s(D)="+scores[DSTATE]+"; s(I)=" + scores[ISTATE]); - int new_state = findMaxInd(scores,MSTATE); - - int step_length = 1; - - // move to next best location in the sw matrix: - switch( new_state ) { - case MSTATE: p.first--; p.second--; break; - case ISTATE: p.second-=step_left; step_length = step_left; break; - case DSTATE: p.first-=step_up; step_length = step_up; break; - } - - // now let's see if the state actually changed: - if ( new_state == state ) segment_length+=step_length; - else { - // state changed, lets emit previous segment, whatever it was (Insertion Deletion, or (Mis)Match). - CigarOperator o=null; - switch(state) { - case MSTATE: o = CigarOperator.M; break; - case ISTATE: o = CigarOperator.I; break; - case DSTATE: o = CigarOperator.D; break; - } - CigarElement e = new CigarElement(segment_length,o); - lce.add(e); - segment_length = step_length; - state = new_state; - } - } while ( scores[state] != 0 ); - - // post-process the last segment we are still keeping - CigarOperator o=null; - switch(state) { - case MSTATE: o = CigarOperator.M; break; - case ISTATE: o = CigarOperator.I; break; - case DSTATE: o = CigarOperator.D; break; - } - alignment_offset = p.first - p.second; - segment_length+=p.second; - CigarElement e = new CigarElement(segment_length,o); - lce.add(e); - Collections.reverse(lce); - alignmentCigar = new Cigar(lce); - } - - public void align4(String a, String b) { - int n = a.length(); - int m = b.length(); + public void align(final byte[] a, final byte[] b) { + int n = a.length; + int m = b.length; double [][] sw = new double[n+1][m+1]; int [][] btrack = new int[n+1][m+1]; // build smith-waterman matrix and keep backtrack info: for ( int i = 1 ; i < n+1 ; i++ ) { - char a_base = Character.toUpperCase(a.charAt(i-1)); // letter in a at the current pos + byte a_base = a[i-1]; // letter in a at the current pos for ( int j = 1 ; j < m+1 ; j++ ) { - char b_base = Character.toUpperCase(b.charAt(j-1)); // letter in b at the current pos + byte b_base = b[j-1]; // letter in b at the current pos double step_diag = sw[i-1][j-1] + wd(a_base,b_base); double step_down = 0.0 ; int kd = 0; for ( int k = 1 ; k < i ; k++ ) { - if ( step_down < sw[i-k][j]+wk(a_base,'-',k) ) { - step_down=sw[i-k][j]+wk(a_base,'-',k); + if ( step_down < sw[i-k][j]+wk(k) ) { + step_down=sw[i-k][j]+wk(k); kd = k; } } @@ -457,8 +73,8 @@ public void align3(String a, String b) { double step_right = 0; int ki = 0; for ( int k = 1 ; k < j ; k++ ) { - if ( step_right < sw[i][j-k]+wk('-',b_base,k) ) { - step_right=sw[i][j-k]+wk('-',b_base,k); + if ( step_right < sw[i][j-k]+wk(k) ) { + step_right=sw[i][j-k]+wk(k); ki = k; } } @@ -517,8 +133,6 @@ public void align3(String a, String b) { int state = MSTATE; - double [] scores = new double[3]; - List lce = new ArrayList(5); do { @@ -527,7 +141,7 @@ public void align3(String a, String b) { int step_left = ( btr < 0 ? -btr : 1); int step_up = ( btr > 0 ? btr : 1 ); - int new_state = -1; + int new_state; if ( btr > 0 ) new_state = DSTATE; else if ( btr < 0 ) new_state = ISTATE; else new_state = MSTATE; @@ -573,51 +187,13 @@ public void align3(String a, String b) { alignmentCigar = new Cigar(lce); } - private int w(char x, char y) { - if ( x == y ) return 2; // match - if ( x == '-' || y == '-' ) return -1; // gap - return -1; // mismatch - } - - private double wd ( char x, char y ) { + private double wd(byte x, byte y) { if ( x== y ) return w_match; else return w_mismatch; } - private double wk(char x, char y, int k) { + private double wk(int k) { return w_open+(k-1)*w_extend; // gap - // return -1.0 ; // no extension penalty - // return -1.0-Math.log(k+1); // weak extension penalty - } - - /** Returns index of the maximum element in array s. If there is a tie, and one of the tied indices is - * pref_id, then it will be preferred and returned. - * @param s - * @param pref_id - * @return - */ - private int findMaxInd(int[] s, int pref_id) { - int imax = 0; - int maxval = s[0]; - for ( int i = 1; i < s.length ; i++ ) { - if ( s[i] > maxval || i == pref_id && Math.abs(s[i] - maxval) < 0.0001 ) { - imax = i; - maxval = s[i]; - } - } - return imax; - } - - private int findMaxInd(double[] s, int pref_id) { - int imax = 0; - double maxval = s[0]; - for ( int i = 1; i < s.length ; i++ ) { - if ( s[i] > maxval + 0.0001 || i == pref_id && Math.abs(s[i] - maxval) < 0.0001 ) { - imax = i; - maxval = s[i]; - } - } - return imax; } private void print(int[][] s) { @@ -673,137 +249,4 @@ public void align3(String a, String b) { } } - public String toString() { - StringBuilder bmm = new StringBuilder(); - StringBuilder b1 = new StringBuilder(); - StringBuilder b2 = new StringBuilder(); - - int pos1 = 0; - int pos2 = 0; - if ( alignment_offset < 0 ) { - for ( ; pos2 < -alignment_offset ; pos2++ ) { - b1.append(' '); - b2.append(s2.charAt(pos2)); - bmm.append(' '); - } - // now pos2 = -alignment_offset; - } else { - for ( ; pos1 < alignment_offset ; pos1++ ) { - b2.append(' '); - b1.append(s1.charAt(pos1)); - bmm.append(' '); - } - // now pos1 = alignment_offset - } -/* debug prints: */ -// System.out.println(AlignmentUtils.toString(getCigar())); -// System.out.println("seq1l="+s1.length()+"; seq2l=" + s2.length()); -// System.out.println("offset="+alignment_offset); -// System.out.println("pos1="+pos1+"; pos2=" + pos2); -/**/ - for ( int i = 0 ; i < getCigar().numCigarElements() ; i++ ) { - CigarElement ce = getCigar().getCigarElement(i) ; - switch( ce.getOperator() ) { - case M: - int z = ( i == 0 ? pos2 : 0); // if we are in the first element and seq overhangs to the left, - // start inside the first segment, at the position where actual matches begin - // check separately for pos1 < s1.length() since seq2 is allowed to overhang beyond seq1's end - for ( ; z < ce.getLength() && pos1 < s1.length() ; z++ ) { -// System.out.println("pos1="+pos1+"; pos2="+pos2+"; k="+z); - if ( Character.toUpperCase(s1.charAt(pos1)) != - Character.toUpperCase(s2.charAt(pos2)) ) bmm.append('*'); - else bmm.append(' '); - b1.append(s1.charAt(pos1++)); - b2.append(s2.charAt(pos2++)); - } - break; - case I: - for ( int k = 0 ; k < ce.getLength() ; k++ ) { - b1.append('+'); - bmm.append('+'); - b2.append(s2.charAt(pos2++)); - } - break; - case D: - for ( int k = 0 ; k < ce.getLength() ; k++ ) { - b1.append(s1.charAt(pos1++)); - bmm.append('-'); - b2.append('-'); - } - break; - } - } - - bmm.append('\n'); - b1.append(s1,pos1,s1.length()); - bmm.append(b1); - bmm.append('\n'); - b2.append(s2,pos2,s2.length()); - bmm.append(b2); - bmm.append('\n'); - - return bmm.toString(); - } - - public static void testMe() { - /* String s1 = "ACCTGGTGTATATAGGGTAAGGCTGAT"; - String s2 = "TGTATATAGGGTAAGG"; - - testMe(s1,s2); - - s1 = "GGTAAGGC"; - s2 = "GGTCTCAA"; - - testMe(s1,s2); - - s1 = "ACCTGGTGTATATAGGGTAAGGCTGAT"; - s2 = "TGTTAGGGTCTCAAGG"; - testMe(s1,s2); - - - s1 = "ACCTGGTGTATATAGGGTAAGGCTGAT"; - s2 = "TAGGGTAAGGCTGATCCATGTACCG" ; - testMe(s1,s2); - - s1 = "ACCTGGTGTATATAGGGTAAGGCTGAT"; - s2 = "CCGTATCATTACCTGGTGTATATAGG"; - testMe(s1,s2); - - s1 = "GGTGTATATAGGGT" ; - s2 = "TGTTAGGG"; - testMe(s1,s2); - - s1 = "AGACAGAGAGAAGG"; - s2 = "AGACAGAGAAGG"; - testMe(s1,s2); - */ - // String s1 = "CCAGCACACAGGTATCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTGTTTTTTGA"; - // String s2 = "CCAGCACACATCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTGTTTTTTGA"; - -// String s1 = "CCCATCTGTCTCCAATCTGCTGTTTTCCAAAAATTAGGGAACTTCAGTTTTCCCTTTGATACTCTGTATTTCTACCAACCACAACGCCAGGGCTGTCCTGCTTCTACAAGTGACAATGACAAATATAGGCCTGAAGGAAGATG"; -// String s2 = "AAAATCTGTTTCCAATCTACTGTTTTCCAAAAATTAGGGAAGTTCAGTTTTCCCTTTGATACTCTGTTTCTACCAATCC"; - String s1 = "CCCATCTGTCTCCAATCTGCTGTTTTCCAAAAATTAGGGAACTTCAGTTTTCCCTTTGATACTCTGTATTTCTACCAACCACAACGCCAGGGCTGTCCTGCTTCTACAAGTGACAATGACAAATATAGGCCTGAAGGAAGATG"; - String s2 = "AAAATCTGTCTCCAATCTACTGTTTTCCAAAAATTAGGGAAGTTCAGTTTTCCCTTTGATACTCTGTTTCTACCAATCC"; - testMe(s1,s2); - } - - public static void testMe(String s1, String s2) { - - SWPairwiseAlignment swpa = new SWPairwiseAlignment(s1,s2,3.0,-1.0,-4,-0.5); - - System.out.println(AlignmentUtils.toString(swpa.getCigar())); -// SequencePile sp = new SequencePile(s1); -// sp.addAlignedSequence(s2,false,swpa.getCigar(),swpa.getAlignmentStart2wrt1()); -// System.out.println(); -// System.out.println(sp.format()); - - System.out.println("--------\n"+swpa.toString()); - - //sp.colorprint(false); - } - - public static void main(String argv[]) { - if ( argv.length > 0 ) testMe(argv[0],argv[1]); - else testMe(); - } }