diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/indels/IntervalCleanerWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/indels/IntervalCleanerWalker.java index 755fa4ab2..6006e664b 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/indels/IntervalCleanerWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/indels/IntervalCleanerWalker.java @@ -75,8 +75,6 @@ public class IntervalCleanerWalker extends LocusWindowWalker readsToWrite = new TreeSet(); } - logger.info("Writing into output BAM file"); - logger.info("Temporary space used: "+System.getProperty("java.io.tmpdir")); generator = new Random(RANDOM_SEED); if ( OUT_INDELS != null ) { @@ -190,20 +188,20 @@ public class IntervalCleanerWalker extends LocusWindowWalker private static int mismatchQualitySumIgnoreCigar(AlignedRead aRead, String refSeq, int refIndex) { - String readSeq = aRead.getReadString(); - String quals = aRead.getBaseQualityString(); + byte[] readSeq = aRead.getRead().getReadBases(); + byte[] quals = aRead.getRead().getBaseQualities(); int sum = 0; - for (int readIndex = 0 ; readIndex < readSeq.length() ; refIndex++, readIndex++ ) { + for (int readIndex = 0 ; readIndex < readSeq.length ; refIndex++, readIndex++ ) { if ( refIndex >= refSeq.length() ) sum += MAX_QUAL; else { char refChr = refSeq.charAt(refIndex); - char readChr = readSeq.charAt(readIndex); + char readChr = (char)readSeq[readIndex]; if ( BaseUtils.simpleBaseToBaseIndex(readChr) == -1 || BaseUtils.simpleBaseToBaseIndex(refChr) == -1 ) continue; // do not count Ns/Xs/etc ? if ( Character.toUpperCase(readChr) != Character.toUpperCase(refChr) ) - sum += (int)quals.charAt(readIndex) - 33; + sum += (int)quals[readIndex]; } } return sum; @@ -263,7 +261,7 @@ public class IntervalCleanerWalker extends LocusWindowWalker aRead.setMismatchScoreToReference(mismatchScore); // if it has an indel, let's see if that's the best consensus if ( numBlocks == 2 ) - altConsenses.add(createAlternateConsensus(aRead.getAlignmentStart() - (int)leftmostIndex, aRead.getCigar(), reference, aRead.getReadString())); + altConsenses.add(createAlternateConsensus(aRead.getAlignmentStart() - (int)leftmostIndex, aRead.getCigar(), reference, aRead.getRead().getReadBases())); else { // if ( debugOn ) System.out.println("Going to test..."); altAlignmentsToTest.add(aRead); @@ -280,8 +278,8 @@ public class IntervalCleanerWalker extends LocusWindowWalker if ( altAlignmentsToTest.size() <= MAX_READS_FOR_CONSENSUSES ) { for ( AlignedRead aRead : altAlignmentsToTest ) { // do a pairwise alignment against the reference - SWPairwiseAlignment swConsensus = new SWPairwiseAlignment(reference, aRead.getReadString(), SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND); - Consensus c = createAlternateConsensus(swConsensus.getAlignmentStart2wrt1(), swConsensus.getCigar(), reference, aRead.getReadString()); + SWPairwiseAlignment swConsensus = new SWPairwiseAlignment(reference, aRead.getRead().getReadString(), SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND); + Consensus c = createAlternateConsensus(swConsensus.getAlignmentStart2wrt1(), swConsensus.getCigar(), reference, aRead.getRead().getReadBases()); if ( c != null) { // if ( debugOn ) System.out.println("NEW consensus generated by SW: "+c.str ) ; altConsenses.add(c); @@ -296,8 +294,8 @@ public class IntervalCleanerWalker extends LocusWindowWalker int index = generator.nextInt(altAlignmentsToTest.size()); AlignedRead aRead = altAlignmentsToTest.remove(index); // do a pairwise alignment against the reference - SWPairwiseAlignment swConsensus = new SWPairwiseAlignment(reference, aRead.getReadString(), SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND); - Consensus c = createAlternateConsensus(swConsensus.getAlignmentStart2wrt1(), swConsensus.getCigar(), reference, aRead.getReadString()); + SWPairwiseAlignment swConsensus = new SWPairwiseAlignment(reference, aRead.getRead().getReadString(), SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND); + Consensus c = createAlternateConsensus(swConsensus.getAlignmentStart2wrt1(), swConsensus.getCigar(), reference, aRead.getRead().getReadBases()); if ( c != null) altConsenses.add(c); } @@ -326,14 +324,14 @@ public class IntervalCleanerWalker extends LocusWindowWalker else consensus.readIndexes.add(new Pair(j, altAlignment.first)); - logger.debug(consensus.str + " vs. " + toTest.getReadString() + " => " + myScore + " - " + altAlignment.first); + //logger.debug(consensus.str + " vs. " + toTest.getRead().getReadString() + " => " + myScore + " - " + altAlignment.first); consensus.mismatchSum += myScore; } - logger.debug(consensus.str + " " + consensus.mismatchSum); + //logger.debug(consensus.str + " " + consensus.mismatchSum); if ( bestConsensus == null || bestConsensus.mismatchSum > consensus.mismatchSum) { bestConsensus = consensus; - logger.debug(consensus.str + " " + consensus.mismatchSum); + //logger.debug(consensus.str + " " + consensus.mismatchSum); } } @@ -349,7 +347,7 @@ public class IntervalCleanerWalker extends LocusWindowWalker AlignedRead aRead = altReads.get(indexPair.first); updateRead(bestConsensus.cigar, bestConsensus.positionOnReference, indexPair.second, aRead, (int)leftmostIndex); } - if( !alternateReducesEntropy(altReads, reference, leftmostIndex) ) { + if ( !alternateReducesEntropy(altReads, reference, leftmostIndex) ) { if ( statsOutput != null ) { try { statsOutput.write(interval.toString()); @@ -360,11 +358,11 @@ public class IntervalCleanerWalker extends LocusWindowWalker } catch (Exception e) {} } } else { - logger.debug("CLEAN: " + AlignmentUtils.cigarToString(bestConsensus.cigar) + " " + bestConsensus.str ); + //logger.debug("CLEAN: " + AlignmentUtils.cigarToString(bestConsensus.cigar) + " " + bestConsensus.str ); if ( indelOutput != null && bestConsensus.cigar.numCigarElements() > 1 ) { // NOTE: indels are printed out in the format specified for the low-coverage pilot1 // indel calls (tab-delimited): chr position size type sequence - StringBuffer str = new StringBuffer(); + StringBuilder str = new StringBuilder(); str.append(reads.get(0).getReferenceName()); int position = bestConsensus.positionOnReference + bestConsensus.cigar.getCigarElement(0).getLength(); str.append("\t" + (leftmostIndex + position - 1)); @@ -436,14 +434,14 @@ public class IntervalCleanerWalker extends LocusWindowWalker } } - private Consensus createAlternateConsensus(int indexOnRef, Cigar c, String reference, String readStr) { + private Consensus createAlternateConsensus(int indexOnRef, Cigar c, String reference, byte[] readStr) { if ( indexOnRef < 0 ) return null; // create the new consensus - StringBuffer sb = new StringBuffer(); + StringBuilder sb = new StringBuilder(); sb.append(reference.substring(0, indexOnRef)); - logger.debug("CIGAR = " + AlignmentUtils.cigarToString(c)); + //logger.debug("CIGAR = " + AlignmentUtils.cigarToString(c)); int indelCount = 0; int altIdx = 0; @@ -451,22 +449,24 @@ public class IntervalCleanerWalker extends LocusWindowWalker boolean ok_flag = true; for ( int i = 0 ; i < c.numCigarElements() ; i++ ) { CigarElement ce = c.getCigarElement(i); + int elementLength = ce.getLength(); switch( ce.getOperator() ) { case D: indelCount++; - refIdx += ce.getLength(); + refIdx += elementLength; break; case M: - if ( reference.length() < refIdx+ce.getLength() ) + if ( reference.length() < refIdx + elementLength ) ok_flag = false; else - sb.append(reference.substring(refIdx, refIdx+ce.getLength())); - refIdx += ce.getLength(); - altIdx += ce.getLength(); + sb.append(reference.substring(refIdx, refIdx + elementLength)); + refIdx += elementLength; + altIdx += elementLength; break; case I: - sb.append(readStr.substring(altIdx, altIdx+ce.getLength())); - altIdx += ce.getLength(); + for (int j = 0; j < elementLength; j++) + sb.append((char)readStr[altIdx + j]); + altIdx += elementLength; indelCount++; break; } @@ -582,9 +582,12 @@ public class IntervalCleanerWalker extends LocusWindowWalker private boolean alternateReducesEntropy(List reads, String reference, long leftmostIndex) { int[] originalMismatchBases = new int[reference.length()]; int[] cleanedMismatchBases = new int[reference.length()]; - int[] totalBases = new int[reference.length()]; + int[] totalOriginalBases = new int[reference.length()]; + int[] totalCleanedBases = new int[reference.length()]; + + // set to 1 to prevent dividing by zero for ( int i=0; i < reference.length(); i++ ) - originalMismatchBases[i] = totalBases[i] = 0; + originalMismatchBases[i] = totalOriginalBases[i] = cleanedMismatchBases[i] = totalCleanedBases[i] = 1; for (int i=0; i < reads.size(); i++) { AlignedRead read = reads.get(i); @@ -592,18 +595,18 @@ public class IntervalCleanerWalker extends LocusWindowWalker continue; int refIdx = read.getOriginalAlignmentStart() - (int)leftmostIndex; - String readStr = read.getReadString(); - String qualStr = read.getBaseQualityString(); + byte[] readStr = read.getRead().getReadBases(); + byte[] quals = read.getRead().getBaseQualities(); - for (int j=0; j < readStr.length(); j++, refIdx++ ) { + for (int j=0; j < readStr.length; j++, refIdx++ ) { if ( refIdx < 0 || refIdx >= reference.length() ) { //System.out.println( "Read: "+read.getRead().getReadName() + "; length = " + readStr.length() ); //System.out.println( "Ref left: "+ leftmostIndex +"; ref length=" + reference.length() + "; read alignment start: "+read.getOriginalAlignmentStart() ); break; } - totalBases[refIdx] += (int)qualStr.charAt(j) - 33; - if ( Character.toUpperCase(readStr.charAt(j)) != Character.toUpperCase(reference.charAt(refIdx)) ) - originalMismatchBases[refIdx] += (int)qualStr.charAt(j) - 33; + totalOriginalBases[refIdx] += quals[j]; + if ( Character.toUpperCase((char)readStr[j]) != Character.toUpperCase(reference.charAt(refIdx)) ) + originalMismatchBases[refIdx] += quals[j]; } // reset and now do the calculation based on the cleaning @@ -612,18 +615,22 @@ public class IntervalCleanerWalker extends LocusWindowWalker Cigar c = read.getCigar(); for (int j = 0 ; j < c.numCigarElements() ; j++) { CigarElement ce = c.getCigarElement(j); + int elementLength = ce.getLength(); switch ( ce.getOperator() ) { case M: - for (int k = 0 ; k < ce.getLength() ; k++, refIdx++, altIdx++ ) { - if ( refIdx < reference.length() && Character.toUpperCase(readStr.charAt(altIdx)) != Character.toUpperCase(reference.charAt(refIdx)) ) - cleanedMismatchBases[refIdx] += (int)qualStr.charAt(altIdx) - 33; + for (int k = 0 ; k < elementLength ; k++, refIdx++, altIdx++ ) { + if ( refIdx >= reference.length() ) + break; + totalCleanedBases[refIdx] += quals[altIdx]; + if ( Character.toUpperCase((char)readStr[altIdx]) != Character.toUpperCase(reference.charAt(refIdx)) ) + cleanedMismatchBases[refIdx] += quals[altIdx]; } break; case I: - altIdx += ce.getLength(); + altIdx += elementLength; break; case D: - refIdx += ce.getLength(); + refIdx += elementLength; break; } @@ -631,19 +638,19 @@ public class IntervalCleanerWalker extends LocusWindowWalker } int originalMismatchColumns = 0, cleanedMismatchColumns = 0; - StringBuffer sb = new StringBuffer(); + StringBuilder sb = new StringBuilder(); for ( int i=0; i < reference.length(); i++ ) { if ( cleanedMismatchBases[i] == originalMismatchBases[i] ) continue; boolean didMismatch = false, stillMismatches = false; - if ( originalMismatchBases[i] > totalBases[i] * MISMATCH_THRESHOLD ) { + if ( originalMismatchBases[i] > totalOriginalBases[i] * MISMATCH_THRESHOLD ) { didMismatch = true; originalMismatchColumns++; - if ( cleanedMismatchBases[i] > originalMismatchBases[i] * (1.0 - MISMATCH_COLUMN_CLEANED_FRACTION) ) { + if ( (cleanedMismatchBases[i] / totalCleanedBases[i]) > (originalMismatchBases[i] / totalOriginalBases[i]) * (1.0 - MISMATCH_COLUMN_CLEANED_FRACTION) ) { stillMismatches = true; cleanedMismatchColumns++; } - } else if ( cleanedMismatchBases[i] > totalBases[i] * MISMATCH_THRESHOLD ) { + } else if ( cleanedMismatchBases[i] > totalCleanedBases[i] * MISMATCH_THRESHOLD ) { cleanedMismatchColumns++; } if ( snpsOutput != null ) { @@ -658,7 +665,7 @@ public class IntervalCleanerWalker extends LocusWindowWalker } } - logger.debug("Original mismatch columns = " + originalMismatchColumns + "; cleaned mismatch columns = " + cleanedMismatchColumns); + //logger.debug("Original mismatch columns = " + originalMismatchColumns + "; cleaned mismatch columns = " + cleanedMismatchColumns); boolean reduces = (originalMismatchColumns == 0 || cleanedMismatchColumns < originalMismatchColumns); if ( reduces && snpsOutput != null ) { @@ -774,11 +781,8 @@ public class IntervalCleanerWalker extends LocusWindowWalker newCigar.add(new CigarElement(ce1.getLength()-difference, CigarOperator.M)); newCigar.add(ce2); newCigar.add(new CigarElement(cigar.getCigarElement(2).getLength()+difference, CigarOperator.M)); - // System.out.println(" FROM:\n"+AlignmentUtils.alignmentToString(cigar,readSeq,refSeq,refIndex)); - // if ( ce2.getLength() >=2 ) - // System.out.println(" REALIGNED TO:\n"+AlignmentUtils.alignmentToString(newCigar,readSeq,refSeq,refIndex,readIndex)+"\n"); - logger.debug("Realigning indel: " + AlignmentUtils.cigarToString(cigar) + " to " + AlignmentUtils.cigarToString(newCigar)); + //logger.debug("Realigning indel: " + AlignmentUtils.cigarToString(cigar) + " to " + AlignmentUtils.cigarToString(newCigar)); cigar = newCigar; } @@ -801,10 +805,6 @@ public class IntervalCleanerWalker extends LocusWindowWalker return read; } - public String getReadString() { - return read.getReadString(); - } - public int getReadLength() { return read.getReadLength(); } @@ -875,10 +875,6 @@ public class IntervalCleanerWalker extends LocusWindowWalker return updated; } - public String getBaseQualityString() { - return read.getBaseQualityString(); - } - public void setMismatchScoreToReference(int score) { mismatchScoreToReference = score; } @@ -911,110 +907,4 @@ public class IntervalCleanerWalker extends LocusWindowWalker return ( this == c || this.str.equals(c.str) ); } } - - private void testCleanWithInsertion() { - String reference = "AAAAAACCCCCCAAAAAA"; - // the alternate reference is: "AAAAAACCCTTCCCAAAAAA"; - ArrayList reads = new ArrayList(); - SAMFileHeader header = getToolkit().getSAMFileHeader(); - SAMRecord r1 = new SAMRecord(header); - r1.setReadName("1"); - r1.setReadString("AACCCCCC"); - r1.setAlignmentStart(4); - r1.setBaseQualityString("BBBBBBBB"); - SAMRecord r2 = new SAMRecord(header); - r2.setReadName("2"); - r2.setReadString("AAAACCCT"); - r2.setAlignmentStart(2); - r2.setBaseQualityString("BBBBBBBB"); - SAMRecord r3 = new SAMRecord(header); - r3.setReadName("3"); - r3.setReadString("CTTC"); - r3.setAlignmentStart(10); - r3.setBaseQualityString("BBBB"); - SAMRecord r4 = new SAMRecord(header); - r4.setReadName("4"); - r4.setReadString("TCCCAA"); - r4.setAlignmentStart(8); - r4.setBaseQualityString("BBBBBB"); - SAMRecord r5 = new SAMRecord(header); - r5.setReadName("5"); - r5.setReadString("AAAGAACC"); - r5.setAlignmentStart(0); - r5.setBaseQualityString("BBBBBBBB"); - SAMRecord r6 = new SAMRecord(header); - r6.setReadName("6"); - r6.setReadString("CCAAAGAA"); - r6.setAlignmentStart(10); - r6.setBaseQualityString("BBBBBBBB"); - SAMRecord r7 = new SAMRecord(header); - r7.setReadName("7"); - r7.setReadString("AACCCTTCCC"); - r7.setAlignmentStart(4); - r7.setBaseQualityString("BBBBBBBBBB"); - reads.add(r1); - reads.add(r2); - reads.add(r3); - reads.add(r4); - reads.add(r5); - reads.add(r6); - reads.add(r7); - clean(reads, reference, GenomeLocParser.createGenomeLoc(0,0)); - } - - private void testCleanWithDeletion() { - String reference = "AAAAAACCCTTCCCAAAAAA"; - // the alternate reference is: "AAAAAACCCCCCAAAAAA"; - ArrayList reads = new ArrayList(); - SAMFileHeader header = getToolkit().getSAMFileHeader(); - SAMRecord r1 = new SAMRecord(header); - r1.setReadName("1"); - r1.setReadString("ACCCTTCC"); - r1.setAlignmentStart(5); - r1.setBaseQualityString("BBBBBBBB"); - SAMRecord r2 = new SAMRecord(header); - r2.setReadName("2"); - r2.setReadString("AAAACCCC"); - r2.setAlignmentStart(2); - r2.setBaseQualityString("BBBBBBBB"); - SAMRecord r3 = new SAMRecord(header); - r3.setReadName("3"); - r3.setReadString("CCCC"); - r3.setAlignmentStart(6); - r3.setBaseQualityString("BBBB"); - SAMRecord r4 = new SAMRecord(header); - r4.setReadName("4"); - r4.setReadString("CCCCAA"); - r4.setAlignmentStart(10); - r4.setBaseQualityString("BBBBBB"); - SAMRecord r5 = new SAMRecord(header); - r5.setReadName("5"); - r5.setReadString("AAAGAACC"); - r5.setAlignmentStart(0); - r5.setBaseQualityString("BBBBBBBB"); - SAMRecord r6 = new SAMRecord(header); - r6.setReadName("6"); - r6.setReadString("CCAAAGAA"); - r6.setAlignmentStart(10); - r6.setBaseQualityString("BBBBBBBB"); - SAMRecord r7 = new SAMRecord(header); - r7.setReadName("7"); - r7.setReadString("AAAACCCG"); - r7.setAlignmentStart(2); - r7.setBaseQualityString("BBBBBBBB"); - SAMRecord r8 = new SAMRecord(header); - r8.setReadName("8"); - r8.setReadString("AACCCCCC"); - r8.setAlignmentStart(4); - r8.setBaseQualityString("BBBBBBBB"); - reads.add(r1); - reads.add(r2); - reads.add(r3); - reads.add(r4); - reads.add(r5); - reads.add(r6); - reads.add(r7); - reads.add(r8); - clean(reads, reference, GenomeLocParser.createGenomeLoc(0,0)); - } } diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/indels/IntervalCleanerIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/indels/IntervalCleanerIntegrationTest.java index ebb014c37..08d1675fa 100755 --- a/java/test/org/broadinstitute/sting/gatk/walkers/indels/IntervalCleanerIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/indels/IntervalCleanerIntegrationTest.java @@ -9,21 +9,21 @@ public class IntervalCleanerIntegrationTest extends WalkerTest { @Test public void testIntervals() { - String[] md5lod5 = {"163d2f1b04741986151636d3c7e04c94", "4aa3637e86822c95af3e2c9b414530c3"}; + String[] md5lod5 = {"042c1c2649a51a260bc204ec5f256c1a", "460631e8d98644dcf53b3045ca40f02a"}; WalkerTestSpec spec1 = new WalkerTestSpec( "-T IntervalCleaner -LOD 5 -maxConsensuses 100 -greedy 100 -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.chrom1.SLX.SRP000032.2009_06.bam -L /humgen/gsa-scr1/GATK_Data/Validation_Data/cleaner.test.intervals --OutputCleaned %s -snps %s", 2, Arrays.asList(md5lod5)); executeTest("testLod5", spec1); - String[] md5lod200 = {"1d89ee2af03df79eb5de494c77767221", "e39aa718c9810364ebe30964d878d5ff"}; + String[] md5lod200 = {"1d89ee2af03df79eb5de494c77767221", "6137bf0c25c7972b07b0d3fc6979cf5b"}; WalkerTestSpec spec2 = new WalkerTestSpec( "-T IntervalCleaner -LOD 200 -maxConsensuses 100 -greedy 100 -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.chrom1.SLX.SRP000032.2009_06.bam -L /humgen/gsa-scr1/GATK_Data/Validation_Data/cleaner.test.intervals --OutputCleaned %s -snps %s", 2, Arrays.asList(md5lod200)); executeTest("testLod200", spec2); - String[] md5cleanedOnly = {"710f9114ec496c73f1c3782b5cb09757", "4aa3637e86822c95af3e2c9b414530c3"}; + String[] md5cleanedOnly = {"168c8d02fceb107477381b93d189fe1f", "460631e8d98644dcf53b3045ca40f02a"}; WalkerTestSpec spec3 = new WalkerTestSpec( "-T IntervalCleaner -LOD 5 -cleanedOnly -maxConsensuses 100 -greedy 100 -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.chrom1.SLX.SRP000032.2009_06.bam -L /humgen/gsa-scr1/GATK_Data/Validation_Data/cleaner.test.intervals --OutputCleaned %s -snps %s", 2,