From 1dd9996f3a2339a9a197cd5341d9b7d38fd879e9 Mon Sep 17 00:00:00 2001 From: ebanks Date: Thu, 28 Jan 2010 04:17:20 +0000 Subject: [PATCH] New realigner now completely uses bytes, plus misc fixes. Still not ready for use. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2719 348d0f76-0448-11de-a6fe-93d51630548a --- .../gatk/walkers/indels/IndelRealigner.java | 78 ++++++++++++------- .../walkers/indels/IntervalCleanerWalker.java | 2 +- .../broadinstitute/sting/utils/BaseUtils.java | 10 +-- 3 files changed, 55 insertions(+), 35 deletions(-) diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java b/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java index 953ec070a..aee0bf0cd 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java @@ -19,7 +19,7 @@ import java.io.FileWriter; */ public class IndelRealigner extends ReadWalker { - @Argument(fullName="intervals", shortName="intervals", doc="intervals file output from RealignerTargetCreator", required=true) + @Argument(fullName="targetIntervals", shortName="targetIntervals", doc="intervals file output from RealignerTargetCreator", required=true) protected String intervalsFile = null; @Argument(fullName="LODThresholdForCleaning", shortName="LOD", doc="LOD threshold above which the cleaner will clean", required=false) @@ -33,8 +33,6 @@ public class IndelRealigner extends ReadWalker { @Argument(fullName="OutputIndels", shortName="indels", required=false, doc="Output file (text) for the indels found") String OUT_INDELS = null; - @Argument(fullName="OutputCleanedReadsOnly", shortName="cleanedOnly", doc="print out cleaned reads only (otherwise, all reads within the intervals)", required=false) - boolean cleanedReadsOnly = false; @Argument(fullName="statisticsFile", shortName="stats", doc="print out statistics (what does or doesn't get cleaned)", required=false) String OUT_STATS = null; @Argument(fullName="SNPsFile", shortName="snps", doc="print out whether mismatching columns do or don't get cleaned out", required=false) @@ -47,6 +45,8 @@ public class IndelRealigner extends ReadWalker { @Argument(fullName="maxReadsForRealignment", shortName="maxReads", doc="max reads allowed at an interval for realignment", required=false) int MAX_READS = 20000; + @Argument(fullName="writerWindowSize", shortName="writerWindowSize", doc="the window over which the writer will store reads", required=false) + protected int SORTING_WRITER_WINDOW = 100; // the intervals input by the user private Iterator intervals = null; @@ -56,6 +56,7 @@ public class IndelRealigner extends ReadWalker { // the reads that fall into the current interval private ReadBin readsToClean = new ReadBin(); + private ArrayList readsNotToClean = new ArrayList(); // the wrapper around the SAM writer private SortingSAMFileWriter writer = null; @@ -63,8 +64,8 @@ public class IndelRealigner extends ReadWalker { // random number generator private Random generator; - public static final int MAX_QUAL = 99; - public static final long RANDOM_SEED = 1252863495; + private static final int MAX_QUAL = 99; + private static final long RANDOM_SEED = 1252863495; // fraction of mismatches that need to no longer mismatch for a column to be considered cleaned private static final double MISMATCH_COLUMN_CLEANED_FRACTION = 0.75; @@ -91,7 +92,7 @@ public class IndelRealigner extends ReadWalker { currentInterval = intervals.hasNext() ? intervals.next() : null; if ( baseWriter != null ) - writer = new SortingSAMFileWriter(baseWriter, 50); + writer = new SortingSAMFileWriter(baseWriter, SORTING_WRITER_WINDOW); generator = new Random(RANDOM_SEED); @@ -141,6 +142,9 @@ public class IndelRealigner extends ReadWalker { } GenomeLoc readLoc = GenomeLocParser.createGenomeLoc(read); + // hack to get around unmapped reads having screwy locations + if ( readLoc.getStop() == 0 ) + readLoc = GenomeLocParser.createGenomeLoc(readLoc.getContig(), readLoc.getStart(), readLoc.getStart()); if ( readLoc.isBefore(currentInterval) ) { emit(read); @@ -153,20 +157,29 @@ public class IndelRealigner extends ReadWalker { read.getMappingQuality() == 0 || read.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START || Utils.is454Read(read) ) { - emit(read); + readsNotToClean.add(read); + } else { + readsToClean.add(read, ref); } - readsToClean.add(read, ref); - if ( readsToClean.size() >= MAX_READS ) { - emit(readsToClean.getReads()); + if ( readsToClean.size() + readsNotToClean.size() >= MAX_READS ) { + // merge the two sets for emission + readsNotToClean.addAll(readsToClean.getReads()); + emit(readsNotToClean); readsToClean.clear(); + readsNotToClean.clear(); currentInterval = intervals.hasNext() ? intervals.next() : null; } } else { // the read is past the current interval clean(readsToClean); - emit(readsToClean.getReads()); + + // merge the two sets for emission + readsNotToClean.addAll(readsToClean.getReads()); + emit(readsNotToClean); readsToClean.clear(); + readsNotToClean.clear(); + currentInterval = intervals.hasNext() ? intervals.next() : null; // call back into map now that the state has been updated @@ -185,9 +198,12 @@ public class IndelRealigner extends ReadWalker { } public void onTraversalDone(Integer result) { - if ( readsToClean.size() > 0 ) { + if ( readsToClean.size() > 0 || readsNotToClean.size() > 0 ) { clean(readsToClean); - emit(readsToClean.getReads()); + + // merge the two sets for emission + readsNotToClean.addAll(readsToClean.getReads()); + emit(readsNotToClean); } writer.close(); @@ -215,7 +231,6 @@ public class IndelRealigner extends ReadWalker { } } - private static int mismatchQualitySumIgnoreCigar(AlignedRead aRead, byte[] refSeq, int refIndex) { byte[] readSeq = aRead.getRead().getReadBases(); byte[] quals = aRead.getRead().getBaseQualities(); @@ -229,7 +244,7 @@ public class IndelRealigner extends ReadWalker { if ( BaseUtils.simpleBaseToBaseIndex(readChr) == -1 || BaseUtils.simpleBaseToBaseIndex(refChr) == -1 ) continue; // do not count Ns/Xs/etc ? - if ( Character.toUpperCase(readChr) != Character.toUpperCase(refChr) ) + if ( readChr != refChr ) sum += (int)quals[readIndex]; } } @@ -247,6 +262,9 @@ public class IndelRealigner extends ReadWalker { private void clean(ReadBin readsToClean) { List reads = readsToClean.getReads(); + if ( reads.size() == 0 ) + return; + byte[] reference = readsToClean.getRereference(); long leftmostIndex = readsToClean.getLocation().getStart(); @@ -409,10 +427,10 @@ public class IndelRealigner extends ReadWalker { int length = ce.getLength(); if ( ce.getOperator() == CigarOperator.D ) { for ( int i = 0; i < length; i++) - str.append(reference[position+i]); + str.append((char)reference[position+i]); } else { for ( int i = 0; i < length; i++) - str.append(bestConsensus.str[position+i]); + str.append((char)bestConsensus.str[position+i]); } str.append("\t" + (((double)(totalMismatchSum - bestConsensus.mismatchSum))/10.0) + "\n"); try { @@ -469,7 +487,7 @@ public class IndelRealigner extends ReadWalker { // create the new consensus StringBuilder sb = new StringBuilder(); for (int i = 0; i < indexOnRef; i++) - sb.append(reference[i]); + sb.append((char)reference[i]); //logger.debug("CIGAR = " + AlignmentUtils.cigarToString(c)); int indelCount = 0; @@ -489,7 +507,7 @@ public class IndelRealigner extends ReadWalker { ok_flag = false; else { for (int j = 0; j < elementLength; j++) - sb.append(reference[refIdx+j]); + sb.append((char)reference[refIdx+j]); } refIdx += elementLength; altIdx += elementLength; @@ -507,7 +525,7 @@ public class IndelRealigner extends ReadWalker { return null; for (int i = refIdx; i < reference.length; i++) - sb.append(reference[i]); + sb.append((char)reference[i]); byte[] altConsensus = StringUtil.stringToBytes(sb.toString()); // alternative consensus sequence we just built from the cuurent read // if ( debugOn ) System.out.println("Alt consensus generated: "+altConsensus); @@ -741,15 +759,15 @@ public class IndelRealigner extends ReadWalker { int difference = 0; // we can move indel 'difference' bases left final int indel_length = ce2.getLength(); - String indelString; // inserted or deleted sequence int period = 0; // period of the inserted/deleted sequence int indelIndexOnRef = refIndex+ce1.getLength() ; // position of the indel on the REF (first deleted base or first base after insertion) int indelIndexOnRead = readIndex+ce1.getLength(); // position of the indel on the READ (first insterted base, of first base after deletion) + byte[] indelString = new byte[ce2.getLength()]; // inserted or deleted sequence if ( ce2.getOperator() == CigarOperator.D ) - indelString = new String(refSeq, indelIndexOnRef, ce2.getLength()).toUpperCase(); // deleted bases + System.arraycopy(refSeq, indelIndexOnRef, indelString, 0, ce2.getLength()); else if ( ce2.getOperator() == CigarOperator.I ) - indelString = new String(readSeq, indelIndexOnRead, ce2.getLength()).toUpperCase(); // get the inserted bases + System.arraycopy(readSeq, indelIndexOnRead, indelString, 0, ce2.getLength()); else // we can get here if there is soft clipping done at the beginning of the read // for now, we'll just punt the issue and not try to realign these @@ -790,8 +808,8 @@ public class IndelRealigner extends ReadWalker { boolean match = true; for ( int testRefPos = newIndex - period, indelPos = 0 ; testRefPos < newIndex; testRefPos++, indelPos++) { - char indelChr = indelString.charAt(indelPos); - if ( Character.toUpperCase(refSeq[testRefPos]) != indelChr || BaseUtils.simpleBaseToBaseIndex(indelChr) == -1 ) { + byte indelChr = indelString[indelPos]; + if ( refSeq[testRefPos] != indelChr || BaseUtils.simpleBaseToBaseIndex(indelChr) == -1 ) { match = false; break; } @@ -1003,10 +1021,12 @@ public class IndelRealigner extends ReadWalker { } else { long lastPosWithRefBase = loc.getStart() + reference.length -1; int neededBases = (int)(read.getAlignmentEnd() - lastPosWithRefBase); - char[] newReference = new char[reference.length + neededBases]; - System.arraycopy(reference, 0, newReference, 0, reference.length); - System.arraycopy(ref, ref.length-neededBases, newReference, reference.length, neededBases); - reference = newReference; + if ( neededBases > 0 ) { + char[] newReference = new char[reference.length + neededBases]; + System.arraycopy(reference, 0, newReference, 0, reference.length); + System.arraycopy(ref, ref.length-neededBases, newReference, reference.length, neededBases); + reference = newReference; + } } } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/indels/IntervalCleanerWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/indels/IntervalCleanerWalker.java index 0b63c5bb1..f66f4fa01 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/indels/IntervalCleanerWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/indels/IntervalCleanerWalker.java @@ -748,7 +748,7 @@ public class IntervalCleanerWalker extends LocusWindowWalker while ( period < indel_length ) { // we will always get at least trivial period = indelStringLength - period = BaseUtils.sequencePeriod(indelString, period+1); + period = BaseUtils.sequencePeriod(StringUtil.stringToBytes(indelString), period+1); if ( indel_length % period != 0 ) continue; // if indel sequence length is not a multiple of the period, it's not gonna work diff --git a/java/src/org/broadinstitute/sting/utils/BaseUtils.java b/java/src/org/broadinstitute/sting/utils/BaseUtils.java index d6738e7e0..10a3e649e 100644 --- a/java/src/org/broadinstitute/sting/utils/BaseUtils.java +++ b/java/src/org/broadinstitute/sting/utils/BaseUtils.java @@ -505,18 +505,18 @@ public class BaseUtils { * @param seq * @return */ - public static int sequencePeriod(String seq, int minPeriod) { - int period = ( minPeriod > seq.length() ? seq.length() : minPeriod ); + public static int sequencePeriod(byte[] seq, int minPeriod) { + int period = ( minPeriod > seq.length ? seq.length : minPeriod ); // we assume that bases [0,period-1] repeat themselves and check this assumption // until we find correct period - for ( int pos = period ; pos < seq.length() ; pos++ ) { + for ( int pos = period ; pos < seq.length ; pos++ ) { int offset = pos % period; // we are currenlty 'offset' bases into the putative repeat of period 'period' // if our current hypothesis holds, base[pos] must be the same as base[offset] - if ( Character.toUpperCase( seq.charAt(pos) ) != - Character.toUpperCase( seq.charAt( offset ) ) + if ( Character.toUpperCase( seq[pos] ) != + Character.toUpperCase( seq[offset] ) ) { // period we have been trying so far does not work.