diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java b/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java index f09dc9f67..0ec0e0e8c 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java @@ -27,6 +27,7 @@ package org.broadinstitute.sting.gatk.walkers.indels; import net.sf.samtools.*; import net.sf.samtools.util.StringUtil; +import net.sf.picard.reference.IndexedFastaSequenceFile; import org.broad.tribble.util.variantcontext.VariantContext; import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.utils.interval.IntervalMergingRule; @@ -35,8 +36,6 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.*; import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; import org.broadinstitute.sting.gatk.walkers.ReadWalker; -import org.broadinstitute.sting.gatk.walkers.Reference; -import org.broadinstitute.sting.gatk.walkers.Window; import org.broadinstitute.sting.gatk.filters.BadMateFilter; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.interval.IntervalFileMergingIterator; @@ -54,7 +53,7 @@ import java.util.*; * Unlike most mappers, this walker uses the full alignment context to determine whether an * appropriate alternate reference (i.e. indel) exists and updates SAMRecords accordingly. */ -@Reference(window=@Window(start=-1,stop=150)) +//Reference(window=@Window(start=-30,stop=30)) public class IndelRealigner extends ReadWalker { public static final String ORIGINAL_CIGAR_TAG = "OC"; @@ -124,6 +123,9 @@ public class IndelRealigner extends ReadWalker { @Output(fullName="SNPsFileForDebugging", shortName="snps", doc="print out whether mismatching columns do or don't get cleaned out; FOR DEBUGGING PURPOSES ONLY", required=false) protected String OUT_SNPS = null; + // fasta reference reader to supplement the edges of the reference sequence + private IndexedFastaSequenceFile referenceReader; + // the intervals input by the user private Iterator intervals = null; @@ -152,6 +154,10 @@ public class IndelRealigner extends ReadWalker { private static final double SW_GAP = -10.0; //-1.0-1.0/3.0; private static final double SW_GAP_EXTEND = -2.0; //-1.0/.0; + // reference base padding size + // TODO -- make this a command-line argument if the need arises + private static final int REFERENCE_PADDING = 30; + // other output files private FileWriter indelOutput = null; private FileWriter statsOutput = null; @@ -164,6 +170,8 @@ public class IndelRealigner extends ReadWalker { if ( MISMATCH_THRESHOLD <= 0.0 || MISMATCH_THRESHOLD > 1.0 ) throw new RuntimeException("Entropy threshold must be a fraction between 0 and 1"); + referenceReader = new IndexedFastaSequenceFile(getToolkit().getArguments().referenceFile); + if ( !TARGET_NOT_SORTED && IntervalUtils.isIntervalFile(intervalsFile)) { // prepare to read intervals one-by-one, as needed (assuming they are sorted). intervals = new IntervalFileMergingIterator( new java.io.File(intervalsFile), IntervalMergingRule.OVERLAPPING_ONLY ); @@ -282,11 +290,7 @@ public class IndelRealigner extends ReadWalker { if ( doNotTryToClean(read) ) { readsNotToClean.add(read); } else { - boolean wellCoveredInterval = readsToClean.add(read, ref.getBases()); - if ( !wellCoveredInterval ) { - logger.info("We are aborting the realignment in interval " + currentInterval + " because it is not fully covered by reads"); - abortCleanForCurrentInterval(); - } + readsToClean.add(read); // add the rods to the list of known variants populateKnownIndels(metaDataTracker, ref); @@ -436,7 +440,7 @@ public class IndelRealigner extends ReadWalker { if ( reads.size() == 0 ) return; - final byte[] reference = readsToClean.getRereference(); + final byte[] reference = readsToClean.getReference(referenceReader); final long leftmostIndex = readsToClean.getLocation().getStart(); final ArrayList refReads = new ArrayList(); // reads that perfectly match ref @@ -532,7 +536,7 @@ public class IndelRealigner extends ReadWalker { if ( !alternateReducesEntropy(altReads, reference, leftmostIndex) ) { if ( statsOutput != null ) { try { - statsOutput.write(readsToClean.getLocation().toString()); + statsOutput.write(currentInterval.toString()); statsOutput.write("\tFAIL (bad indel)\t"); // if improvement > LOD_THRESHOLD *BUT* entropy is not reduced (SNPs still exist) statsOutput.write(Double.toString(improvement)); statsOutput.write("\n"); @@ -570,7 +574,7 @@ public class IndelRealigner extends ReadWalker { } if ( statsOutput != null ) { try { - statsOutput.write(readsToClean.getLocation().toString()); + statsOutput.write(currentInterval.toString()); statsOutput.write("\tCLEAN"); // if improvement > LOD_THRESHOLD *AND* entropy is reduced if ( bestConsensus.cigar.numCigarElements() > 1 ) statsOutput.write(" (found indel)"); @@ -602,7 +606,7 @@ public class IndelRealigner extends ReadWalker { } else if ( statsOutput != null ) { try { statsOutput.write(String.format("%s\tFAIL\t%.1f%n", - readsToClean.getLocation().toString(), improvement)); + currentInterval.toString(), improvement)); statsOutput.flush(); } catch (Exception e) { throw new StingException(e.getMessage()); @@ -902,8 +906,8 @@ public class IndelRealigner extends ReadWalker { readCigar.add(new CigarElement(endOfFirstBlock - myPosOnAlt, CigarOperator.M)); } - int indelOffsetOnRef = 0, indelOffsetOnRead = 0; // forward along the indel + //int indelOffsetOnRef = 0, indelOffsetOnRead = 0; if ( indelCE.getOperator() == CigarOperator.I ) { // for reads that end in an insertion if ( myPosOnAlt + aRead.getReadLength() < endOfFirstBlock + indelCE.getLength() ) { @@ -916,16 +920,16 @@ public class IndelRealigner extends ReadWalker { if ( !sawAlignmentStart && myPosOnAlt < endOfFirstBlock + indelCE.getLength() ) { aRead.setAlignmentStart(leftmostIndex + endOfFirstBlock); readCigar.add(new CigarElement(indelCE.getLength() - (myPosOnAlt - endOfFirstBlock), CigarOperator.I)); - indelOffsetOnRead = myPosOnAlt - endOfFirstBlock; + //indelOffsetOnRead = myPosOnAlt - endOfFirstBlock; sawAlignmentStart = true; } else if ( sawAlignmentStart ) { readCigar.add(indelCE); - indelOffsetOnRead = indelCE.getLength(); + //indelOffsetOnRead = indelCE.getLength(); } } else if ( indelCE.getOperator() == CigarOperator.D ) { if ( sawAlignmentStart ) readCigar.add(indelCE); - indelOffsetOnRef = indelCE.getLength(); + //indelOffsetOnRef = indelCE.getLength(); } // for reads that start after the indel @@ -1297,33 +1301,27 @@ public class IndelRealigner extends ReadWalker { // Return false if we can't process this read bin because the reads are not correctly overlapping. // This can happen if e.g. there's a large known indel with no overlapping reads. - public boolean add(SAMRecord read, byte[] ref) { + public void add(SAMRecord read) { + + GenomeLoc locForRead = GenomeLocParser.createGenomeLoc(read); + if ( loc == null ) + loc = locForRead; + else if ( locForRead.getStop() > loc.getStop() ) + loc = GenomeLocParser.createGenomeLoc(loc.getContigIndex(), loc.getStart(), locForRead.getStop()); + reads.add(read); - - // set up the reference - if ( reference == null ) { - reference = ref; - loc = GenomeLocParser.createGenomeLoc(read); - } else { - long lastPosWithRefBase = loc.getStart() + reference.length -1; - int neededBases = (int)(read.getAlignmentEnd() - lastPosWithRefBase); - if ( neededBases > ref.length ) - return false; - if ( neededBases > 0 ) { - byte[] newReference = new byte[reference.length + neededBases]; - System.arraycopy(reference, 0, newReference, 0, reference.length); - System.arraycopy(ref, ref.length-neededBases, newReference, reference.length, neededBases); - reference = newReference; - loc = GenomeLocParser.createGenomeLoc(loc.getContigIndex(), loc.getStart(), loc.getStop()+neededBases); - } - } - - return true; } public List getReads() { return reads; } - public byte[] getRereference() { + public byte[] getReference(IndexedFastaSequenceFile referenceReader) { + // set up the reference if we haven't done so yet + if ( reference == null ) { + // first, pad the reference to handle deletions in narrow windows (e.g. those with only 1 read) + loc = GenomeLocParser.createGenomeLoc(loc.getContigIndex(), loc.getStart()-REFERENCE_PADDING, loc.getStop()+REFERENCE_PADDING); + reference = referenceReader.getSubsequenceAt(loc.getContig(), loc.getStart(), loc.getStop()).getBases(); + } + return reference; } diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerIntegrationTest.java index eea1a4ca2..8458e55c5 100755 --- a/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerIntegrationTest.java @@ -9,7 +9,7 @@ public class IndelRealignerIntegrationTest extends WalkerTest { @Test public void testRealignerLod5() { - String[] md5s = {"9247b1437ec3b9bc96590524f245220c", "18fca887d1eb7dc300e717ae03b9da62"}; + String[] md5s = {"9247b1437ec3b9bc96590524f245220c", "c4ef635f2597b12b93a73199f07e509b"}; WalkerTestSpec spec = new WalkerTestSpec( "-T IndelRealigner -noPG -LOD 5 -maxConsensuses 100 -greedy 100 -R " + b36KGReference + " -I " + validationDataLocation + "NA12878.chrom1.SLX.SRP000032.2009_06.bam -L 1:10023000-10030000 -compress 1 -targetIntervals " + validationDataLocation + "cleaner.test.intervals -O %s -stats %s --sortInCoordinateOrderEvenThoughItIsHighlyUnsafe", 2, @@ -19,7 +19,7 @@ public class IndelRealignerIntegrationTest extends WalkerTest { @Test public void testRealignerLod50() { - String[] md5s = {"9247b1437ec3b9bc96590524f245220c", "9537e4f195ce5840136f60fb61201369"}; + String[] md5s = {"9247b1437ec3b9bc96590524f245220c", "3735a510513b6fa4161d92155e026283"}; WalkerTestSpec spec = new WalkerTestSpec( "-T IndelRealigner -noPG -LOD 50 -maxConsensuses 100 -greedy 100 -R " + b36KGReference + " -I " + validationDataLocation + "NA12878.chrom1.SLX.SRP000032.2009_06.bam -L 1:10023000-10030000 -compress 1 -targetIntervals " + validationDataLocation + "cleaner.test.intervals -O %s -stats %s --sortInCoordinateOrderEvenThoughItIsHighlyUnsafe", 2, @@ -29,7 +29,7 @@ public class IndelRealignerIntegrationTest extends WalkerTest { @Test public void testRealignerKnownsOnly() { - String[] md5s = {"7084d4e543bc756730ab306768028530", "1091436c40d5ba557d85662999cc0c1d"}; + String[] md5s = {"7084d4e543bc756730ab306768028530", "74652bd8240291293ec921f8ecfa1622"}; WalkerTestSpec spec = new WalkerTestSpec( "-T IndelRealigner -noPG -LOD 1.0 -R " + b36KGReference + " -I " + validationDataLocation + "NA12878.chrom1.SLX.SRP000032.2009_06.bam -L 1:10023000-10076000 -compress 1 -targetIntervals " + validationDataLocation + "NA12878.indels.intervals -B knownIndels,VCF," + validationDataLocation + "NA12878.indels.vcf4 -O %s -stats %s --sortInCoordinateOrderEvenThoughItIsHighlyUnsafe -knownsOnly", 2,