Removed the old cleaner code: deleted the CleanedReadInjector walker and its unit test, removed the IntervalCleaner integration test, and added an IndelRealigner integration test in its place.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2868 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2010-02-22 20:49:58 +00:00
parent 3738b76320
commit 8b555ff17c
6 changed files with 27 additions and 1503 deletions

View File

@@ -1,246 +0,0 @@
package org.broadinstitute.sting.gatk.walkers.indels;
import net.sf.samtools.SAMFileReader;
import net.sf.samtools.SAMFileWriter;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.util.CloseableIterator;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import java.util.*;
/**
* User: hanna
* Date: Jun 10, 2009
* Time: 2:40:19 PM
* BROAD INSTITUTE SOFTWARE COPYRIGHT NOTICE AND AGREEMENT
* Software and documentation are copyright 2005 by the Broad Institute.
* All rights are reserved.
*
* Users acknowledge that this software is supplied without any warranty or support.
* The Broad Institute is not responsible for its use, misuse, or
* functionality.
*/
/**
* Copies reads from the input stream into the outputBAM, replacing those
* reads which have been cleaned/realigned with their new clean copies.
*/
@Requires({DataSource.READS, DataSource.REFERENCE})
public class CleanedReadInjector extends ReadWalker<Integer,Integer> {
    /**
     * The source of all cleaned reads.
     */
    @Argument(fullName="cleaned_reads",shortName="cr",doc="BAM file of reads which have been cleaned.",required=true)
    SAMFileReader cleanedReadsSource = null;

    /**
     * Target file for BAM output.  Receives every non-replaced input read plus
     * the cleaned replacements, interleaved by map().
     */
    @Argument(fullName="output_bam",shortName="ob",doc="Output BAM file",required=true)
    SAMFileWriter outputBAM = null;

    /**
     * The iterator for the cleaned reads; buffers one whole contig at a time
     * (see ByContigIterator below).
     */
    private ByContigIterator cleanedReadsIterator;

    /**
     * The set of (sorted) cleaned reads for the contig currently buffered by
     * cleanedReadsIterator.  Drained from the head as map() advances through
     * the original reads.
     */
    private Queue<SAMRecord> cleanedReads = new LinkedList<SAMRecord>();

    /**
     * The intervals specified by the user, grouped by contig name.  Remains
     * null when no intervals were supplied, in which case emit() writes every
     * cleaned read unconditionally.
     */
    private HashMap<String, ArrayList<GenomeLoc>> intervals = null;

    /**
     * A fast lookup table for uniquified read info (see getUniquifiedReadName)
     * covering the cleaned reads of the currently buffered contig; used to
     * suppress the original copy of any read that has a cleaned replacement.
     */
    private HashSet<String> cleanedReadHash = new HashSet<String>();

    /**
     * Primes the cleaned-read buffer with the first contig of cleaned reads
     * and, if the user supplied intervals, indexes them by contig for emit().
     */
    @Override
    public void initialize() {
        // For now, read the whole file into memory a chromosome at a time (because we can
        // never clean from one chromosome to another). If We ever have time to do this correctly
        // (or it becomes a memory problem), then we'll need to read the hash into memory and the
        // first read; we'd then query the BAM file every time we needed to update the cleaned read iterator.
        cleanedReadsIterator = new ByContigIterator(cleanedReadsSource.iterator());
        // If there are intervals specified by the user, record them so we can make sure not
        // to emit reads outside the intervals. For now, we'll group them by chromosome to
        // make lookup a bit faster.
        if ( this.getToolkit() != null &&
             this.getToolkit().getArguments().intervals != null ) {
            intervals = new HashMap<String, ArrayList<GenomeLoc>>();
            List<GenomeLoc> locs = GenomeAnalysisEngine.parseIntervalRegion(this.getToolkit().getArguments().intervals);
            // The sorted set yields intervals in coordinate order, so each per-contig
            // list below is sorted -- emit() relies on this for its isPast() early exit.
            Iterator<GenomeLoc> iter = GenomeLocSortedSet.createSetFromList(locs).iterator();
            while ( iter.hasNext() ) {
                GenomeLoc loc = iter.next();
                if ( intervals.get(loc.getContig()) == null )
                    intervals.put(loc.getContig(), new ArrayList<GenomeLoc>());
                intervals.get(loc.getContig()).add(loc);
            }
        }
    }

    /**
     * For every read, if it exists in the cleaned read set, ignore it; otherwise, emit it.
     * Also, if the head of the cleaned read list could be emitted here, do so.
     * @param ref Portion of the reference sequence overlapping this read.
     * @param read The read to inspect.
     * @return Number of reads cleaned by this iteration of the map function.
     */
    @Override
    public Integer map(char[] ref, SAMRecord read) {
//        boolean DEBUG = false;
//        if ( read.getReadName().equals("42NY6AAXX091009:6:73:327:916#0") ) {
//            System.out.println(read.toString());
//            System.out.println(getUniquifiedReadName(read)) ;
//            DEBUG=true;
//        }
        // first emit reads from the cleaned set if appropriate
        int cleanedReadCount = 0;
        SAMRecord firstCleanedRead = cleanedReads.peek();
        // NOTE(review): when the queued cleaned read sits on an EARLIER contig than
        // 'read', the getAlignmentStart() comparison below is across contigs and can
        // delay emission until an input read with a larger start coordinate appears --
        // confirm whether cross-contig buffering can actually arise given the
        // contig-at-a-time loading above.
        while ( firstCleanedRead != null &&
                firstCleanedRead.getReferenceIndex() <= read.getReferenceIndex() &&
                firstCleanedRead.getAlignmentStart() <= read.getAlignmentStart() ) {
            if ( emit(firstCleanedRead) )
                cleanedReadCount++;
            cleanedReads.remove();
            firstCleanedRead = cleanedReads.peek();
        }
        // update the hashes if necessary; NOTE: cleanedReadsIterator was intitialized with the
        // contig where the first available cleaned read sits (so it can be any contig, e.g. 22);
        // the 'read' below comes from a different file (original, uncleaned reads, so it may also
        // be anywhere - including lower contig, say 1. cleanedReadIterator can not scroll back,
        // so the statement below will take care of skipping reads on lower contigs until we reach
        // the contig of the first cleaned read.
        if ( read.getReferenceIndex() > cleanedReadsIterator.getCurrentContig() ) {
            cleanedReadsIterator.readNextContig(read.getReferenceIndex());
        }
        // Emit the original read only when no cleaned replacement exists for it.
        if ( !cleanedReadHash.contains(getUniquifiedReadName(read)) ) {
//            if ( DEBUG ) System.out.println("Not found in hash. Hash size:"+cleanedReadHash.size());
            outputBAM.addAlignment(read);
        }
        return cleanedReadCount;
    }

    /**
     * Determine whether to emit the given read; if so, return true.
     * The read is written to the output BAM when no intervals were specified,
     * or when it overlaps one of the user-supplied intervals on its contig.
     */
    private boolean emit(SAMRecord read) {
        // if no intervals were specified, emit everything
        if ( intervals == null ) {
            outputBAM.addAlignment(read);
            return true;
        }
        ArrayList<GenomeLoc> intervalList = intervals.get(read.getReferenceName());
        if ( intervalList == null )
            return false;
        GenomeLoc readLoc = GenomeLocParser.createGenomeLoc(read);
        for ( GenomeLoc interval : intervalList ) {
            // if it overlaps an interval, then we can emit it
            if ( interval.overlapsP(readLoc) ) {
                outputBAM.addAlignment(read);
                return true;
            }
            // once we've passed any interval that could overlap it, just quit
            // (relies on the per-contig interval list being sorted; see initialize()).
            if ( interval.isPast(readLoc) )
                return false;
        }
        // it didn't overlap an interval
        return false;
    }

    /**
     * Initialize traversal with number of reads which have been replaced with a clean version.
     * @return 0 to initialize the traversal.
     */
    @Override
    public Integer reduceInit() { return 0; }

    /**
     * Accumulates the number of reads which have been replaced by their clean equivalents.
     * @param value Number of reads cleaned by this iteration.
     * @param accum Prior number of reads cleaned.
     * @return Total number of reads cleaned, including this iteration.
     */
    @Override
    public Integer reduce( Integer value, Integer accum ) {
        return accum + value;
    }

    /**
     * Closes the underlying cleaned-reads iterator at the end of the traversal.
     * NOTE(review): any cleaned reads still queued at this point (positioned after
     * the last input read, or on contigs never reached by the input) are not
     * flushed to the output here -- confirm callers guarantee the input traversal
     * spans all cleaned positions.
     */
    @Override
    public void onTraversalDone( Integer value ) {
        cleanedReadsIterator.iterator.close();
    }

    /**
     * Gets the distinguishing elements of the read name from the given read.
     * Combines read group, name, flags and bases; a missing "RG" attribute is
     * rendered as the literal string "null" by String.format, which is still a
     * stable key for matching.
     * @param read read to uniquify.
     * @return A (hopefully) completely unique name for the read.
     */
    private static String getUniquifiedReadName( SAMRecord read ) {
        return String.format("%s.%s.%s.%s",read.getAttribute("RG"),read.getReadName(),read.getFlags(),read.getReadString());
    }

    /**
     * Forward-only wrapper over a cleaned-reads iterator that loads one contig's
     * worth of reads at a time into the outer class's cleanedReads queue and
     * cleanedReadHash.  Cannot scroll backwards: requests for an earlier contig
     * throw a StingException.
     */
    private class ByContigIterator {
        // One-record lookahead: the next read not yet assigned to a contig buffer.
        SAMRecord nextRead;
        // The underlying iterator; closed by the walker in onTraversalDone().
        CloseableIterator<SAMRecord> iterator;
        // Reference index of the contig currently buffered; -1 before any load.
        int contig = -1;

        public ByContigIterator(CloseableIterator<SAMRecord> iterator) {
            this.iterator = iterator;
            nextRead = (iterator.hasNext() ? iterator.next() : null);
            // Eagerly buffer the contig of the first cleaned read, if any.
            if ( nextRead != null )
                readNextContig(nextRead.getReferenceIndex());
        }

        int getCurrentContig() { return contig; }

        /**
         * Advances to newContig: clears the outer queue/hash, skips reads on
         * intermediate contigs, then buffers every read on newContig.
         * Throws StingException if asked to move backwards.
         */
        public void readNextContig(int newContig) {
            if ( newContig < contig )
                throw new StingException("Requested shift to contig "+newContig+" which is before the current contig "+contig);
            // don't do anything if we're in the right contig or have no reads
            if ( newContig == contig || nextRead == null )
                return;
            System.out.println("Loading contig "+newContig+"; old contig was "+contig);
            contig = newContig;
            cleanedReadHash.clear();
            cleanedReads.clear();
            // first, get to the right contig
            while ( nextRead != null &&
                    nextRead.getReferenceIndex() != contig ) {
                nextRead = (iterator.hasNext() ? iterator.next() : null);
            }
            // now, read in all of the reads for this contig
            while ( nextRead != null &&
                    nextRead.getReferenceIndex() == contig ) {
                cleanedReads.add(nextRead);
                cleanedReadHash.add(getUniquifiedReadName(nextRead));
//                if ( nextRead.getReadName().equals("42NY6AAXX091009:6:73:327:916#0") ) {
//                    System.out.println("In hash: "+getUniquifiedReadName(nextRead));
//                    System.out.println("In hash: "+nextRead.toString());
//                }
                nextRead = (iterator.hasNext() ? iterator.next() : null);
            }
        }
    }
}

View File

@@ -1,199 +0,0 @@
package org.broadinstitute.sting.gatk.walkers.indels;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.sam.ArtificialSAMFileReader;
import org.broadinstitute.sting.utils.sam.ArtificialSAMFileWriter;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.junit.BeforeClass;
import org.junit.Test;
import java.io.FileNotFoundException;
import java.io.File;
import java.util.Arrays;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMRecord;
import junit.framework.Assert;
/**
* User: hanna
* Date: Jun 11, 2009
* Time: 9:54:05 AM
* BROAD INSTITUTE SOFTWARE COPYRIGHT NOTICE AND AGREEMENT
* Software and documentation are copyright 2005 by the Broad Institute.
* All rights are reserved.
*
* Users acknowledge that this software is supplied without any warranty or support.
* The Broad Institute is not responsible for its use, misuse, or
* functionality.
*/
/**
* Test the walker that injects clean reads into the bam file.
*/
public class CleanedReadInjectorTest extends BaseTest {
    /**
     * The fasta, for comparison.
     */
    protected static IndexedFastaSequenceFile sequenceFile = null;

    /**
     * Initialize the fasta.
     * @throws FileNotFoundException if fasta or index is not found.
     */
    @BeforeClass
    public static void initialize() throws FileNotFoundException {
        sequenceFile = new IndexedFastaSequenceFile( new File(seqLocation + "/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta") );
        // Contig ordering must be registered before any GenomeLoc is created by the walker.
        GenomeLocParser.setupRefContigOrdering(sequenceFile);
    }

    /** With no input reads and no cleaned reads, nothing should reach the output. */
    @Test
    public void testNoReads() {
        ArtificialSAMFileReader cleanedReads = new ArtificialSAMFileReader();
        ArtificialSAMFileWriter output = new ArtificialSAMFileWriter();
        CleanedReadInjector walker = createWalker( cleanedReads, output );
        walker.initialize();
        walker.onTraversalDone(0);
        Assert.assertEquals("Too many records in output",0,output.getRecords().size());
    }

    /** A source read with no cleaned counterpart passes through unchanged. */
    @Test
    public void testNoCleanedReads() {
        SAMFileHeader header = getMockSAMFileHeader();
        SAMRecord sourceRead = ArtificialSAMUtils.createArtificialRead(header,"read1",0,1,5);
        ArtificialSAMFileReader cleanedReads = new ArtificialSAMFileReader();
        ArtificialSAMFileWriter output = new ArtificialSAMFileWriter();
        CleanedReadInjector walker = createWalker( cleanedReads, output );
        int result = runWalkerOverReads(walker,sourceRead);
        Assert.assertEquals("Result of traversal is incorrect",0,result);
        Assert.assertEquals("Incorrect number of records in output",1,output.getRecords().size());
        Assert.assertEquals("Output record is incorrect",sourceRead,output.getRecords().get(0));
    }

    /**
     * A cleaned read with the same uniquified name as the source read replaces it:
     * exactly one record (the cleaned copy) appears in the output.
     */
    @Test
    public void testOneCleanedRead() {
        SAMFileHeader header = getMockSAMFileHeader();
        SAMRecord sourceRead = ArtificialSAMUtils.createArtificialRead(header,"read1",1,1,5);
        SAMRecord cleanedRead = ArtificialSAMUtils.createArtificialRead(header,"read1",1,1,5);
        // Differing base qualities distinguish the cleaned copy from the source read.
        cleanedRead.setBaseQualities(getMockBaseQualityString((byte)1,cleanedRead.getReadLength()));
        ArtificialSAMFileReader cleanedReads = new ArtificialSAMFileReader(cleanedRead);
        ArtificialSAMFileWriter output = new ArtificialSAMFileWriter();
        CleanedReadInjector walker = createWalker( cleanedReads, output );
        int result = runWalkerOverReads(walker,sourceRead);
        Assert.assertEquals("Result of traversal is incorrect",1,result);
        Assert.assertEquals("Incorrect number of records in output",1,output.getRecords().size());
        Assert.assertEquals("Output record is incorrect",cleanedRead,output.getRecords().get(0));
    }

    // NOTE(review): this test is byte-for-byte identical to testOneCleanedRead above --
    // it never configures any intervals, so it does not actually exercise the
    // partial-interval-overlap path in CleanedReadInjector.emit().  Either wire up
    // interval arguments through the walker or remove the duplicate.
    @Test
    public void testPartialIntervalOverlap() {
        SAMFileHeader header = getMockSAMFileHeader();
        SAMRecord sourceRead = ArtificialSAMUtils.createArtificialRead(header,"read1",1,1,5);
        SAMRecord cleanedRead = ArtificialSAMUtils.createArtificialRead(header,"read1",1,1,5);
        cleanedRead.setBaseQualities(getMockBaseQualityString((byte)1,cleanedRead.getReadLength()));
        ArtificialSAMFileReader cleanedReads = new ArtificialSAMFileReader(cleanedRead);
        ArtificialSAMFileWriter output = new ArtificialSAMFileWriter();
        CleanedReadInjector walker = createWalker( cleanedReads, output );
        int result = runWalkerOverReads(walker,sourceRead);
        Assert.assertEquals("Result of traversal is incorrect",1,result);
        Assert.assertEquals("Incorrect number of records in output",1,output.getRecords().size());
        Assert.assertEquals("Output record is incorrect",cleanedRead,output.getRecords().get(0));
    }

    /**
     * One of five source reads has a cleaned replacement; the other four pass
     * through untouched, preserving order.
     */
    @Test
    public void testMixedCleanedAndUncleanedNonOverlapping() {
        SAMFileHeader header = getMockSAMFileHeader();
        SAMRecord[] sourceReads = new SAMRecord[] { ArtificialSAMUtils.createArtificialRead(header,"read1",1,1,5),
                ArtificialSAMUtils.createArtificialRead(header,"read2",1,2,5),
                ArtificialSAMUtils.createArtificialRead(header,"read3",1,3,5),
                ArtificialSAMUtils.createArtificialRead(header,"read4",1,4,5),
                ArtificialSAMUtils.createArtificialRead(header,"read5",1,5,5) };
        SAMRecord cleanedRead = ArtificialSAMUtils.createArtificialRead(header,"read1",1,1,1);
        cleanedRead.setBaseQualities(getMockBaseQualityString((byte)1,cleanedRead.getReadLength()));
        ArtificialSAMFileReader cleanedReads = new ArtificialSAMFileReader(cleanedRead);
        ArtificialSAMFileWriter output = new ArtificialSAMFileWriter();
        CleanedReadInjector walker = createWalker( cleanedReads, output );
        int result = runWalkerOverReads(walker,sourceReads);
        Assert.assertEquals("Result of traversal is incorrect",1,result);
        Assert.assertEquals("Incorrect number of records in output",5,output.getRecords().size());
        for( int i = 0; i < sourceReads.length; i++ ) {
            if( i == 0 )
                Assert.assertEquals("Output record is incorrect",cleanedRead,output.getRecords().get(i));
            else
                Assert.assertEquals("Output record is incorrect",sourceReads[i],output.getRecords().get(i));
        }
    }

    /**
     * The cleaned copy of read1 realigns to a later position (start 3), so it
     * must be re-emitted in coordinate order between read2 and read3.
     */
    @Test
    public void testAlternateReadOrder() {
        SAMFileHeader header = getMockSAMFileHeader();
        SAMRecord[] sourceReads = new SAMRecord[] { ArtificialSAMUtils.createArtificialRead(header,"read1",1,1,5),
                ArtificialSAMUtils.createArtificialRead(header,"read2",1,2,5),
                ArtificialSAMUtils.createArtificialRead(header,"read3",1,3,5) };
        SAMRecord cleanedRead = ArtificialSAMUtils.createArtificialRead(header,"read1",1,3,1);
        cleanedRead.setBaseQualities(getMockBaseQualityString((byte)1,cleanedRead.getReadLength()));
        ArtificialSAMFileReader cleanedReads = new ArtificialSAMFileReader(cleanedRead);
        ArtificialSAMFileWriter output = new ArtificialSAMFileWriter();
        CleanedReadInjector walker = createWalker( cleanedReads, output );
        int result = runWalkerOverReads(walker,sourceReads);
        Assert.assertEquals("Result of traversal is incorrect",1,result);
        Assert.assertEquals("Incorrect number of records in output",3,output.getRecords().size());
        Assert.assertEquals("Incorrect read at position 1",sourceReads[1],output.getRecords().get(0));
        Assert.assertEquals("Incorrect read at position 2",cleanedRead,output.getRecords().get(1));
        Assert.assertEquals("Incorrect read at position 3",sourceReads[2],output.getRecords().get(2));
    }

    /** Builds a walker wired directly to the given fake reader/writer (no toolkit). */
    private CleanedReadInjector createWalker( ArtificialSAMFileReader cleanedReads, ArtificialSAMFileWriter output ) {
        CleanedReadInjector walker = new CleanedReadInjector();
        walker.cleanedReadsSource = cleanedReads;
        walker.outputBAM = output;
        return walker;
    }

    /** A two-contig header large enough for human chr1 coordinates. */
    private SAMFileHeader getMockSAMFileHeader() {
        return ArtificialSAMUtils.createArtificialSamHeader(2,0,247249719);
    }

    /**
     * Drives the walker's full lifecycle (initialize, map/reduce over each read,
     * onTraversalDone) and returns the accumulated cleaned-read count.
     */
    private Integer runWalkerOverReads( CleanedReadInjector walker, SAMRecord... reads ) {
        walker.initialize();
        Integer accum = walker.reduceInit();
        for( SAMRecord sourceRead: reads ) {
            Integer value = walker.map( null, sourceRead );
            accum = walker.reduce(value,accum);
        }
        walker.onTraversalDone(accum);
        return accum;
    }

    /**
     * Fills a base-quality array of the given length with a single value.
     * NOTE(review): despite the name this returns byte[], not a String.
     */
    private byte[] getMockBaseQualityString( byte value, int length ) {
        byte[] baseQualities = new byte[length];
        Arrays.fill(baseQualities,value);
        return baseQualities;
    }
}

View File

@@ -0,0 +1,26 @@
package org.broadinstitute.sting.gatk.walkers.indels;
import org.broadinstitute.sting.WalkerTest;
import org.junit.Test;
import java.util.Arrays;
public class IndelRealignerIntegrationTest extends WalkerTest {
    /**
     * Runs the IndelRealigner over a small slice of NA12878 chromosome 1 at two
     * different LOD thresholds and checks the BAM and snps outputs against
     * known-good MD5s.
     */
    @Test
    public void testIntervals() {
        executeTest("testLod5",
                buildSpec(5, "67c3fc25e9d192cc5fbfd48ade0efc84", "86778f92b0fa6aa7c26e651c8c1eb320"));
        executeTest("testLod200",
                buildSpec(200, "96edef86cea95f312ee8295b38227eb8", "d4d8ff567b614729ab8c52bd7d6bef48"));
    }

    /**
     * Builds a realigner spec for the given LOD threshold.  The command line is
     * identical for both runs apart from the -LOD value.
     */
    private WalkerTestSpec buildSpec(int lod, String bamMd5, String snpsMd5) {
        String commandLine = "-T IndelRealigner -LOD " + lod + " -maxConsensuses 100 -greedy 100 -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "NA12878.chrom1.SLX.SRP000032.2009_06.bam -L 1:10023800-10332350 -compress 1 -targetIntervals " + validationDataLocation + "cleaner.test.intervals -O %s -snps %s";
        return new WalkerTestSpec(commandLine, 2, Arrays.asList(bamMd5, snpsMd5));
    }
}

View File

@@ -1,33 +0,0 @@
package org.broadinstitute.sting.gatk.walkers.indels;
import org.broadinstitute.sting.WalkerTest;
import org.junit.Test;
import java.util.Arrays;
public class IntervalCleanerIntegrationTest extends WalkerTest {
    /**
     * Runs the IntervalCleaner over NA12878 chromosome 1 test intervals in three
     * configurations (LOD 5, LOD 200, and LOD 5 with -cleanedOnly) and checks
     * the cleaned-BAM and snps outputs against known-good MD5s.
     */
    @Test
    public void testIntervals() {
        executeTest("testLod5",
                makeSpec("-LOD 5", "3c4339cb2a258d1c67bf4d930fe0055a", "86778f92b0fa6aa7c26e651c8c1eb320"));
        executeTest("testLod200",
                makeSpec("-LOD 200", "32401cef2134d973ff0037df27f1dcca", "d4d8ff567b614729ab8c52bd7d6bef48"));
        executeTest("testCleanedOnly",
                makeSpec("-LOD 5 -cleanedOnly", "6b03a934c65a4964d29c147de2f89144", "86778f92b0fa6aa7c26e651c8c1eb320"));
    }

    /**
     * Builds a cleaner spec; lodArgs carries the -LOD value plus any extra flags
     * (e.g. -cleanedOnly), while the rest of the command line is shared.
     */
    private WalkerTestSpec makeSpec(String lodArgs, String cleanedMd5, String snpsMd5) {
        return new WalkerTestSpec(
                "-T IntervalCleaner " + lodArgs + " -maxConsensuses 100 -greedy 100 -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "NA12878.chrom1.SLX.SRP000032.2009_06.bam -L " + validationDataLocation + "cleaner.test.intervals --OutputCleaned %s -snps %s",
                2,
                Arrays.asList(cleanedMd5, snpsMd5));
    }
}

View File

@@ -5,7 +5,7 @@ import org.junit.Test;
import java.util.Arrays;
public class IntervalsIntegrationTest extends WalkerTest {
public class RealignerTargetCreatorIntegrationTest extends WalkerTest {
@Test
public void testIntervals() {