From 71e3825fa1fb353a7489791acca176fe4ce3f71c Mon Sep 17 00:00:00 2001 From: hanna Date: Thu, 11 Jun 2009 20:18:13 +0000 Subject: [PATCH] First pass of a walker for Eric that searches through an input BAM file for unclean reads, injecting the cleaned reads in their place and outputting the composite result. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@989 348d0f76-0448-11de-a6fe-93d51630548a --- .../walkers/indels/CleanedReadInjector.java | 148 ++++++++++ .../sam/ArtificialPatternedSAMIterator.java | 8 +- .../utils/sam/ArtificialSAMFileReader.java | 90 +++++++ .../utils/sam/ArtificialSAMIterator.java | 6 +- .../utils/sam/ArtificialSAMQueryIterator.java | 2 +- .../sting/utils/sam/ArtificialSAMUtils.java | 5 +- .../simpleDataSources/SAMByReadsTest.java | 1 + .../indels/CleanedReadInjectorTest.java | 255 ++++++++++++++++++ .../sam/ArtificialSAMFileWriterTest.java | 4 +- 9 files changed, 508 insertions(+), 11 deletions(-) create mode 100644 java/src/org/broadinstitute/sting/playground/gatk/walkers/indels/CleanedReadInjector.java create mode 100644 java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMFileReader.java create mode 100644 java/test/org/broadinstitute/sting/playground/gatk/walkers/indels/CleanedReadInjectorTest.java diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/indels/CleanedReadInjector.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/indels/CleanedReadInjector.java new file mode 100644 index 000000000..616619f53 --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/indels/CleanedReadInjector.java @@ -0,0 +1,148 @@ +package org.broadinstitute.sting.playground.gatk.walkers.indels; + +import org.broadinstitute.sting.gatk.walkers.ReadWalker; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.utils.cmdLine.Argument; +import org.broadinstitute.sting.utils.GenomeLoc; +import net.sf.samtools.SAMRecord; +import net.sf.samtools.SAMFileReader; +import net.sf.samtools.SAMFileWriter; +import net.sf.samtools.util.CloseableIterator; + +import java.io.File; +import java.util.Map; +import java.util.HashMap; +import java.util.Queue; +import java.util.LinkedList; +/** + * User: hanna + * Date: Jun 10, 2009 + * Time: 2:40:19 PM + * BROAD INSTITUTE SOFTWARE COPYRIGHT NOTICE AND AGREEMENT + * Software and documentation are copyright 2005 by the Broad Institute. + * All rights are reserved. + * + * Users acknowledge that this software is supplied without any warranty or support. + * The Broad Institute is not responsible for its use, misuse, or + * functionality. + */ + +/** + * Copies reads from the input stream into the outputBAM, replacing those + * reads which have been cleaned with their new clean copies. + */ +public class CleanedReadInjector extends ReadWalker { + + /** + * The source of all cleaned intervals. + */ + @Argument(fullName="cleaned_intervals",shortName="ci",doc="Intervals which have been cleaned.",required=true) + String intervalsSource = null; + + /** + * The source of all cleaned reads. + */ + @Argument(fullName="cleaned_reads",shortName="cr",doc="BAM file of reads which have been cleaned.",required=true) + SAMFileReader cleanedReadsSource = null; + + /** + * Target file for BAM output. + */ + @Argument(fullName="output_bam",shortName="ob",doc="Output BAM file",required=true) + SAMFileWriter outputBAM = null; + + /** + * A stream of processed intervals. The current head of the queue represents the next interval. + */ + private Queue intervals; + + /** + * The interval currently in process. + */ + private GenomeLoc interval = null; + + /** + * A structure for fast lookup of all reads that have been cleaned in the current interval. + */ + private Map cleanedReads = new HashMap(); + + @Override + public void initialize() { + intervals = new LinkedList( GenomeAnalysisEngine.parseIntervalRegion(intervalsSource,false) ); + interval = intervals.remove(); + loadCleanedReadsOverlappingInterval( interval ); + } + + /** + * For every read, inspect the set of cleaned reads that could possibly replace the current one. + * If one exists, add the cleaned read to the output. Otherwise, add the current read. + * @param ref Portion of the reference sequence overlapping this read. + * @param read The read to inspect. + * @return Number of reads cleaned by this iteration of the map function. + */ + @Override + public Integer map(char[] ref, SAMRecord read) { + GenomeLoc loc = new GenomeLoc(read); + + while( loc.isPast(interval) && intervals.size() > 0 ) { + interval = intervals.remove(); + loadCleanedReadsOverlappingInterval(interval); + } + + String uniquifiedReadName = getUniquifiedReadName(read); + int numCleaned = 0; + + if( loc.overlapsP(interval) && cleanedReads.containsKey(uniquifiedReadName) ) { + SAMRecord cleanedRead = cleanedReads.get(uniquifiedReadName); + cleanedRead.setReadName(read.getReadName()); + outputBAM.addAlignment(cleanedRead); + numCleaned = 1; + } + else { + outputBAM.addAlignment(read); + numCleaned = 0; + } + + return numCleaned; + } + + /** + * Initialize traversal with number of reads which have been replaced with a clean version. + * @return 0 to initialize the traversal. + */ + @Override + public Integer reduceInit() { return 0; } + + /** + * Accumulates the number of reads which have been replaced by their clean equivalents. + * @param value Number of reads cleaned by this iteration. + * @param accum Prior number of reads cleaned. + * @return Total number of reads cleaned, including this iteration. + */ + @Override + public Integer reduce( Integer value, Integer accum ) { + return accum + value; + } + + /** + * Load a list of all the reads overlapping the given interval into memory. + * @param interval + */ + private void loadCleanedReadsOverlappingInterval( GenomeLoc interval ) { + CloseableIterator overlappingReads = cleanedReadsSource.queryOverlapping( interval.getContig(), (int)interval.getStart(), (int)interval.getStop() ); + // Load in all reads mapped to this region. The cleaner will augment the read name in a way that uniquifies it. + while( overlappingReads.hasNext() ) { + SAMRecord read = overlappingReads.next(); + cleanedReads.put( read.getReadName(), read ); + } + } + + /** + * Gets the distinguishing elements of the read name from the given read. + * @param read read to uniquify. + * @return A (hopefully) completely unique name for the read. + */ + static String getUniquifiedReadName( SAMRecord read ) { + return String.format("%s.%s.%s.%s",read.getAttribute("RG"),read.getReadName(),read.getFlags(),read.getReadString()); + } +} diff --git a/java/src/org/broadinstitute/sting/utils/sam/ArtificialPatternedSAMIterator.java b/java/src/org/broadinstitute/sting/utils/sam/ArtificialPatternedSAMIterator.java index cbb8a1e02..c5f70d05a 100644 --- a/java/src/org/broadinstitute/sting/utils/sam/ArtificialPatternedSAMIterator.java +++ b/java/src/org/broadinstitute/sting/utils/sam/ArtificialPatternedSAMIterator.java @@ -71,7 +71,7 @@ public class ArtificialPatternedSAMIterator extends ArtificialSAMIterator { reads = new int[readCount]; for (int x = 0; x < readCount; x++) { - reads[x] = x; + reads[x] = x+1; } if (pattern == PATTERN.RANDOM_READS) { // scramble a bunch of the reads @@ -102,9 +102,9 @@ public class ArtificialPatternedSAMIterator extends ArtificialSAMIterator { * @return */ protected boolean createNextRead() { - if (currentRead >= rCount) { + if (currentRead > rCount) { currentChromo++; - currentRead = 0; + currentRead = 1; } // check for end condition, have we finished the chromosome listing, and have no unmapped reads if (currentChromo >= eChromosomeCount) { @@ -137,7 +137,7 @@ public class ArtificialPatternedSAMIterator extends ArtificialSAMIterator { if (read > this.readCount) { return ArtificialSAMUtils.createArtificialRead(this.header, String.valueOf(reads[readCount - 1]), currentChromo, reads[readCount - 1], 50); } - return ArtificialSAMUtils.createArtificialRead(this.header, String.valueOf(reads[read]), currentChromo, reads[read], 50); + return ArtificialSAMUtils.createArtificialRead(this.header, String.valueOf(reads[read-1]), currentChromo, reads[read-1], 50); } } diff --git a/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMFileReader.java b/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMFileReader.java new file mode 100644 index 000000000..3b293a0fc --- /dev/null +++ b/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMFileReader.java @@ -0,0 +1,90 @@ +package org.broadinstitute.sting.utils.sam; + +import net.sf.samtools.SAMFileReader; +import net.sf.samtools.SAMRecord; +import net.sf.samtools.util.CloseableIterator; + +import java.io.InputStream; +import java.io.ByteArrayInputStream; +import java.io.UnsupportedEncodingException; +import java.io.File; +import java.util.Collections; +import java.util.List; +import java.util.Arrays; +import java.util.Iterator; +import java.util.ArrayList; + +import org.broadinstitute.sting.utils.StingException; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.gatk.iterators.NullSAMIterator; +import org.broadinstitute.sting.gatk.Reads; +/** + * User: hanna + * Date: Jun 11, 2009 + * Time: 9:35:31 AM + * BROAD INSTITUTE SOFTWARE COPYRIGHT NOTICE AND AGREEMENT + * Software and documentation are copyright 2005 by the Broad Institute. + * All rights are reserved. + * + * Users acknowledge that this software is supplied without any warranty or support. + * The Broad Institute is not responsible for its use, misuse, or + * functionality. + */ + +/** + * Pass specified reads into the given walker. + */ + +public class ArtificialSAMFileReader extends SAMFileReader { + /** + * Backing data store of reads. + */ + private List reads = null; + + /** + * Construct an artificial SAM file reader. + */ + public ArtificialSAMFileReader(SAMRecord... reads) { + super( createEmptyInputStream(),true ); + this.reads = Arrays.asList(reads); + } + + /** + * @{inheritDoc} + */ + @Override + public CloseableIterator query(final String sequence, final int start, final int end, final boolean contained) { + GenomeLoc region = new GenomeLoc(sequence, start, end); + List coveredSubset = new ArrayList(); + + for( SAMRecord read: reads ) { + GenomeLoc readPosition = new GenomeLoc(read); + if( contained && region.containsP(readPosition) ) coveredSubset.add(read); + else if( !contained && readPosition.overlapsP(region) ) coveredSubset.add(read); + } + + final Iterator iterator = coveredSubset.iterator(); + return new CloseableIterator() { + public boolean hasNext() { return iterator.hasNext(); } + public SAMRecord next() { return iterator.next(); } + public void close() {} + public void remove() { iterator.remove(); } + }; + } + + /** + * Builds an empty input stream for faking out the sam file reader. + * Derive it from a string so that, in the future, it might be possible + * to fake the text of a sam file from samtools output, et.c + * @return Stream that returns no characters. + */ + private static InputStream createEmptyInputStream() { + try { + byte[] byteArray = "".getBytes("ISO-8859-1"); + return new ByteArrayInputStream(byteArray); + } + catch( UnsupportedEncodingException ex ) { + throw new StingException("Unable to build empty input stream",ex); + } + } +} diff --git a/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMIterator.java b/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMIterator.java index 3254a1332..41d3cd62a 100644 --- a/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMIterator.java +++ b/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMIterator.java @@ -38,7 +38,7 @@ public class ArtificialSAMIterator implements PeekingStingIterator { protected int currentChromo = 0; - protected int currentRead = 0; + protected int currentRead = 1; protected int totalReadCount = 0; protected boolean done = false; // the next record @@ -110,9 +110,9 @@ public class ArtificialSAMIterator implements PeekingStingIterator { } protected boolean createNextRead() { - if (currentRead >= rCount) { + if (currentRead > rCount) { currentChromo++; - currentRead = 0; + currentRead = 1; } // check for end condition, have we finished the chromosome listing, and have no unmapped reads if (currentChromo >= eChromosomeCount) { diff --git a/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMQueryIterator.java b/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMQueryIterator.java index 3eb88209b..fdf7771a3 100644 --- a/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMQueryIterator.java +++ b/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMQueryIterator.java @@ -174,7 +174,7 @@ public class ArtificialSAMQueryIterator extends ArtificialSAMIterator { /** make sure we haven't been used as an iterator yet; this is to miror the MergingSamIterator2 action. */ public void ensureUntouched() { - if (this.currentChromo != 0 || this.currentRead > 0) { + if (this.currentChromo != 0 || this.currentRead > 1) { throw new UnsupportedOperationException("We've already been used as an iterator; you can't query after that"); } } diff --git a/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java b/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java index 37320f54d..38704d138 100755 --- a/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java +++ b/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java @@ -7,6 +7,7 @@ import java.util.*; import org.broadinstitute.sting.gatk.iterators.PeekingStingIterator; import org.broadinstitute.sting.gatk.Reads; +import org.broadinstitute.sting.utils.StingException; /** * @@ -114,10 +115,12 @@ public class ArtificialSAMUtils { * @return the artificial read */ public static SAMRecord createArtificialRead( SAMFileHeader header, String name, int refIndex, int alignmentStart, int length ) { + if( alignmentStart == 0 ) + throw new StingException("Invalid alignment start for artificial read"); SAMRecord record = new SAMRecord(header); record.setReadName(name); record.setReferenceIndex(refIndex); - record.setAlignmentStart(alignmentStart + 1); + record.setAlignmentStart(alignmentStart); List elements = new ArrayList(); elements.add(new CigarElement(length, CigarOperator.characterToEnum('M'))); record.setCigar(new Cigar(elements)); diff --git a/java/test/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMByReadsTest.java b/java/test/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMByReadsTest.java index e73d70ec5..07d5b58d6 100755 --- a/java/test/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMByReadsTest.java +++ b/java/test/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMByReadsTest.java @@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.datasources.simpleDataSources; import static junit.framework.Assert.fail; import net.sf.samtools.SAMFileReader; import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.datasources.shards.ShardStrategy; import org.broadinstitute.sting.gatk.datasources.shards.ShardStrategyFactory; diff --git a/java/test/org/broadinstitute/sting/playground/gatk/walkers/indels/CleanedReadInjectorTest.java b/java/test/org/broadinstitute/sting/playground/gatk/walkers/indels/CleanedReadInjectorTest.java new file mode 100644 index 000000000..faab0d1ef --- /dev/null +++ b/java/test/org/broadinstitute/sting/playground/gatk/walkers/indels/CleanedReadInjectorTest.java @@ -0,0 +1,255 @@ +package org.broadinstitute.sting.playground.gatk.walkers.indels; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.OutputTracker; +import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.StingException; +import org.broadinstitute.sting.utils.sam.ArtificialSAMFileReader; +import org.broadinstitute.sting.utils.sam.ArtificialSAMFileWriter; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.junit.BeforeClass; +import org.junit.Test; + +import java.io.FileNotFoundException; +import java.io.File; + +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMRecord; +import junit.framework.Assert; +/** + * User: hanna + * Date: Jun 11, 2009 + * Time: 9:54:05 AM + * BROAD INSTITUTE SOFTWARE COPYRIGHT NOTICE AND AGREEMENT + * Software and documentation are copyright 2005 by the Broad Institute. + * All rights are reserved. + * + * Users acknowledge that this software is supplied without any warranty or support. + * The Broad Institute is not responsible for its use, misuse, or + * functionality. + */ + +/** + * Test the walker that injects clean reads into the bam file. + */ + +public class CleanedReadInjectorTest extends BaseTest { + /** + * The fasta, for comparison. + */ + protected static IndexedFastaSequenceFile sequenceFile = null; + + /** + * Initialize the fasta. + */ + @BeforeClass + public static void initialize() throws FileNotFoundException { + sequenceFile = new IndexedFastaSequenceFile( new File(seqLocation + "/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta") ); + GenomeLoc.setupRefContigOrdering(sequenceFile); + } + + @Test + public void testNoReads() { + ArtificialSAMFileReader cleanedReads = new ArtificialSAMFileReader(); + ArtificialSAMFileWriter output = new ArtificialSAMFileWriter(); + CleanedReadInjector walker = createWalker( "chr1:1-10", cleanedReads, output ); + + walker.initialize(); + walker.onTraversalDone(0); + + Assert.assertEquals("Too many records in output",0,output.getRecords().size()); + } + + @Test + public void testNoCleanedReads() { + SAMFileHeader header = getMockSAMFileHeader(); + SAMRecord sourceRead = ArtificialSAMUtils.createArtificialRead(header,"read1",0,1,5); + + ArtificialSAMFileReader cleanedReads = new ArtificialSAMFileReader(); + ArtificialSAMFileWriter output = new ArtificialSAMFileWriter(); + CleanedReadInjector walker = createWalker( "chr1:1-10", cleanedReads, output ); + + int result = runWalkerOverReads(walker,sourceRead); + + Assert.assertEquals("Result of traversal is incorrect",0,result); + Assert.assertEquals("Incorrect number of records in output",1,output.getRecords().size()); + Assert.assertEquals("Output record is incorrect",sourceRead,output.getRecords().get(0)); + } + + @Test + public void testOneCleanedRead() { + SAMFileHeader header = getMockSAMFileHeader(); + SAMRecord sourceRead = ArtificialSAMUtils.createArtificialRead(header,"read1",1,1,5); + + SAMRecord cleanedRead = ArtificialSAMUtils.createArtificialRead(header,"read1",1,1,5); + cleanedRead.setReadName(CleanedReadInjector.getUniquifiedReadName(sourceRead)); + cleanedRead.setReadString("AAAAA"); + ArtificialSAMFileReader cleanedReads = new ArtificialSAMFileReader(cleanedRead); + + ArtificialSAMFileWriter output = new ArtificialSAMFileWriter(); + CleanedReadInjector walker = createWalker( "chr1:1-10", cleanedReads, output ); + + int result = runWalkerOverReads(walker,sourceRead); + + Assert.assertEquals("Result of traversal is incorrect",1,result); + Assert.assertEquals("Incorrect number of records in output",1,output.getRecords().size()); + Assert.assertEquals("Output record is incorrect",cleanedRead,output.getRecords().get(0)); + } + + @Test + public void testPartialIntervalOverlap() { + SAMFileHeader header = getMockSAMFileHeader(); + SAMRecord sourceRead = ArtificialSAMUtils.createArtificialRead(header,"read1",1,1,5); + + SAMRecord cleanedRead = ArtificialSAMUtils.createArtificialRead(header,"read1",1,1,5); + cleanedRead.setReadName(CleanedReadInjector.getUniquifiedReadName(sourceRead)); + cleanedRead.setReadString("AAAAA"); + ArtificialSAMFileReader cleanedReads = new ArtificialSAMFileReader(cleanedRead); + + ArtificialSAMFileWriter output = new ArtificialSAMFileWriter(); + CleanedReadInjector walker = createWalker( "chr1:4-12", cleanedReads, output ); + + int result = runWalkerOverReads(walker,sourceRead); + + Assert.assertEquals("Result of traversal is incorrect",1,result); + Assert.assertEquals("Incorrect number of records in output",1,output.getRecords().size()); + Assert.assertEquals("Output record is incorrect",cleanedRead,output.getRecords().get(0)); + } + + @Test + public void testMixedCleanedAndUncleanedNonOverlapping() { + SAMFileHeader header = getMockSAMFileHeader(); + SAMRecord[] sourceReads = new SAMRecord[] { ArtificialSAMUtils.createArtificialRead(header,"read1",1,1,5), + ArtificialSAMUtils.createArtificialRead(header,"read2",1,2,5), + ArtificialSAMUtils.createArtificialRead(header,"read3",1,3,5), + ArtificialSAMUtils.createArtificialRead(header,"read4",1,4,5), + ArtificialSAMUtils.createArtificialRead(header,"read5",1,5,5) }; + + SAMRecord cleanedRead = ArtificialSAMUtils.createArtificialRead(header,"read1",1,1,1); + cleanedRead.setReadName(CleanedReadInjector.getUniquifiedReadName(sourceReads[0])); + cleanedRead.setReadString("AAAAA"); + ArtificialSAMFileReader cleanedReads = new ArtificialSAMFileReader(cleanedRead); + + ArtificialSAMFileWriter output = new ArtificialSAMFileWriter(); + CleanedReadInjector walker = createWalker( "chr1:1-10", cleanedReads, output ); + + int result = runWalkerOverReads(walker,sourceReads); + + Assert.assertEquals("Result of traversal is incorrect",1,result); + Assert.assertEquals("Incorrect number of records in output",5,output.getRecords().size()); + for( int i = 0; i < sourceReads.length; i++ ) { + if( i == 0 ) + Assert.assertEquals("Output record is incorrect",cleanedRead,output.getRecords().get(i)); + else + Assert.assertEquals("Output record is incorrect",sourceReads[i],output.getRecords().get(i)); + } + } + + @Test + public void testAlternateReadOrder() { + SAMFileHeader header = getMockSAMFileHeader(); + SAMRecord[] sourceReads = new SAMRecord[] { ArtificialSAMUtils.createArtificialRead(header,"read1",1,1,5), + ArtificialSAMUtils.createArtificialRead(header,"read2",1,2,5), + ArtificialSAMUtils.createArtificialRead(header,"read3",1,3,5) }; + + SAMRecord cleanedRead = ArtificialSAMUtils.createArtificialRead(header,"read1",1,3,1); + cleanedRead.setReadName(CleanedReadInjector.getUniquifiedReadName(sourceReads[0])); + cleanedRead.setReadString("AAAAA"); + ArtificialSAMFileReader cleanedReads = new ArtificialSAMFileReader(cleanedRead); + + ArtificialSAMFileWriter output = new ArtificialSAMFileWriter(); + CleanedReadInjector walker = createWalker( "chr1:1-10", cleanedReads, output ); + + int result = runWalkerOverReads(walker,sourceReads); + + Assert.assertEquals("Result of traversal is incorrect",1,result); + Assert.assertEquals("Incorrect number of records in output",3,output.getRecords().size()); + Assert.assertEquals("Incorrect read at position 1",cleanedRead,output.getRecords().get(0)); + Assert.assertEquals("Incorrect read at position 2",sourceReads[1],output.getRecords().get(1)); + Assert.assertEquals("Incorrect read at position 3",sourceReads[2],output.getRecords().get(2)); + } + + @Test + public void testReadOutsideInterval() { + SAMFileHeader header = getMockSAMFileHeader(); + SAMRecord sourceRead = ArtificialSAMUtils.createArtificialRead(header,"read1",1,1,5); + + SAMRecord cleanedRead = ArtificialSAMUtils.createArtificialRead(header,"read1",1,1,5); + cleanedRead.setReadName(CleanedReadInjector.getUniquifiedReadName(sourceRead)); + cleanedRead.setReadString("AAAAA"); + ArtificialSAMFileReader cleanedReads = new ArtificialSAMFileReader(cleanedRead); + + ArtificialSAMFileWriter output = new ArtificialSAMFileWriter(); + CleanedReadInjector walker = createWalker( "chr1:20-50", cleanedReads, output ); + + int result = runWalkerOverReads( walker, sourceRead ); + + Assert.assertEquals("Result of traversal is incorrect",0,result); + Assert.assertEquals("Incorrect number of records in output",1,output.getRecords().size()); + Assert.assertEquals("Output record is incorrect",sourceRead,output.getRecords().get(0)); + } + + @Test + public void testMultipleIntervals() { + SAMFileHeader header = getMockSAMFileHeader(); + SAMRecord[] sourceReads = new SAMRecord[] { ArtificialSAMUtils.createArtificialRead(header,"read1",1,1,10), + ArtificialSAMUtils.createArtificialRead(header,"read2",1,11,10), + ArtificialSAMUtils.createArtificialRead(header,"read3",1,21,10), + ArtificialSAMUtils.createArtificialRead(header,"read4",1,31,10), + ArtificialSAMUtils.createArtificialRead(header,"read5",1,41,10) }; + SAMRecord[] cleanedReads = new SAMRecord[sourceReads.length]; + for( int i = 0; i < sourceReads.length; i++ ) { + try { + cleanedReads[i] = (SAMRecord)sourceReads[i].clone(); + cleanedReads[i].setReadName(CleanedReadInjector.getUniquifiedReadName(sourceReads[i])); + cleanedReads[i].setReadString("AAAAA"); + } + catch( CloneNotSupportedException ex ) { + throw new StingException("Unable to clone samrecord", ex); + } + } + + ArtificialSAMFileReader cleanedReader = new ArtificialSAMFileReader(cleanedReads); + + ArtificialSAMFileWriter output = new ArtificialSAMFileWriter(); + CleanedReadInjector walker = createWalker( "chr1:11-20;chr1:31-40", cleanedReader, output ); + int result = runWalkerOverReads( walker, sourceReads ); + + Assert.assertEquals("Result of traversal is incorrect",2,result); + Assert.assertEquals("Incorrect number of records in output",5,output.getRecords().size()); + Assert.assertEquals("Incorrect read at position 1",sourceReads[0],output.getRecords().get(0)); + Assert.assertEquals("Incorrect read at position 2",cleanedReads[1],output.getRecords().get(1)); + Assert.assertEquals("Incorrect read at position 3",sourceReads[2],output.getRecords().get(2)); + Assert.assertEquals("Incorrect read at position 4",cleanedReads[3],output.getRecords().get(3)); + Assert.assertEquals("Incorrect read at position 5",sourceReads[4],output.getRecords().get(4)); + } + + private CleanedReadInjector createWalker( String intervals, ArtificialSAMFileReader cleanedReads, ArtificialSAMFileWriter output ) { + CleanedReadInjector walker = new CleanedReadInjector(); + + walker.intervalsSource = intervals; + walker.cleanedReadsSource = cleanedReads; + walker.outputBAM = output; + + walker.initializeOutputStreams( new OutputTracker(null,null) ); + + return walker; + } + + private SAMFileHeader getMockSAMFileHeader() { + return ArtificialSAMUtils.createArtificialSamHeader(2,0,247249719); + } + + private Integer runWalkerOverReads( CleanedReadInjector walker, SAMRecord... reads ) { + walker.initialize(); + Integer accum = walker.reduceInit(); + for( SAMRecord sourceRead: reads ) { + Integer value = walker.map( null, sourceRead ); + accum = walker.reduce(value,accum); + } + walker.onTraversalDone(accum); + return accum; + } + +} diff --git a/java/test/org/broadinstitute/sting/utils/sam/ArtificialSAMFileWriterTest.java b/java/test/org/broadinstitute/sting/utils/sam/ArtificialSAMFileWriterTest.java index 006d4e12a..ef6a42472 100644 --- a/java/test/org/broadinstitute/sting/utils/sam/ArtificialSAMFileWriterTest.java +++ b/java/test/org/broadinstitute/sting/utils/sam/ArtificialSAMFileWriterTest.java @@ -61,7 +61,7 @@ public class ArtificialSAMFileWriterTest extends BaseTest { @Test public void testBasicCount() { - for (int x = 0; x < 100; x++) { + for (int x = 1; x <= 100; x++) { SAMRecord rec = ArtificialSAMUtils.createArtificialRead(header, String.valueOf(x), 1, x, ArtificialSAMUtils.DEFAULT_READ_LENGTH); writer.addAlignment(rec); } @@ -73,7 +73,7 @@ public class ArtificialSAMFileWriterTest extends BaseTest { public void testReadName() { List names = new ArrayList(); - for (int x = 0; x < 100; x++) { + for (int x = 1; x <= 100; x++) { names.add(String.valueOf(x)); SAMRecord rec = ArtificialSAMUtils.createArtificialRead(header, String.valueOf(x), 1, x, ArtificialSAMUtils.DEFAULT_READ_LENGTH); writer.addAlignment(rec);