From dde52e33eb7523e147d2df0eb60b702643317f40 Mon Sep 17 00:00:00 2001 From: hanna Date: Fri, 19 Jun 2009 22:04:47 +0000 Subject: [PATCH] Cleanup of the cleaned read injector based on Eric's feedback. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1062 348d0f76-0448-11de-a6fe-93d51630548a --- .../gatk/traversals/TraversalEngine.java | 11 ++++++++ .../traversals/TraverseByLocusWindows.java | 26 +++++++++++++++--- .../walkers/indels/CleanedReadInjector.java | 25 ++++++++++++++--- .../indels/CleanedReadInjectorTest.java | 27 ++++++++++--------- 4 files changed, 68 insertions(+), 21 deletions(-) diff --git a/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java b/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java index 8a162f6a7..def577bb7 100755 --- a/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java +++ b/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java @@ -7,6 +7,7 @@ import net.sf.samtools.SAMFileHeader; import net.sf.samtools.SAMFileReader; import net.sf.samtools.SAMFileReader.ValidationStringency; import net.sf.samtools.SAMRecord; +import net.sf.samtools.SAMReadGroupRecord; import net.sf.samtools.util.RuntimeIOException; import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.iterators.*; @@ -420,6 +421,16 @@ public abstract class TraversalEngine { final SAMFileHeader header = samReader.getFileHeader(); logger.debug(String.format("Sort order is: " + header.getSortOrder())); + // Kludge filename into sam file header. + if (samReader.getFileHeader().getReadGroups().size() < 1) { + //logger.warn("Setting header in reader " + f.getName()); + SAMReadGroupRecord rec = new SAMReadGroupRecord(samFile.getName()); + rec.setLibrary(samFile.getName()); + rec.setSample(samFile.getName()); + + samReader.getFileHeader().addReadGroup(rec); + } + return samReader; } } diff --git a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByLocusWindows.java b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByLocusWindows.java index d41cfb2ac..f38c58eb8 100755 --- a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByLocusWindows.java +++ b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByLocusWindows.java @@ -3,10 +3,12 @@ package org.broadinstitute.sting.gatk.traversals; import org.broadinstitute.sting.gatk.walkers.LocusWindowWalker; import org.broadinstitute.sting.gatk.walkers.Walker; import org.broadinstitute.sting.gatk.LocusContext; +import org.broadinstitute.sting.gatk.Reads; import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData; import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.iterators.ReferenceIterator; +import org.broadinstitute.sting.gatk.iterators.MergingSamRecordIterator2; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.Utils; @@ -14,7 +16,10 @@ import java.util.*; import java.io.File; import net.sf.samtools.SAMRecord; +import net.sf.samtools.SAMFileReader; +import net.sf.samtools.SAMFileHeader; import net.sf.samtools.util.CloseableIterator; +import net.sf.picard.sam.SamFileHeaderMerger; /** * Created by IntelliJ IDEA. @@ -86,9 +91,7 @@ public class TraverseByLocusWindows extends TraversalEngine { for ( GenomeLoc interval : locations ) { logger.debug(String.format("Processing interval %s", interval.toString())); - CloseableIterator readIter = samReader.queryOverlapping( interval.getContig(), - (int)interval.getStart(), - (int)interval.getStop()); + CloseableIterator readIter = getIteratorOverDesiredRegion( samReader, interval ); Iterator wrappedIter = wrapReadsIterator(readIter, false); LocusContext locus = getLocusContext(wrappedIter, interval); readIter.close(); @@ -114,7 +117,7 @@ public class TraverseByLocusWindows extends TraversalEngine { GenomeLoc currentInterval = (locations.size() > 0 ? locations.get(0) : null); int locationsIndex = 0; ArrayList intervalReads = new ArrayList(); - Iterator readIter = samReader.iterator(); + Iterator readIter = getIteratorOverDesiredRegion(samReader,null); while (readIter.hasNext()) { TraversalStatistics.nRecords++; @@ -306,4 +309,19 @@ public class TraverseByLocusWindows extends TraversalEngine { //printProgress("intervals", interval.getLocation()); return sum; } + + /** + * Gets an iterator over the specified region. Uses a special iterator that dynamically adds a header to all + * read information. + * @param reader SAMFileReader to query. + * @param region Region to use. + * @return An iterator over the desired region. + */ + private CloseableIterator getIteratorOverDesiredRegion( SAMFileReader reader, GenomeLoc region ) { + SamFileHeaderMerger headerMerger = new SamFileHeaderMerger( Collections.singletonList(reader), SAMFileHeader.SortOrder.coordinate ); + MergingSamRecordIterator2 iterator = new MergingSamRecordIterator2( headerMerger, new Reads(readsFiles) ); + if( region != null ) + iterator.queryOverlapping( region.getContig(), (int)region.getStart(), (int)region.getStop() ); + return iterator; + } } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/indels/CleanedReadInjector.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/indels/CleanedReadInjector.java index 9469f3f13..791aeafce 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/indels/CleanedReadInjector.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/indels/CleanedReadInjector.java @@ -5,6 +5,7 @@ import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.utils.cmdLine.Argument; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.GenomeLocSortedSet; import net.sf.samtools.SAMRecord; import net.sf.samtools.SAMFileReader; import net.sf.samtools.SAMFileWriter; @@ -14,7 +15,8 @@ import java.util.Map; import java.util.HashMap; import java.util.Queue; import java.util.LinkedList; -import java.io.File; +import java.util.ArrayList; +import java.util.List; /** * User: hanna * Date: Jun 10, 2009 @@ -74,7 +76,7 @@ public class CleanedReadInjector extends ReadWalker { @Override public void initialize() { - intervals = new LinkedList( GenomeAnalysisEngine.parseIntervalRegion(intervalsSource,false) ); + intervals = parseIntervals( intervalsSource ); interval = intervals.remove(); loadCleanedReadsOverlappingInterval( interval ); @@ -144,6 +146,21 @@ public class CleanedReadInjector extends ReadWalker { outputBAM.close(); } + /** + * Load the intervals directly from the command-line or from file, as appropriate. + * Merge overlapping intervals. + * @param intervalsSource Source of intervals. + * @return a queue of sorted, merged intervals. + */ + private Queue parseIntervals( String intervalsSource ) { + List parsedIntervals = GenomeAnalysisEngine.parseIntervalRegion(intervalsSource,false); + GenomeLocSortedSet intervalSortedSet = new GenomeLocSortedSet(); + for( GenomeLoc parsedInterval: parsedIntervals ) + intervalSortedSet.addRegion(parsedInterval); + + return new LinkedList( intervalSortedSet ); + } + /** * Load a list of all the reads overlapping the given interval into memory. * @param interval @@ -153,7 +170,7 @@ public class CleanedReadInjector extends ReadWalker { // Load in all reads mapped to this region. The cleaner will augment the read name in a way that uniquifies it. while( overlappingReads.hasNext() ) { SAMRecord read = overlappingReads.next(); - cleanedReads.put( read.getReadName(), read ); + cleanedReads.put( getUniquifiedReadName(read), read ); } overlappingReads.close(); } @@ -163,7 +180,7 @@ public class CleanedReadInjector extends ReadWalker { * @param read read to uniquify. * @return A (hopefully) completely unique name for the read. */ - static String getUniquifiedReadName( SAMRecord read ) { + private static String getUniquifiedReadName( SAMRecord read ) { return String.format("%s.%s.%s.%s",read.getAttribute("RG"),read.getReadName(),read.getFlags(),read.getReadString()); } } diff --git a/java/test/org/broadinstitute/sting/playground/gatk/walkers/indels/CleanedReadInjectorTest.java b/java/test/org/broadinstitute/sting/playground/gatk/walkers/indels/CleanedReadInjectorTest.java index faab0d1ef..0a3129a8e 100644 --- a/java/test/org/broadinstitute/sting/playground/gatk/walkers/indels/CleanedReadInjectorTest.java +++ b/java/test/org/broadinstitute/sting/playground/gatk/walkers/indels/CleanedReadInjectorTest.java @@ -13,6 +13,7 @@ import org.junit.Test; import java.io.FileNotFoundException; import java.io.File; +import java.util.Arrays; import net.sf.samtools.SAMFileHeader; import net.sf.samtools.SAMRecord; @@ -39,7 +40,7 @@ public class CleanedReadInjectorTest extends BaseTest { * The fasta, for comparison. */ protected static IndexedFastaSequenceFile sequenceFile = null; - + /** * Initialize the fasta. */ @@ -83,8 +84,7 @@ public class CleanedReadInjectorTest extends BaseTest { SAMRecord sourceRead = ArtificialSAMUtils.createArtificialRead(header,"read1",1,1,5); SAMRecord cleanedRead = ArtificialSAMUtils.createArtificialRead(header,"read1",1,1,5); - cleanedRead.setReadName(CleanedReadInjector.getUniquifiedReadName(sourceRead)); - cleanedRead.setReadString("AAAAA"); + cleanedRead.setBaseQualities(getMockBaseQualityString((byte)1,cleanedRead.getReadLength())); ArtificialSAMFileReader cleanedReads = new ArtificialSAMFileReader(cleanedRead); ArtificialSAMFileWriter output = new ArtificialSAMFileWriter(); @@ -103,8 +103,7 @@ public class CleanedReadInjectorTest extends BaseTest { SAMRecord sourceRead = ArtificialSAMUtils.createArtificialRead(header,"read1",1,1,5); SAMRecord cleanedRead = ArtificialSAMUtils.createArtificialRead(header,"read1",1,1,5); - cleanedRead.setReadName(CleanedReadInjector.getUniquifiedReadName(sourceRead)); - cleanedRead.setReadString("AAAAA"); + cleanedRead.setBaseQualities(getMockBaseQualityString((byte)1,cleanedRead.getReadLength())); ArtificialSAMFileReader cleanedReads = new ArtificialSAMFileReader(cleanedRead); ArtificialSAMFileWriter output = new ArtificialSAMFileWriter(); @@ -127,8 +126,7 @@ public class CleanedReadInjectorTest extends BaseTest { ArtificialSAMUtils.createArtificialRead(header,"read5",1,5,5) }; SAMRecord cleanedRead = ArtificialSAMUtils.createArtificialRead(header,"read1",1,1,1); - cleanedRead.setReadName(CleanedReadInjector.getUniquifiedReadName(sourceReads[0])); - cleanedRead.setReadString("AAAAA"); + cleanedRead.setBaseQualities(getMockBaseQualityString((byte)1,cleanedRead.getReadLength())); ArtificialSAMFileReader cleanedReads = new ArtificialSAMFileReader(cleanedRead); ArtificialSAMFileWriter output = new ArtificialSAMFileWriter(); @@ -154,8 +152,7 @@ public class CleanedReadInjectorTest extends BaseTest { ArtificialSAMUtils.createArtificialRead(header,"read3",1,3,5) }; SAMRecord cleanedRead = ArtificialSAMUtils.createArtificialRead(header,"read1",1,3,1); - cleanedRead.setReadName(CleanedReadInjector.getUniquifiedReadName(sourceReads[0])); - cleanedRead.setReadString("AAAAA"); + cleanedRead.setBaseQualities(getMockBaseQualityString((byte)1,cleanedRead.getReadLength())); ArtificialSAMFileReader cleanedReads = new ArtificialSAMFileReader(cleanedRead); ArtificialSAMFileWriter output = new ArtificialSAMFileWriter(); @@ -176,8 +173,7 @@ public class CleanedReadInjectorTest extends BaseTest { SAMRecord sourceRead = ArtificialSAMUtils.createArtificialRead(header,"read1",1,1,5); SAMRecord cleanedRead = ArtificialSAMUtils.createArtificialRead(header,"read1",1,1,5); - cleanedRead.setReadName(CleanedReadInjector.getUniquifiedReadName(sourceRead)); - cleanedRead.setReadString("AAAAA"); + cleanedRead.setBaseQualities(getMockBaseQualityString((byte)1,cleanedRead.getReadLength())); ArtificialSAMFileReader cleanedReads = new ArtificialSAMFileReader(cleanedRead); ArtificialSAMFileWriter output = new ArtificialSAMFileWriter(); @@ -202,8 +198,7 @@ public class CleanedReadInjectorTest extends BaseTest { for( int i = 0; i < sourceReads.length; i++ ) { try { cleanedReads[i] = (SAMRecord)sourceReads[i].clone(); - cleanedReads[i].setReadName(CleanedReadInjector.getUniquifiedReadName(sourceReads[i])); - cleanedReads[i].setReadString("AAAAA"); + cleanedReads[i].setBaseQualities(getMockBaseQualityString((byte)1,cleanedReads[i].getReadLength())); } catch( CloneNotSupportedException ex ) { throw new StingException("Unable to clone samrecord", ex); @@ -252,4 +247,10 @@ public class CleanedReadInjectorTest extends BaseTest { return accum; } + private byte[] getMockBaseQualityString( byte value, int length ) { + byte[] baseQualities = new byte[length]; + Arrays.fill(baseQualities,value); + return baseQualities; + } + }