Cleanup of the cleaned read injector based on Eric's feedback.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1062 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
hanna 2009-06-19 22:04:47 +00:00
parent a0a3cf2f9f
commit dde52e33eb
4 changed files with 68 additions and 21 deletions

View File

@ -7,6 +7,7 @@ import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMFileReader;
import net.sf.samtools.SAMFileReader.ValidationStringency;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMReadGroupRecord;
import net.sf.samtools.util.RuntimeIOException;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.iterators.*;
@ -420,6 +421,16 @@ public abstract class TraversalEngine {
final SAMFileHeader header = samReader.getFileHeader();
logger.debug(String.format("Sort order is: " + header.getSortOrder()));
// Kludge filename into sam file header.
if (samReader.getFileHeader().getReadGroups().size() < 1) {
//logger.warn("Setting header in reader " + f.getName());
SAMReadGroupRecord rec = new SAMReadGroupRecord(samFile.getName());
rec.setLibrary(samFile.getName());
rec.setSample(samFile.getName());
samReader.getFileHeader().addReadGroup(rec);
}
return samReader;
}
}

View File

@ -3,10 +3,12 @@ package org.broadinstitute.sting.gatk.traversals;
import org.broadinstitute.sting.gatk.walkers.LocusWindowWalker;
import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.gatk.LocusContext;
import org.broadinstitute.sting.gatk.Reads;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.iterators.ReferenceIterator;
import org.broadinstitute.sting.gatk.iterators.MergingSamRecordIterator2;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.Utils;
@ -14,7 +16,10 @@ import java.util.*;
import java.io.File;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMFileReader;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.util.CloseableIterator;
import net.sf.picard.sam.SamFileHeaderMerger;
/**
* Created by IntelliJ IDEA.
@ -86,9 +91,7 @@ public class TraverseByLocusWindows extends TraversalEngine {
for ( GenomeLoc interval : locations ) {
logger.debug(String.format("Processing interval %s", interval.toString()));
CloseableIterator<SAMRecord> readIter = samReader.queryOverlapping( interval.getContig(),
(int)interval.getStart(),
(int)interval.getStop());
CloseableIterator<SAMRecord> readIter = getIteratorOverDesiredRegion( samReader, interval );
Iterator<SAMRecord> wrappedIter = wrapReadsIterator(readIter, false);
LocusContext locus = getLocusContext(wrappedIter, interval);
readIter.close();
@ -114,7 +117,7 @@ public class TraverseByLocusWindows extends TraversalEngine {
GenomeLoc currentInterval = (locations.size() > 0 ? locations.get(0) : null);
int locationsIndex = 0;
ArrayList<SAMRecord> intervalReads = new ArrayList<SAMRecord>();
Iterator<SAMRecord> readIter = samReader.iterator();
Iterator<SAMRecord> readIter = getIteratorOverDesiredRegion(samReader,null);
while (readIter.hasNext()) {
TraversalStatistics.nRecords++;
@ -306,4 +309,19 @@ public class TraverseByLocusWindows extends TraversalEngine {
//printProgress("intervals", interval.getLocation());
return sum;
}
/**
* Gets an iterator over the specified region. Uses a special iterator that dynamically adds a header to all
* read information.
* @param reader SAMFileReader to query.
* @param region Region to use.
* @return An iterator over the desired region.
*/
private CloseableIterator<SAMRecord> getIteratorOverDesiredRegion( SAMFileReader reader, GenomeLoc region ) {
SamFileHeaderMerger headerMerger = new SamFileHeaderMerger( Collections.singletonList(reader), SAMFileHeader.SortOrder.coordinate );
MergingSamRecordIterator2 iterator = new MergingSamRecordIterator2( headerMerger, new Reads(readsFiles) );
if( region != null )
iterator.queryOverlapping( region.getContig(), (int)region.getStart(), (int)region.getStop() );
return iterator;
}
}

View File

@ -5,6 +5,7 @@ import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMFileReader;
import net.sf.samtools.SAMFileWriter;
@ -14,7 +15,8 @@ import java.util.Map;
import java.util.HashMap;
import java.util.Queue;
import java.util.LinkedList;
import java.io.File;
import java.util.ArrayList;
import java.util.List;
/**
* User: hanna
* Date: Jun 10, 2009
@ -74,7 +76,7 @@ public class CleanedReadInjector extends ReadWalker<Integer,Integer> {
@Override
public void initialize() {
intervals = new LinkedList<GenomeLoc>( GenomeAnalysisEngine.parseIntervalRegion(intervalsSource,false) );
intervals = parseIntervals( intervalsSource );
interval = intervals.remove();
loadCleanedReadsOverlappingInterval( interval );
@ -144,6 +146,21 @@ public class CleanedReadInjector extends ReadWalker<Integer,Integer> {
outputBAM.close();
}
/**
* Load the intervals directly from the command-line or from file, as appropriate.
* Merge overlapping intervals.
* @param intervalsSource Source of intervals.
* @return a queue of sorted, merged intervals.
*/
private Queue parseIntervals( String intervalsSource ) {
List<GenomeLoc> parsedIntervals = GenomeAnalysisEngine.parseIntervalRegion(intervalsSource,false);
GenomeLocSortedSet intervalSortedSet = new GenomeLocSortedSet();
for( GenomeLoc parsedInterval: parsedIntervals )
intervalSortedSet.addRegion(parsedInterval);
return new LinkedList<GenomeLoc>( intervalSortedSet );
}
/**
* Load a list of all the reads overlapping the given interval into memory.
* @param interval
@ -153,7 +170,7 @@ public class CleanedReadInjector extends ReadWalker<Integer,Integer> {
// Load in all reads mapped to this region. The cleaner will augment the read name in a way that uniquifies it.
while( overlappingReads.hasNext() ) {
SAMRecord read = overlappingReads.next();
cleanedReads.put( read.getReadName(), read );
cleanedReads.put( getUniquifiedReadName(read), read );
}
overlappingReads.close();
}
@ -163,7 +180,7 @@ public class CleanedReadInjector extends ReadWalker<Integer,Integer> {
* @param read read to uniquify.
* @return A (hopefully) completely unique name for the read.
*/
static String getUniquifiedReadName( SAMRecord read ) {
private static String getUniquifiedReadName( SAMRecord read ) {
return String.format("%s.%s.%s.%s",read.getAttribute("RG"),read.getReadName(),read.getFlags(),read.getReadString());
}
}

View File

@ -13,6 +13,7 @@ import org.junit.Test;
import java.io.FileNotFoundException;
import java.io.File;
import java.util.Arrays;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMRecord;
@ -39,7 +40,7 @@ public class CleanedReadInjectorTest extends BaseTest {
* The fasta, for comparison.
*/
protected static IndexedFastaSequenceFile sequenceFile = null;
/**
* Initialize the fasta.
*/
@ -83,8 +84,7 @@ public class CleanedReadInjectorTest extends BaseTest {
SAMRecord sourceRead = ArtificialSAMUtils.createArtificialRead(header,"read1",1,1,5);
SAMRecord cleanedRead = ArtificialSAMUtils.createArtificialRead(header,"read1",1,1,5);
cleanedRead.setReadName(CleanedReadInjector.getUniquifiedReadName(sourceRead));
cleanedRead.setReadString("AAAAA");
cleanedRead.setBaseQualities(getMockBaseQualityString((byte)1,cleanedRead.getReadLength()));
ArtificialSAMFileReader cleanedReads = new ArtificialSAMFileReader(cleanedRead);
ArtificialSAMFileWriter output = new ArtificialSAMFileWriter();
@ -103,8 +103,7 @@ public class CleanedReadInjectorTest extends BaseTest {
SAMRecord sourceRead = ArtificialSAMUtils.createArtificialRead(header,"read1",1,1,5);
SAMRecord cleanedRead = ArtificialSAMUtils.createArtificialRead(header,"read1",1,1,5);
cleanedRead.setReadName(CleanedReadInjector.getUniquifiedReadName(sourceRead));
cleanedRead.setReadString("AAAAA");
cleanedRead.setBaseQualities(getMockBaseQualityString((byte)1,cleanedRead.getReadLength()));
ArtificialSAMFileReader cleanedReads = new ArtificialSAMFileReader(cleanedRead);
ArtificialSAMFileWriter output = new ArtificialSAMFileWriter();
@ -127,8 +126,7 @@ public class CleanedReadInjectorTest extends BaseTest {
ArtificialSAMUtils.createArtificialRead(header,"read5",1,5,5) };
SAMRecord cleanedRead = ArtificialSAMUtils.createArtificialRead(header,"read1",1,1,1);
cleanedRead.setReadName(CleanedReadInjector.getUniquifiedReadName(sourceReads[0]));
cleanedRead.setReadString("AAAAA");
cleanedRead.setBaseQualities(getMockBaseQualityString((byte)1,cleanedRead.getReadLength()));
ArtificialSAMFileReader cleanedReads = new ArtificialSAMFileReader(cleanedRead);
ArtificialSAMFileWriter output = new ArtificialSAMFileWriter();
@ -154,8 +152,7 @@ public class CleanedReadInjectorTest extends BaseTest {
ArtificialSAMUtils.createArtificialRead(header,"read3",1,3,5) };
SAMRecord cleanedRead = ArtificialSAMUtils.createArtificialRead(header,"read1",1,3,1);
cleanedRead.setReadName(CleanedReadInjector.getUniquifiedReadName(sourceReads[0]));
cleanedRead.setReadString("AAAAA");
cleanedRead.setBaseQualities(getMockBaseQualityString((byte)1,cleanedRead.getReadLength()));
ArtificialSAMFileReader cleanedReads = new ArtificialSAMFileReader(cleanedRead);
ArtificialSAMFileWriter output = new ArtificialSAMFileWriter();
@ -176,8 +173,7 @@ public class CleanedReadInjectorTest extends BaseTest {
SAMRecord sourceRead = ArtificialSAMUtils.createArtificialRead(header,"read1",1,1,5);
SAMRecord cleanedRead = ArtificialSAMUtils.createArtificialRead(header,"read1",1,1,5);
cleanedRead.setReadName(CleanedReadInjector.getUniquifiedReadName(sourceRead));
cleanedRead.setReadString("AAAAA");
cleanedRead.setBaseQualities(getMockBaseQualityString((byte)1,cleanedRead.getReadLength()));
ArtificialSAMFileReader cleanedReads = new ArtificialSAMFileReader(cleanedRead);
ArtificialSAMFileWriter output = new ArtificialSAMFileWriter();
@ -202,8 +198,7 @@ public class CleanedReadInjectorTest extends BaseTest {
for( int i = 0; i < sourceReads.length; i++ ) {
try {
cleanedReads[i] = (SAMRecord)sourceReads[i].clone();
cleanedReads[i].setReadName(CleanedReadInjector.getUniquifiedReadName(sourceReads[i]));
cleanedReads[i].setReadString("AAAAA");
cleanedReads[i].setBaseQualities(getMockBaseQualityString((byte)1,cleanedReads[i].getReadLength()));
}
catch( CloneNotSupportedException ex ) {
throw new StingException("Unable to clone samrecord", ex);
@ -252,4 +247,10 @@ public class CleanedReadInjectorTest extends BaseTest {
return accum;
}
private byte[] getMockBaseQualityString( byte value, int length ) {
byte[] baseQualities = new byte[length];
Arrays.fill(baseQualities,value);
return baseQualities;
}
}