LocusIteratorByState operates natively on GATKSAMRecords now

-- Updated code to reflect this new typing
This commit is contained in:
Mark DePristo 2013-01-11 14:01:02 -05:00
parent 94cb50d3d6
commit e88dae2758
14 changed files with 339 additions and 198 deletions

View File

@ -29,12 +29,14 @@ import net.sf.picard.util.PeekableIterator;
import org.broadinstitute.sting.gatk.ReadProperties;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.datasources.reads.Shard;
import org.broadinstitute.sting.gatk.iterators.GATKSAMIterator;
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.locusiterator.LocusIterator;
import org.broadinstitute.sting.utils.locusiterator.LocusIteratorByState;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.util.Collection;
import java.util.Iterator;
@ -70,7 +72,7 @@ public class WindowMaker implements Iterable<WindowMaker.WindowMakerIterator>, I
/**
* Hold the read iterator so that it can be closed later.
*/
private final StingSAMIterator readIterator;
private final GATKSAMIterator readIterator;
/**
* The data source for reads. Will probably come directly from the BAM file.
@ -107,12 +109,12 @@ public class WindowMaker implements Iterable<WindowMaker.WindowMakerIterator>, I
public WindowMaker(Shard shard, GenomeLocParser genomeLocParser, StingSAMIterator iterator, List<GenomeLoc> intervals, Collection<String> sampleNames) {
this.sourceInfo = shard.getReadProperties();
this.readIterator = iterator;
this.readIterator = new GATKSAMIterator(iterator);
// Use the legacy version of LocusIteratorByState if legacy downsampling was requested:
if ( sourceInfo.getDownsamplingMethod().useLegacyDownsampler )
throw new IllegalArgumentException("legacy downsampler no longer supported in the window maker");
this.libs = new LocusIteratorByState(iterator,sourceInfo,genomeLocParser,sampleNames);
this.libs = new LocusIteratorByState(readIterator,sourceInfo,genomeLocParser,sampleNames);
this.sourceIterator = new PeekableIterator<AlignmentContext>(libs);
this.intervalIterator = intervals.size()>0 ? new PeekableIterator<GenomeLoc>(intervals.iterator()) : null;

View File

@ -0,0 +1,57 @@
/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.iterators;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.util.CloseableIterator;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.util.Iterator;
/**
* Temporarily hack to convert SAMRecords to GATKSAMRecords
*
* User: depristo
* Date: 1/11/13
* Time: 1:19 PM
*/
public class GATKSAMIterator implements CloseableIterator<GATKSAMRecord>, Iterable<GATKSAMRecord> {
final CloseableIterator<SAMRecord> it;
public GATKSAMIterator(final CloseableIterator<SAMRecord> it) {
this.it = it;
}
public GATKSAMIterator(final StingSAMIterator it) {
this.it = it;
}
@Override public boolean hasNext() { return it.hasNext(); }
@Override public GATKSAMRecord next() { return (GATKSAMRecord)it.next(); }
@Override public void remove() { it.remove(); }
@Override public void close() { it.close(); }
@Override public Iterator<GATKSAMRecord> iterator() { return this; }
}

View File

@ -84,9 +84,9 @@ public class TraverseActiveRegionsOptimized<M,T> extends TraverseActiveRegions<M
// Grab all the previously unseen reads from this pileup and add them to the massive read list
// Note that this must occur before we leave because we are outside the intervals because
// reads may occur outside our intervals but overlap them in the future
final Collection<SAMRecord> reads = locusView.getLIBS().transferReadsFromAllPreviousPileups();
for( final SAMRecord read : reads ) {
notifyOfCurrentPosition((GATKSAMRecord)read);
final Collection<GATKSAMRecord> reads = locusView.getLIBS().transferReadsFromAllPreviousPileups();
for( final GATKSAMRecord read : reads ) {
notifyOfCurrentPosition(read);
// most of the time maybeDuplicatedReads is empty
// TODO -- I believe that because of the ordering of reads that as soon as we don't find a read in the
// TODO -- potential list of duplicates we can clear the hashset

View File

@ -31,7 +31,6 @@ import com.google.java.contract.Requires;
import net.sf.samtools.Cigar;
import net.sf.samtools.CigarElement;
import net.sf.samtools.CigarOperator;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.exceptions.UserException;
@ -87,8 +86,8 @@ public class AlignmentStateMachine {
private int offsetIntoCurrentCigarElement;
@Requires({"read != null", "read.getAlignmentStart() != -1", "read.getCigar() != null"})
public AlignmentStateMachine(final SAMRecord read) {
this.read = (GATKSAMRecord)read;
public AlignmentStateMachine(final GATKSAMRecord read) {
this.read = read;
this.cigar = read.getCigar();
this.nCigarElements = cigar.numCigarElements();
initializeAsLeftEdge();
@ -110,7 +109,7 @@ public class AlignmentStateMachine {
* @return a non-null GATKSAMRecord
*/
@Ensures("result != null")
public SAMRecord getRead() {
public GATKSAMRecord getRead() {
return read;
}

View File

@ -0,0 +1,193 @@
/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.locusiterator;
import net.sf.picard.reference.IndexedFastaSequenceFile;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMFileReader;
import net.sf.samtools.SAMReadGroupRecord;
import net.sf.samtools.SAMRecordIterator;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.CommandLineProgram;
import org.broadinstitute.sting.commandline.Input;
import org.broadinstitute.sting.gatk.ReadProperties;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.iterators.GATKSAMIterator;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.GATKSamRecordFactory;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.*;
/**
* Caliper microbenchmark of fragment pileup
*/
public class LIBSPerformance extends CommandLineProgram {
private static Logger logger = Logger.getLogger(LIBSPerformance.class);
@Input(fullName = "input_file", shortName = "I", doc = "SAM or BAM file(s)", required = true)
public File samFile = null;
@Input(fullName = "reference_sequence", shortName = "R", doc = "Reference sequence file", required = true)
public File referenceFile = null;
@Argument(fullName = "L", shortName = "L", doc = "Query location", required = false)
public String location = null;
@Override
public int execute() throws IOException {
final IndexedFastaSequenceFile reference = new CachingIndexedFastaSequenceFile(referenceFile);
final GenomeLocParser genomeLocParser = new GenomeLocParser(reference);
final SAMFileReader reader = new SAMFileReader(samFile);
reader.setSAMRecordFactory(new GATKSamRecordFactory());
SAMRecordIterator rawIterator;
if ( location == null )
rawIterator = reader.iterator();
else {
final GenomeLoc loc = genomeLocParser.parseGenomeLoc(location);
rawIterator = reader.query(loc.getContig(), loc.getStart(), loc.getStop(), false);
}
final GATKSAMIterator iterator = new GATKSAMIterator(rawIterator);
final Set<String> samples = new HashSet<String>();
for ( final SAMReadGroupRecord rg : reader.getFileHeader().getReadGroups() )
samples.add(rg.getSample());
final LIBSDownsamplingInfo ds = new LIBSDownsamplingInfo(false, -1);
final LocusIteratorByState libs =
new LocusIteratorByState(
iterator,
ds,
true,
genomeLocParser,
samples,
false);
int bp = 0;
while ( libs.hasNext() ) {
AlignmentContext context = libs.next();
if ( ++bp % 100000 == 0 )
logger.info(bp + " iterations at " + context.getLocation());
}
return 0;
}
// private void syntheticTests() {
// final int readLength = 101;
// final int nReads = 10000;
// final int locus = 1;
//
// SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000);
// final GenomeLocParser genomeLocParser = new GenomeLocParser(header.getSequenceDictionary());
//
// int nIterations = 0;
// for ( final String cigar : Arrays.asList("101M", "50M10I40M", "50M10D40M") ) {
// GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read", 0, locus, readLength);
// read.setReadBases(Utils.dupBytes((byte) 'A', readLength));
// final byte[] quals = new byte[readLength];
// for ( int i = 0; i < readLength; i++ )
// quals[i] = (byte)(i % QualityUtils.MAX_QUAL_SCORE);
// read.setBaseQualities(quals);
// read.setCigarString(cigar);
//
// for ( int j = 0; j < nReads; j++ ) {
// for ( int i = 0; i < rep; i++ ) {
// switch ( op ) {
// case NEW_STATE:
// {
// final AlignmentStateMachine alignmentStateMachine = new AlignmentStateMachine(read);
// while ( alignmentStateMachine.stepForwardOnGenome() != null ) {
// nIterations++;
// }
// }
// break;
//// case OLD_STATE:
//// {
//// final SAMRecordAlignmentState alignmentStateMachine = new SAMRecordAlignmentState(read);
//// while ( alignmentStateMachine.stepForwardOnGenome() != null ) {
//// alignmentStateMachine.getRead();
//// nIterations++;
//// }
//// }
//// break;
// case NEW_LIBS:
// {
// final List<GATKSAMRecord> reads = Collections.nCopies(30, read);
// final org.broadinstitute.sting.utils.locusiterator.LocusIteratorByState libs =
// new org.broadinstitute.sting.utils.locusiterator.LocusIteratorByState(
// new LocusIteratorByStateBaseTest.FakeCloseableIterator<GATKSAMRecord>(reads.iterator()),
// LocusIteratorByStateBaseTest.createTestReadProperties(),
// genomeLocParser,
// LocusIteratorByState.sampleListForSAMWithoutReadGroups());
//
// while ( libs.hasNext() ) {
// AlignmentContext context = libs.next();
// }
// }
// }
// }
// }
// }
//
// System.out.printf("iterations %d%n", nIterations);
// }
/**
* Required main method implementation.
* @param argv Command-line argument text.
* @throws Exception on error.
*/
public static void main(String[] argv) throws Exception {
int returnCode = 0;
try {
LIBSPerformance instance = new LIBSPerformance();
start(instance, argv);
returnCode = 0;
} catch(Exception ex) {
returnCode = 1;
ex.printStackTrace();
throw ex;
} finally {
System.exit(returnCode);
}
}
}

View File

@ -28,7 +28,6 @@ package org.broadinstitute.sting.utils.locusiterator;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import net.sf.samtools.CigarOperator;
import net.sf.samtools.SAMRecord;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.ReadProperties;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
@ -51,7 +50,7 @@ import java.util.*;
*
* There are a few constraints on required and ensured by LIBS:
*
* -- Requires the Iterator<SAMRecord> to returns reads in coordinate sorted order, consistent with the ordering
* -- Requires the Iterator<GATKSAMRecord> to returns reads in coordinate sorted order, consistent with the ordering
* defined by the SAM file format. That that for performance reasons this constraint isn't actually enforced.
* The behavior of LIBS is undefined in the case where the reads are badly ordered.
* -- The reads in the ReadBackedPileup are themselves in the order of appearance of the reads from the iterator.
@ -126,7 +125,7 @@ public class LocusIteratorByState extends LocusIterator {
* list of samples may contain a null element, and all reads without read groups will
* be mapped to this null sample
*/
public LocusIteratorByState(final Iterator<SAMRecord> samIterator,
public LocusIteratorByState(final Iterator<GATKSAMRecord> samIterator,
final ReadProperties readInformation,
final GenomeLocParser genomeLocParser,
final Collection<String> samples) {
@ -151,7 +150,7 @@ public class LocusIteratorByState extends LocusIterator {
* be mapped to this null sample
* @param maintainUniqueReadsList if true, we will keep the unique reads from off the samIterator and make them
* available via the transferReadsFromAllPreviousPileups interface
*/ protected LocusIteratorByState(final Iterator<SAMRecord> samIterator,
*/ protected LocusIteratorByState(final Iterator<GATKSAMRecord> samIterator,
final LIBSDownsamplingInfo downsamplingInfo,
final boolean includeReadsWithDeletionAtLoci,
final GenomeLocParser genomeLocParser,
@ -310,7 +309,7 @@ public class LocusIteratorByState extends LocusIterator {
* of submitted reads, if enabled.
*
* The purpose of this function is allow users of LIBS to keep track of all of the reads pulled off the
* underlying SAMRecord iterator and that appeared at any point in the list of SAMRecordAlignmentState for
* underlying GATKSAMRecord iterator and that appeared at any point in the list of SAMRecordAlignmentState for
* any reads. This function is intended to allow users to efficiently reconstruct the unique set of reads
* used across all pileups. This is necessary for LIBS to handle because attempting to do
* so from the pileups coming out of LIBS is extremely expensive.
@ -322,7 +321,7 @@ public class LocusIteratorByState extends LocusIterator {
* @return the current list
*/
@Ensures("result != null")
public List<SAMRecord> transferReadsFromAllPreviousPileups() {
public List<GATKSAMRecord> transferReadsFromAllPreviousPileups() {
return readStates.transferSubmittedReads();
}
@ -331,7 +330,7 @@ public class LocusIteratorByState extends LocusIterator {
* @return a non-null list
*/
@Ensures("result != null")
protected List<SAMRecord> getReadsFromAllPreviousPileups() {
protected List<GATKSAMRecord> getReadsFromAllPreviousPileups() {
return readStates.getSubmittedReads();
}

View File

@ -28,9 +28,9 @@ package org.broadinstitute.sting.utils.locusiterator;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import net.sf.picard.util.PeekableIterator;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.downsampling.Downsampler;
import org.broadinstitute.sting.gatk.downsampling.LevelingDownsampler;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.util.*;
@ -50,30 +50,30 @@ import java.util.*;
*/
class ReadStateManager {
private final List<String> samples;
private final PeekableIterator<SAMRecord> iterator;
private final SamplePartitioner samplePartitioner;
private final PeekableIterator<GATKSAMRecord> iterator;
private final SamplePartitioner<GATKSAMRecord> samplePartitioner;
private final Map<String, PerSampleReadStateManager> readStatesBySample = new HashMap<String, PerSampleReadStateManager>();
private LinkedList<SAMRecord> submittedReads;
private LinkedList<GATKSAMRecord> submittedReads;
private final boolean keepSubmittedReads;
private int totalReadStates = 0;
public ReadStateManager(final Iterator<SAMRecord> source,
public ReadStateManager(final Iterator<GATKSAMRecord> source,
final List<String> samples,
final LIBSDownsamplingInfo LIBSDownsamplingInfo,
final boolean keepSubmittedReads) {
this.samples = samples;
this.iterator = new PeekableIterator<SAMRecord>(source);
this.iterator = new PeekableIterator<GATKSAMRecord>(source);
this.keepSubmittedReads = keepSubmittedReads;
this.submittedReads = new LinkedList<SAMRecord>();
this.submittedReads = new LinkedList<GATKSAMRecord>();
for (final String sample : samples) {
readStatesBySample.put(sample, new PerSampleReadStateManager(LIBSDownsamplingInfo));
}
samplePartitioner = new SamplePartitioner(LIBSDownsamplingInfo, samples);
samplePartitioner = new SamplePartitioner<GATKSAMRecord>(LIBSDownsamplingInfo, samples);
}
/**
@ -138,12 +138,12 @@ class ReadStateManager {
}
// fast testing of position
private boolean readIsPastCurrentPosition(SAMRecord read) {
private boolean readIsPastCurrentPosition(GATKSAMRecord read) {
if (isEmpty())
return false;
else {
AlignmentStateMachine state = getFirst();
SAMRecord ourRead = state.getRead();
final AlignmentStateMachine state = getFirst();
final GATKSAMRecord ourRead = state.getRead();
return read.getReferenceIndex() > ourRead.getReferenceIndex() || read.getAlignmentStart() > state.getGenomePosition();
}
}
@ -172,7 +172,7 @@ class ReadStateManager {
samplePartitioner.doneSubmittingReads();
for (final String sample : samples) {
Collection<SAMRecord> newReads = samplePartitioner.getReadsForSample(sample);
final Collection<GATKSAMRecord> newReads = samplePartitioner.getReadsForSample(sample);
PerSampleReadStateManager statesBySample = readStatesBySample.get(sample);
addReadsToSample(statesBySample, newReads);
}
@ -185,7 +185,7 @@ class ReadStateManager {
* @param read a non-null read
*/
@Requires("read != null")
protected void submitRead(final SAMRecord read) {
protected void submitRead(final GATKSAMRecord read) {
if ( keepSubmittedReads )
submittedReads.add(read);
samplePartitioner.submitRead(read);
@ -213,11 +213,11 @@ class ReadStateManager {
"result != null",
"result != submittedReads" // result and previous submitted reads are not == objects
})
public List<SAMRecord> transferSubmittedReads() {
public List<GATKSAMRecord> transferSubmittedReads() {
if ( ! keepSubmittedReads ) throw new UnsupportedOperationException("cannot transferSubmittedReads if you aren't keeping them");
final List<SAMRecord> prevSubmittedReads = submittedReads;
this.submittedReads = new LinkedList<SAMRecord>();
final List<GATKSAMRecord> prevSubmittedReads = submittedReads;
this.submittedReads = new LinkedList<GATKSAMRecord>();
return prevSubmittedReads;
}
@ -244,7 +244,7 @@ class ReadStateManager {
* @return a non-null list of reads that have been submitted to this ReadStateManager
*/
@Ensures({"result != null","keepSubmittedReads || result.isEmpty()"})
protected List<SAMRecord> getSubmittedReads() {
protected List<GATKSAMRecord> getSubmittedReads() {
return submittedReads;
}
@ -254,13 +254,13 @@ class ReadStateManager {
* @param readStates The list of read states to add this collection of reads.
* @param reads Reads to add. Selected reads will be pulled from this source.
*/
private void addReadsToSample(final PerSampleReadStateManager readStates, final Collection<SAMRecord> reads) {
private void addReadsToSample(final PerSampleReadStateManager readStates, final Collection<GATKSAMRecord> reads) {
if (reads.isEmpty())
return;
Collection<AlignmentStateMachine> newReadStates = new LinkedList<AlignmentStateMachine>();
for (SAMRecord read : reads) {
for (GATKSAMRecord read : reads) {
AlignmentStateMachine state = new AlignmentStateMachine(read);
if ( state.stepForwardOnGenome() != null )
// explicitly filter out reads that are all insertions / soft clips

View File

@ -37,35 +37,35 @@ import java.util.*;
*
* Note: stores reads by sample ID string, not by sample object
*/
class SamplePartitioner {
private Map<String, Downsampler<SAMRecord>> readsBySample;
class SamplePartitioner<T extends SAMRecord> {
private Map<String, Downsampler<T>> readsBySample;
public SamplePartitioner(final LIBSDownsamplingInfo LIBSDownsamplingInfo, final List<String> samples) {
readsBySample = new HashMap<String, Downsampler<SAMRecord>>(samples.size());
readsBySample = new HashMap<String, Downsampler<T>>(samples.size());
for ( String sample : samples ) {
readsBySample.put(sample, createDownsampler(LIBSDownsamplingInfo));
}
}
private Downsampler<SAMRecord> createDownsampler(final LIBSDownsamplingInfo LIBSDownsamplingInfo) {
private Downsampler<T> createDownsampler(final LIBSDownsamplingInfo LIBSDownsamplingInfo) {
return LIBSDownsamplingInfo.isPerformDownsampling()
? new ReservoirDownsampler<SAMRecord>(LIBSDownsamplingInfo.getToCoverage())
: new PassThroughDownsampler<SAMRecord>();
? new ReservoirDownsampler<T>(LIBSDownsamplingInfo.getToCoverage())
: new PassThroughDownsampler<T>();
}
public void submitRead(SAMRecord read) {
public void submitRead(T read) {
String sampleName = read.getReadGroup() != null ? read.getReadGroup().getSample() : null;
if (readsBySample.containsKey(sampleName))
readsBySample.get(sampleName).submit(read);
}
public void doneSubmittingReads() {
for ( Map.Entry<String, Downsampler<SAMRecord>> perSampleReads : readsBySample.entrySet() ) {
for ( Map.Entry<String, Downsampler<T>> perSampleReads : readsBySample.entrySet() ) {
perSampleReads.getValue().signalEndOfInput();
}
}
public Collection<SAMRecord> getReadsForSample(String sampleName) {
public Collection<T> getReadsForSample(String sampleName) {
if ( ! readsBySample.containsKey(sampleName) )
throw new NoSuchElementException("Sample name not found");
@ -73,7 +73,7 @@ class SamplePartitioner {
}
public void reset() {
for ( Map.Entry<String, Downsampler<SAMRecord>> perSampleReads : readsBySample.entrySet() ) {
for ( Map.Entry<String, Downsampler<T>> perSampleReads : readsBySample.entrySet() ) {
perSampleReads.getValue().clear();
perSampleReads.getValue().reset();
}

View File

@ -335,13 +335,13 @@ public class ArtificialSAMUtils {
*
* @return a collection of stackSize reads all sharing the above properties
*/
public static List<SAMRecord> createReadStream( final int nReadsPerLocus,
public static List<GATKSAMRecord> createReadStream( final int nReadsPerLocus,
final int nLoci,
final SAMFileHeader header,
final int alignmentStart,
final int length ) {
final String baseName = "read";
List<SAMRecord> reads = new ArrayList<SAMRecord>(nReadsPerLocus*nLoci);
List<GATKSAMRecord> reads = new ArrayList<GATKSAMRecord>(nReadsPerLocus*nLoci);
for ( int locus = 0; locus < nLoci; locus++ ) {
for ( int readI = 0; readI < nReadsPerLocus; readI++ ) {
for ( final SAMReadGroupRecord rg : header.getReadGroups() ) {

View File

@ -1,110 +0,0 @@
/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.locusiterator;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
/**
* Caliper microbenchmark of fragment pileup
*/
public class AlignmentStateMachinePerformance {
final static int readLength = 101;
final static int nReads = 10000;
final static int locus = 1;
private enum Op {
NEW_STATE, OLD_STATE, NEW_LIBS
}
public static void main(String[] args) {
final int rep = Integer.valueOf(args[0]);
final Op op = Op.valueOf(args[1]);
SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000);
final GenomeLocParser genomeLocParser = new GenomeLocParser(header.getSequenceDictionary());
int nIterations = 0;
for ( final String cigar : Arrays.asList("101M", "50M10I40M", "50M10D40M") ) {
GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read", 0, locus, readLength);
read.setReadBases(Utils.dupBytes((byte) 'A', readLength));
final byte[] quals = new byte[readLength];
for ( int i = 0; i < readLength; i++ )
quals[i] = (byte)(i % QualityUtils.MAX_QUAL_SCORE);
read.setBaseQualities(quals);
read.setCigarString(cigar);
for ( int j = 0; j < nReads; j++ ) {
for ( int i = 0; i < rep; i++ ) {
switch ( op ) {
case NEW_STATE:
{
final AlignmentStateMachine alignmentStateMachine = new AlignmentStateMachine(read);
while ( alignmentStateMachine.stepForwardOnGenome() != null ) {
nIterations++;
}
}
break;
// case OLD_STATE:
// {
// final SAMRecordAlignmentState alignmentStateMachine = new SAMRecordAlignmentState(read);
// while ( alignmentStateMachine.stepForwardOnGenome() != null ) {
// alignmentStateMachine.getRead();
// nIterations++;
// }
// }
// break;
case NEW_LIBS:
{
final List<SAMRecord> reads = Collections.nCopies(30, (SAMRecord) read);
final org.broadinstitute.sting.utils.locusiterator.LocusIteratorByState libs =
new org.broadinstitute.sting.utils.locusiterator.LocusIteratorByState(
new LocusIteratorByStateBaseTest.FakeCloseableIterator<SAMRecord>(reads.iterator()),
LocusIteratorByStateBaseTest.createTestReadProperties(),
genomeLocParser,
LocusIteratorByState.sampleListForSAMWithoutReadGroups());
while ( libs.hasNext() ) {
AlignmentContext context = libs.next();
}
}
}
}
}
}
System.out.printf("iterations %d%n", nIterations);
}
}

View File

@ -46,7 +46,7 @@ public class LocusIteratorBenchmark extends SimpleBenchmark {
protected SAMFileHeader header;
protected GenomeLocParser genomeLocParser;
List<SAMRecord> reads = new LinkedList<SAMRecord>();
List<GATKSAMRecord> reads = new LinkedList<GATKSAMRecord>();
final int readLength = 101;
final int nReads = 10000;
final int locus = 1;
@ -104,7 +104,7 @@ public class LocusIteratorBenchmark extends SimpleBenchmark {
for ( int i = 0; i < rep; i++ ) {
final org.broadinstitute.sting.utils.locusiterator.LocusIteratorByState libs =
new org.broadinstitute.sting.utils.locusiterator.LocusIteratorByState(
new LocusIteratorByStateBaseTest.FakeCloseableIterator<SAMRecord>(reads.iterator()),
new LocusIteratorByStateBaseTest.FakeCloseableIterator<GATKSAMRecord>(reads.iterator()),
LocusIteratorByStateBaseTest.createTestReadProperties(),
genomeLocParser,
LocusIteratorByState.sampleListForSAMWithoutReadGroups());
@ -128,7 +128,7 @@ public class LocusIteratorBenchmark extends SimpleBenchmark {
public void timeAlignmentStateMachine(int rep) {
for ( int i = 0; i < rep; i++ ) {
for ( final SAMRecord read : reads ) {
for ( final GATKSAMRecord read : reads ) {
final AlignmentStateMachine alignmentStateMachine = new AlignmentStateMachine(read);
while ( alignmentStateMachine.stepForwardOnGenome() != null ) {
;

View File

@ -57,9 +57,9 @@ public class LocusIteratorByStateBaseTest extends BaseTest {
genomeLocParser = new GenomeLocParser(header.getSequenceDictionary());
}
protected LocusIteratorByState makeLTBS(List<SAMRecord> reads,
protected LocusIteratorByState makeLTBS(List<GATKSAMRecord> reads,
ReadProperties readAttributes) {
return new LocusIteratorByState(new FakeCloseableIterator<SAMRecord>(reads.iterator()),
return new LocusIteratorByState(new FakeCloseableIterator<GATKSAMRecord>(reads.iterator()),
readAttributes,
genomeLocParser,
LocusIteratorByState.sampleListForSAMWithoutReadGroups());
@ -85,7 +85,7 @@ public class LocusIteratorByStateBaseTest extends BaseTest {
keepReads);
}
protected static class FakeCloseableIterator<T> implements CloseableIterator<T> {
public static class FakeCloseableIterator<T> implements CloseableIterator<T> {
Iterator<T> iterator;
public FakeCloseableIterator(Iterator<T> it) {

View File

@ -61,27 +61,27 @@ public class LocusIteratorByStateUnitTest extends LocusIteratorByStateBaseTest {
// create a test version of the Reads object
ReadProperties readAttributes = createTestReadProperties();
SAMRecord r1 = ArtificialSAMUtils.createArtificialRead(header,"r1",0,1,10);
GATKSAMRecord r1 = ArtificialSAMUtils.createArtificialRead(header,"r1",0,1,10);
r1.setReadBases(bases1);
r1.setBaseQualities(new byte[] {20,20,20,20,20,20,20,20,20,20});
r1.setCigarString("10M");
SAMRecord r2 = ArtificialSAMUtils.createArtificialRead(header,"r2",0,1,10);
GATKSAMRecord r2 = ArtificialSAMUtils.createArtificialRead(header,"r2",0,1,10);
r2.setReadBases(bases2);
r2.setBaseQualities(new byte[] {20,20,20,20,20,20,20,20,20,20,20,20});
r2.setCigarString("3=1X5=1X");
SAMRecord r3 = ArtificialSAMUtils.createArtificialRead(header,"r3",0,1,10);
GATKSAMRecord r3 = ArtificialSAMUtils.createArtificialRead(header,"r3",0,1,10);
r3.setReadBases(bases2);
r3.setBaseQualities(new byte[] {20,20,20,20,20,20,20,20,20,20,20,20});
r3.setCigarString("3=1X5M1X");
SAMRecord r4 = ArtificialSAMUtils.createArtificialRead(header,"r4",0,1,10);
GATKSAMRecord r4 = ArtificialSAMUtils.createArtificialRead(header,"r4",0,1,10);
r4.setReadBases(bases2);
r4.setBaseQualities(new byte[] {20,20,20,20,20,20,20,20,20,20});
r4.setCigarString("10M");
List<SAMRecord> reads = Arrays.asList(r1, r2, r3, r4);
List<GATKSAMRecord> reads = Arrays.asList(r1, r2, r3, r4);
// create the iterator by state with the fake reads and fake records
li = makeLTBS(reads,readAttributes);
@ -101,22 +101,22 @@ public class LocusIteratorByStateUnitTest extends LocusIteratorByStateBaseTest {
// create a test version of the Reads object
ReadProperties readAttributes = createTestReadProperties();
SAMRecord before = ArtificialSAMUtils.createArtificialRead(header,"before",0,1,10);
GATKSAMRecord before = ArtificialSAMUtils.createArtificialRead(header,"before",0,1,10);
before.setReadBases(bases);
before.setBaseQualities(new byte[] {20,20,20,20,20,20,20,20,20,20});
before.setCigarString("10M");
SAMRecord during = ArtificialSAMUtils.createArtificialRead(header,"during",0,2,10);
GATKSAMRecord during = ArtificialSAMUtils.createArtificialRead(header,"during",0,2,10);
during.setReadBases(indelBases);
during.setBaseQualities(new byte[] {20,20,20,20,20,20,20,20,20,20,20,20});
during.setCigarString("4M2I6M");
SAMRecord after = ArtificialSAMUtils.createArtificialRead(header,"after",0,3,10);
GATKSAMRecord after = ArtificialSAMUtils.createArtificialRead(header,"after",0,3,10);
after.setReadBases(bases);
after.setBaseQualities(new byte[] {20,20,20,20,20,20,20,20,20,20});
after.setCigarString("10M");
List<SAMRecord> reads = Arrays.asList(before, during, after);
List<GATKSAMRecord> reads = Arrays.asList(before, during, after);
// create the iterator by state with the fake reads and fake records
li = makeLTBS(reads,readAttributes);
@ -146,12 +146,12 @@ public class LocusIteratorByStateUnitTest extends LocusIteratorByStateBaseTest {
// create a test version of the Reads object
ReadProperties readAttributes = createTestReadProperties();
SAMRecord indelOnlyRead = ArtificialSAMUtils.createArtificialRead(header, "indelOnly", 0, firstLocus, 76);
GATKSAMRecord indelOnlyRead = ArtificialSAMUtils.createArtificialRead(header, "indelOnly", 0, firstLocus, 76);
indelOnlyRead.setReadBases(Utils.dupBytes((byte)'A',76));
indelOnlyRead.setBaseQualities(Utils.dupBytes((byte) '@', 76));
indelOnlyRead.setCigarString("76I");
List<SAMRecord> reads = Arrays.asList(indelOnlyRead);
List<GATKSAMRecord> reads = Arrays.asList(indelOnlyRead);
// create the iterator by state with the fake reads and fake records
li = makeLTBS(reads, readAttributes);
@ -174,22 +174,22 @@ public class LocusIteratorByStateUnitTest extends LocusIteratorByStateBaseTest {
public void testWholeIndelRead() {
final int firstLocus = 44367788, secondLocus = firstLocus + 1;
SAMRecord leadingRead = ArtificialSAMUtils.createArtificialRead(header,"leading",0,firstLocus,76);
GATKSAMRecord leadingRead = ArtificialSAMUtils.createArtificialRead(header,"leading",0,firstLocus,76);
leadingRead.setReadBases(Utils.dupBytes((byte)'A',76));
leadingRead.setBaseQualities(Utils.dupBytes((byte)'@',76));
leadingRead.setCigarString("1M75I");
SAMRecord indelOnlyRead = ArtificialSAMUtils.createArtificialRead(header,"indelOnly",0,secondLocus,76);
GATKSAMRecord indelOnlyRead = ArtificialSAMUtils.createArtificialRead(header,"indelOnly",0,secondLocus,76);
indelOnlyRead.setReadBases(Utils.dupBytes((byte) 'A', 76));
indelOnlyRead.setBaseQualities(Utils.dupBytes((byte)'@',76));
indelOnlyRead.setCigarString("76I");
SAMRecord fullMatchAfterIndel = ArtificialSAMUtils.createArtificialRead(header,"fullMatch",0,secondLocus,76);
GATKSAMRecord fullMatchAfterIndel = ArtificialSAMUtils.createArtificialRead(header,"fullMatch",0,secondLocus,76);
fullMatchAfterIndel.setReadBases(Utils.dupBytes((byte)'A',76));
fullMatchAfterIndel.setBaseQualities(Utils.dupBytes((byte)'@',76));
fullMatchAfterIndel.setCigarString("75I1M");
List<SAMRecord> reads = Arrays.asList(leadingRead, indelOnlyRead, fullMatchAfterIndel);
List<GATKSAMRecord> reads = Arrays.asList(leadingRead, indelOnlyRead, fullMatchAfterIndel);
// create the iterator by state with the fake reads and fake records
li = makeLTBS(reads, createTestReadProperties());
@ -225,12 +225,12 @@ public class LocusIteratorByStateUnitTest extends LocusIteratorByStateBaseTest {
public void testWholeIndelReadRepresentedTest() {
final int firstLocus = 44367788, secondLocus = firstLocus + 1;
SAMRecord read1 = ArtificialSAMUtils.createArtificialRead(header,"read1",0,secondLocus,1);
GATKSAMRecord read1 = ArtificialSAMUtils.createArtificialRead(header,"read1",0,secondLocus,1);
read1.setReadBases(Utils.dupBytes((byte) 'A', 1));
read1.setBaseQualities(Utils.dupBytes((byte) '@', 1));
read1.setCigarString("1I");
List<SAMRecord> reads = Arrays.asList(read1);
List<GATKSAMRecord> reads = Arrays.asList(read1);
// create the iterator by state with the fake reads and fake records
li = makeLTBS(reads, createTestReadProperties());
@ -246,7 +246,7 @@ public class LocusIteratorByStateUnitTest extends LocusIteratorByStateBaseTest {
// Assert.assertEquals(pe.getBasesOfImmediatelyFollowingInsertion(), "A");
}
SAMRecord read2 = ArtificialSAMUtils.createArtificialRead(header,"read2",0,secondLocus,10);
GATKSAMRecord read2 = ArtificialSAMUtils.createArtificialRead(header,"read2",0,secondLocus,10);
read2.setReadBases(Utils.dupBytes((byte) 'A', 10));
read2.setBaseQualities(Utils.dupBytes((byte) '@', 10));
read2.setCigarString("10I");
@ -302,7 +302,7 @@ public class LocusIteratorByStateUnitTest extends LocusIteratorByStateBaseTest {
@Test(enabled = true && ! DEBUG, dataProvider = "IndelLengthAndBasesTest")
public void testIndelLengthAndBasesTest(GATKSAMRecord read, final CigarOperator op, final int eventSize, final String eventBases) {
// create the iterator by state with the fake reads and fake records
li = makeLTBS(Arrays.asList((SAMRecord)read), createTestReadProperties());
li = makeLTBS(Arrays.asList((GATKSAMRecord)read), createTestReadProperties());
Assert.assertTrue(li.hasNext());
@ -354,7 +354,7 @@ public class LocusIteratorByStateUnitTest extends LocusIteratorByStateBaseTest {
public void testLIBS(LIBSTest params) {
// create the iterator by state with the fake reads and fake records
final GATKSAMRecord read = params.makeRead();
li = makeLTBS(Arrays.asList((SAMRecord)read), createTestReadProperties());
li = makeLTBS(Arrays.asList((GATKSAMRecord)read), createTestReadProperties());
final LIBS_position tester = new LIBS_position(read);
int bpVisited = 0;
@ -458,14 +458,14 @@ public class LocusIteratorByStateUnitTest extends LocusIteratorByStateBaseTest {
final DownsamplingMethod downsampler = downsample
? new DownsamplingMethod(DownsampleType.BY_SAMPLE, maxDownsampledCoverage, null, false)
: new DownsamplingMethod(DownsampleType.NONE, null, null, false);
final List<SAMRecord> reads = ArtificialSAMUtils.createReadStream(nReadsPerLocus, nLoci, header, 1, readLength);
li = new LocusIteratorByState(new FakeCloseableIterator<SAMRecord>(reads.iterator()),
final List<GATKSAMRecord> reads = ArtificialSAMUtils.createReadStream(nReadsPerLocus, nLoci, header, 1, readLength);
li = new LocusIteratorByState(new FakeCloseableIterator<GATKSAMRecord>(reads.iterator()),
createTestReadProperties(downsampler, keepReads),
genomeLocParser,
samples);
final Set<SAMRecord> seenSoFar = new HashSet<SAMRecord>();
final Set<SAMRecord> keptReads = new HashSet<SAMRecord>();
final Set<GATKSAMRecord> seenSoFar = new HashSet<GATKSAMRecord>();
final Set<GATKSAMRecord> keptReads = new HashSet<GATKSAMRecord>();
int bpVisited = 0;
while ( li.hasNext() ) {
bpVisited++;
@ -482,11 +482,11 @@ public class LocusIteratorByStateUnitTest extends LocusIteratorByStateBaseTest {
seenSoFar.addAll(p.getReads());
if ( keepReads && grabReadsAfterEachCycle ) {
final List<SAMRecord> locusReads = li.transferReadsFromAllPreviousPileups();
final List<GATKSAMRecord> locusReads = li.transferReadsFromAllPreviousPileups();
// the number of reads starting here
int nReadsStartingHere = 0;
for ( final SAMRecord read : p.getReads() )
for ( final GATKSAMRecord read : p.getReads() )
if ( read.getAlignmentStart() == alignmentContext.getPosition() )
nReadsStartingHere++;
@ -499,7 +499,7 @@ public class LocusIteratorByStateUnitTest extends LocusIteratorByStateBaseTest {
keptReads.addAll(locusReads);
// check that all reads we've seen so far are in our keptReads
for ( final SAMRecord read : seenSoFar ) {
for ( final GATKSAMRecord read : seenSoFar ) {
Assert.assertTrue(keptReads.contains(read), "A read that appeared in a pileup wasn't found in the kept reads: " + read);
}
}
@ -524,8 +524,8 @@ public class LocusIteratorByStateUnitTest extends LocusIteratorByStateBaseTest {
// check that the order of reads is the same as in our read list
for ( int i = 0; i < reads.size(); i++ ) {
final SAMRecord inputRead = reads.get(i);
final SAMRecord keptRead = reads.get(i);
final GATKSAMRecord inputRead = reads.get(i);
final GATKSAMRecord keptRead = reads.get(i);
Assert.assertSame(keptRead, inputRead, "Input reads and kept reads differ at position " + i);
}
} else {
@ -534,13 +534,13 @@ public class LocusIteratorByStateUnitTest extends LocusIteratorByStateBaseTest {
// check uniqueness
final Set<String> readNames = new HashSet<String>();
for ( final SAMRecord read : keptReads ) {
for ( final GATKSAMRecord read : keptReads ) {
Assert.assertFalse(readNames.contains(read.getReadName()), "Found duplicate reads in the kept reads");
readNames.add(read.getReadName());
}
// check that all reads we've seen are in our keptReads
for ( final SAMRecord read : seenSoFar ) {
for ( final GATKSAMRecord read : seenSoFar ) {
Assert.assertTrue(keptReads.contains(read), "A read that appeared in a pileup wasn't found in the kept reads: " + read);
}
}

View File

@ -28,6 +28,7 @@ package org.broadinstitute.sting.utils.locusiterator;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
@ -63,7 +64,7 @@ public class ReadStateManagerUnitTest extends LocusIteratorByStateBaseTest {
public void run() {
final List<String> samples = LocusIteratorByState.sampleListForSAMWithoutReadGroups();
final Iterator<SAMRecord> iterator = new LinkedList<SAMRecord>().iterator();
final Iterator<GATKSAMRecord> iterator = new LinkedList<GATKSAMRecord>().iterator();
ReadStateManager readStateManager = new ReadStateManager(iterator, samples, LIBSDownsamplingInfo.NO_DOWNSAMPLING, false);
ReadStateManager.PerSampleReadStateManager perSampleReadStateManager = readStateManager.new PerSampleReadStateManager(LIBSDownsamplingInfo.NO_DOWNSAMPLING);
@ -146,10 +147,10 @@ public class ReadStateManagerUnitTest extends LocusIteratorByStateBaseTest {
int alignmentStart = 1;
for ( int readsThisStack : readCountsPerAlignmentStart ) {
ArrayList<SAMRecord> stackReads = new ArrayList<SAMRecord>(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(readsThisStack, header, "foo", 0, alignmentStart, MathUtils.randomIntegerInRange(50, 100)));
ArrayList<GATKSAMRecord> stackReads = new ArrayList<GATKSAMRecord>(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(readsThisStack, header, "foo", 0, alignmentStart, MathUtils.randomIntegerInRange(50, 100)));
ArrayList<AlignmentStateMachine> stackRecordStates = new ArrayList<AlignmentStateMachine>();
for ( SAMRecord read : stackReads ) {
for ( GATKSAMRecord read : stackReads ) {
stackRecordStates.add(new AlignmentStateMachine(read));
}