Support for progress tracking during parsing of SAM files.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@20 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
49a84c965e
commit
46c3f1a0ac
|
|
@ -37,6 +37,7 @@ public class TraversalEngine {
|
||||||
private ReferenceSequenceFile refFile = null;
|
private ReferenceSequenceFile refFile = null;
|
||||||
private ReferenceIterator refIter = null;
|
private ReferenceIterator refIter = null;
|
||||||
private SAMFileReader readStream;
|
private SAMFileReader readStream;
|
||||||
|
private Iterator<SAMRecord> samReadIter = null;
|
||||||
|
|
||||||
private int nReads = 0;
|
private int nReads = 0;
|
||||||
private int nSkippedReads = 0;
|
private int nSkippedReads = 0;
|
||||||
|
|
@ -44,6 +45,7 @@ public class TraversalEngine {
|
||||||
private int nNotPrimary = 0;
|
private int nNotPrimary = 0;
|
||||||
private int nBadAlignments = 0;
|
private int nBadAlignments = 0;
|
||||||
private int nSkippedIndels = 0;
|
private int nSkippedIndels = 0;
|
||||||
|
private FileProgressTracker samReadingTracker = null;
|
||||||
|
|
||||||
public boolean DEBUGGING = false;
|
public boolean DEBUGGING = false;
|
||||||
|
|
||||||
|
|
@ -60,6 +62,31 @@ public class TraversalEngine {
|
||||||
this.rods = Arrays.asList(rods);
|
this.rods = Arrays.asList(rods);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
protected int initialize() {
|
||||||
|
startTime = System.currentTimeMillis();
|
||||||
|
loadReference();
|
||||||
|
//testReference();
|
||||||
|
//loadReference();
|
||||||
|
try {
|
||||||
|
final FileInputStream samFileStream = new FileInputStream(readsFile);
|
||||||
|
final InputStream bufferedStream= new BufferedInputStream(samFileStream);
|
||||||
|
//final InputStream bufferedStream= new BufferedInputStream(samInputStream, 10000000);
|
||||||
|
final SAMFileReader samReader = new SAMFileReader(bufferedStream, true);
|
||||||
|
samReader.setValidationStringency(strictness);
|
||||||
|
|
||||||
|
final SAMFileHeader header = samReader.getFileHeader();
|
||||||
|
System.err.println("Sort order is: " + header.getSortOrder());
|
||||||
|
|
||||||
|
samReadingTracker = new FileProgressTracker<SAMRecord>( readsFile, samReader.iterator(), samFileStream.getChannel(), 1000 );
|
||||||
|
samReadIter = samReadingTracker;
|
||||||
|
}
|
||||||
|
catch (IOException e) {
|
||||||
|
throw new RuntimeIOException(e);
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
public void setRegion(final String reg) { regionStr = regionStr; }
|
public void setRegion(final String reg) { regionStr = regionStr; }
|
||||||
public void setTraversalType(final String type) { traversalType = type; }
|
public void setTraversalType(final String type) { traversalType = type; }
|
||||||
public void setStrictness( final ValidationStringency s ) { strictness = s; }
|
public void setStrictness( final ValidationStringency s ) { strictness = s; }
|
||||||
|
|
@ -127,7 +154,10 @@ public class TraversalEngine {
|
||||||
if ( mustPrint || nRecords % 100000 == 0 ) {
|
if ( mustPrint || nRecords % 100000 == 0 ) {
|
||||||
final double elapsed = (System.currentTimeMillis() - startTime) / 1000.0;
|
final double elapsed = (System.currentTimeMillis() - startTime) / 1000.0;
|
||||||
final double secsPer1MReads = (elapsed * 1000000.0) / nRecords;
|
final double secsPer1MReads = (elapsed * 1000000.0) / nRecords;
|
||||||
|
|
||||||
System.out.printf("Traversed %d %s %.2f secs (%.2f secs per 1M %s)%n", nRecords, type, elapsed, secsPer1MReads, type);
|
System.out.printf("Traversed %d %s %.2f secs (%.2f secs per 1M %s)%n", nRecords, type, elapsed, secsPer1MReads, type);
|
||||||
|
|
||||||
|
System.out.printf(" -> %s%n", samReadingTracker.progressMeter());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -192,15 +222,6 @@ public class TraversalEngine {
|
||||||
// traversal by loci functions
|
// traversal by loci functions
|
||||||
//
|
//
|
||||||
// --------------------------------------------------------------------------------------------------------------
|
// --------------------------------------------------------------------------------------------------------------
|
||||||
protected int initialize() {
|
|
||||||
startTime = System.currentTimeMillis();
|
|
||||||
loadReference();
|
|
||||||
//testReference();
|
|
||||||
//loadReference();
|
|
||||||
readStream = initializeReadStreams();
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
class locusStreamFilterFunc implements SamRecordFilter {
|
class locusStreamFilterFunc implements SamRecordFilter {
|
||||||
public boolean filterOut(SAMRecord rec) {
|
public boolean filterOut(SAMRecord rec) {
|
||||||
boolean result = false;
|
boolean result = false;
|
||||||
|
|
@ -243,7 +264,7 @@ public class TraversalEngine {
|
||||||
|
|
||||||
protected <M,T> int traverseByLoci(LocusWalker<M,T> walker) {
|
protected <M,T> int traverseByLoci(LocusWalker<M,T> walker) {
|
||||||
walker.initialize();
|
walker.initialize();
|
||||||
FilteringIterator filterIter = new FilteringIterator(readStream.iterator(), new locusStreamFilterFunc());
|
FilteringIterator filterIter = new FilteringIterator(samReadIter, new locusStreamFilterFunc());
|
||||||
CloseableIterator<LocusIterator> iter = new LocusIterator(filterIter);
|
CloseableIterator<LocusIterator> iter = new LocusIterator(filterIter);
|
||||||
|
|
||||||
List<ReferenceOrderedData.RODIterator> rodIters = initializeRODs();
|
List<ReferenceOrderedData.RODIterator> rodIters = initializeRODs();
|
||||||
|
|
@ -301,14 +322,14 @@ public class TraversalEngine {
|
||||||
// --------------------------------------------------------------------------------------------------------------
|
// --------------------------------------------------------------------------------------------------------------
|
||||||
protected <M,R> int traverseByRead(ReadWalker<M,R> walker) {
|
protected <M,R> int traverseByRead(ReadWalker<M,R> walker) {
|
||||||
walker.initialize();
|
walker.initialize();
|
||||||
CloseableIterator<SAMRecord> iter = readStream.iterator();
|
|
||||||
R sum = walker.reduceInit();
|
R sum = walker.reduceInit();
|
||||||
boolean done = false;
|
boolean done = false;
|
||||||
while ( iter.hasNext() && ! done ) {
|
while ( samReadIter.hasNext() && ! done ) {
|
||||||
this.nRecords++;
|
this.nRecords++;
|
||||||
|
|
||||||
// actually get the read and hand it to the walker
|
// actually get the read and hand it to the walker
|
||||||
final SAMRecord read = iter.next();
|
final SAMRecord read = samReadIter.next();
|
||||||
GenomeLoc loc = new GenomeLoc(read.getReferenceName(), read.getAlignmentStart());
|
GenomeLoc loc = new GenomeLoc(read.getReferenceName(), read.getAlignmentStart());
|
||||||
|
|
||||||
if ( inLocations(loc) ) {
|
if ( inLocations(loc) ) {
|
||||||
|
|
@ -337,32 +358,4 @@ public class TraversalEngine {
|
||||||
walker.onTraveralDone();
|
walker.onTraveralDone();
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
//
|
|
||||||
//
|
|
||||||
// Prepare the input streams
|
|
||||||
//
|
|
||||||
//
|
|
||||||
private SAMFileReader initializeReadStreams() {
|
|
||||||
SAMFileReader reader = getSamReader(readsFile);
|
|
||||||
return reader;
|
|
||||||
}
|
|
||||||
|
|
||||||
private SAMFileReader getSamReader(final File samFile) {
|
|
||||||
try {
|
|
||||||
final InputStream samInputStream = new FileInputStream(samFile);
|
|
||||||
final InputStream bufferedStream= new BufferedInputStream(samInputStream);
|
|
||||||
//final InputStream bufferedStream= new BufferedInputStream(samInputStream, 10000000);
|
|
||||||
final SAMFileReader samReader = new SAMFileReader(bufferedStream, true);
|
|
||||||
samReader.setValidationStringency(strictness);
|
|
||||||
|
|
||||||
final SAMFileHeader header = samReader.getFileHeader();
|
|
||||||
System.err.println("Sort order is: " + header.getSortOrder());
|
|
||||||
|
|
||||||
return samReader;
|
|
||||||
}
|
|
||||||
catch (IOException e) {
|
|
||||||
throw new RuntimeIOException(e);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
@ -0,0 +1,176 @@
|
||||||
|
package edu.mit.broad.sting.utils;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.Iterator;
|
||||||
|
import java.nio.channels.FileChannel;
|
||||||
|
|
||||||
|
/**
 * Created by IntelliJ IDEA.
 * User: mdepristo
 * Date: Mar 2, 2009
 * Time: 2:25:18 PM
 *
 * Tracks the reading of a file composed of records of approximately equal size. It wraps
 * the record iterator, and every {@code samplingFrequency} records it samples the byte
 * position of the underlying file channel into a fixed-size ring buffer. From those samples
 * it estimates the average record size, the number of records in the file, the fraction of
 * the file processed and the time to completion.
 *
 * All estimates are best effort: while too little data has been seen they report 0 rather
 * than failing, so a progress printout can never abort the traversal it is reporting on.
 *
 * This class is not synchronized.
 *
 * @param <T> the record type produced by the wrapped iterator
 */
public class FileProgressTracker<T> implements Iterator<T> {
    private static final int DEFAULT_HISTORY_SIZE = 1000;

    /** Number of position samples retained in the ring buffer. */
    private final int historySize;
    /** A position sample is taken once per this many records. */
    private final int samplingFrequency = 1000;
    private final File file;
    private final FileChannel channel;
    /** Ring buffer of sampled channel positions; slot for sample k is k mod historySize. */
    private final ArrayList<Long> history;
    private final Iterator<T> it;
    private final long startTime;

    /** Number of records handed out by {@link #next()}. */
    private long nNexts = 0;
    /** Total number of samples taken so far (not wrapped). */
    private long historyI = 0;
    /** Last position successfully read from the channel; reused if the channel cannot be queried. */
    private long lastKnownPosition = 0;

    /**
     * @param file        the file being read; only its length is used
     * @param it          the record iterator to wrap
     * @param channel     channel of the stream the iterator reads from; its position is sampled
     * @param historySize number of position samples to retain, at least 1
     * @throws IllegalArgumentException if {@code historySize} is less than 1
     */
    public FileProgressTracker( File file, Iterator<T> it, FileChannel channel, int historySize ) {
        if ( historySize < 1 ) {
            throw new IllegalArgumentException("historySize must be at least 1, got " + historySize);
        }
        this.file = file;
        this.channel = channel;
        this.it = it;
        this.historySize = historySize;
        this.history = new ArrayList<Long>(Collections.nCopies(historySize, 0L));
        this.startTime = System.currentTimeMillis();
    }

    /** Same as the four-argument constructor with the default history size. */
    public FileProgressTracker( File file, Iterator<T> it, FileChannel channel ) {
        this(file, it, channel, DEFAULT_HISTORY_SIZE);
    }

    // -----------------------------------------------------------------
    //
    // iterator support
    //
    // -----------------------------------------------------------------
    public boolean hasNext() {
        return it.hasNext();
    }

    /**
     * Returns the next record from the wrapped iterator, sampling the file position
     * once every {@code samplingFrequency} records (including the very first one).
     */
    public T next() {
        final T record = it.next();
        if ( nNexts % samplingFrequency == 0 ) {
            inc();
        }
        nNexts++;
        return record;
    }

    public void remove() {
        it.remove();
    }

    /**
     * Records the current channel position as the newest sample, overwriting the oldest
     * one once the ring buffer is full. {@link #next()} calls this automatically; the size
     * estimates assume consecutive samples are {@code samplingFrequency} records apart.
     */
    public void inc() {
        history.set(historyIndex(), getPosition());
        historyI++;
    }

    /** @return number of records returned by {@link #next()} so far */
    public long nRecordsProcessed() {
        return nNexts;
    }

    /** @return seconds elapsed since this tracker was constructed */
    public double elapsedTimeInSecs() {
        return (System.currentTimeMillis() - startTime) / 1000.0;
    }

    /** @return ring-buffer slot that the next sample will be written to */
    public int historyIndex() {
        return historyIndex(historyI);
    }

    /**
     * Maps a sample number onto its ring-buffer slot.
     *
     * @param index sample number; may be negative
     * @return a slot in {@code [0, historySize)}
     */
    public int historyIndex(long index) {
        // Double modulo keeps the result non-negative for any negative index.
        return (int) (((index % historySize) + historySize) % historySize);
    }

    /**
     * Estimates the average record size in bytes.
     *
     * Uses the span between the oldest and newest retained samples when at least two
     * samples exist; otherwise falls back to current position divided by records read.
     *
     * @return the estimate, or 0 when nothing has been read yet
     */
    public int averageRecordSize() {
        final int nSamples = samplesRetained();
        if ( nSamples >= 2 ) {
            final long newest = history.get(historyIndex(historyI - 1));
            final long oldest = history.get(historyIndex(historyI - nSamples));
            final long recordsSpanned = (long) (nSamples - 1) * samplingFrequency;
            final long windowed = Math.round((double) (newest - oldest) / recordsSpanned);
            if ( windowed > 0 ) {
                return clampToInt(windowed);
            }
        }
        if ( nNexts > 0 ) {
            return clampToInt(Math.round((double) getPosition() / nNexts));
        }
        return 0;
    }

    /** @return records per second since construction, or 0 if no time has elapsed yet */
    public double processingRate() {
        final double elapsed = elapsedTimeInSecs();
        return elapsed > 0.0 ? nRecordsProcessed() / elapsed : 0.0;
    }

    /** @return estimated number of records in the file, or 0 while no size estimate exists */
    public long estRecordsInFile() {
        final int recordSize = averageRecordSize();
        if ( recordSize <= 0 ) {
            return 0;
        }
        return getFileSize() / recordSize;
    }

    /** @return estimated fraction of the file processed, in {@code [0, 1]}; 0 while unknown */
    public double estFractionProgressThroughFile() {
        final long estimatedRecords = estRecordsInFile();
        if ( estimatedRecords <= 0 ) {
            return 0.0;
        }
        // The record estimate can undershoot, so cap at 100%.
        return Math.min(1.0, (double) nRecordsProcessed() / estimatedRecords);
    }

    /** @return estimated seconds needed to process the whole file, or 0 while unknown */
    public double estTimeTotal() {
        final double rate = processingRate();
        if ( rate <= 0.0 ) {
            return 0.0;
        }
        return estRecordsInFile() / rate;
    }

    /** @return estimated seconds still needed to finish the file */
    public double estTimeRemaining() {
        return estTimeTotal() * ( 1 - estFractionProgressThroughFile() );
    }

    /** Prints every tracked quantity and estimate to standard output. */
    public void printStatus() {
        final double total = estTimeTotal();
        final double remaining = estTimeRemaining();
        System.out.printf("FileProgressTracker:%n");
        System.out.printf(" -> File size is: %d%n", getFileSize());
        System.out.printf(" -> File position: %d%n", getPosition());
        System.out.printf(" -> Number of records processed: %d%n", nRecordsProcessed());
        System.out.printf(" -> Average record size is %d%n", averageRecordSize());
        System.out.printf(" -> Elapsed time in secs is %.2f%n", elapsedTimeInSecs());
        System.out.printf(" -> Processing rate (records per second) %.2f%n", processingRate());
        System.out.printf(" -> Estimated number of records in file %d%n", estRecordsInFile());
        System.out.printf(" -> Estimated percent progress through file %.2f%n", estFractionProgressThroughFile() * 100.0);
        System.out.printf(" -> Estimated time for entire processing %.2f hrs / %.2f min / %.2f sec%n", total / (60*60), total / (60), total);
        System.out.printf(" -> Estimated time remaining %.2f hrs / %.2f min / %.2f sec%n", remaining / (60*60), remaining / 60, remaining);
    }

    /** @return a one-line progress summary: percent complete, time remaining and total time */
    public String progressMeter() {
        final double total = estTimeTotal();
        final double remaining = estTimeRemaining();
        // Argument order follows the text: remaining time first, then total time.
        return String.format("Est. %.2f%% completed, time remaining (%.2f hrs / %.2f min) of (%.2f hrs / %.2f min) total",
                estFractionProgressThroughFile() * 100.0,
                remaining / (60*60), remaining / 60,
                total / (60*60), total / (60));
    }

    /**
     * Byte distances between consecutive retained samples, oldest first. Each entry covers
     * {@code samplingFrequency} records. The list has one entry fewer than the number of
     * retained samples and is empty until two samples exist.
     */
    public ArrayList<Long> recordSizes() {
        final int nSamples = samplesRetained();
        final ArrayList<Long> sizes = new ArrayList<Long>(Math.max(0, nSamples - 1));
        for ( long k = historyI - nSamples + 1; k < historyI; k++ ) {
            sizes.add(history.get(historyIndex(k)) - history.get(historyIndex(k - 1)));
        }
        return sizes;
    }

    /** Number of valid samples currently held in the ring buffer. */
    private int samplesRetained() {
        return (int) Math.min(historyI, (long) historySize);
    }

    /** Narrows a non-negative long to int, saturating instead of overflowing. */
    private static int clampToInt(long value) {
        return (int) Math.min(value, (long) Integer.MAX_VALUE);
    }

    /**
     * Current byte position of the channel. If the channel cannot be queried (for example
     * because it has been closed), the last successfully read position is returned so that
     * one failed query does not corrupt the sample history.
     */
    private long getPosition() {
        try {
            lastKnownPosition = channel.position();
        } catch ( IOException ignored ) {
            // Progress tracking is best effort; keep the previous position.
        }
        return lastKnownPosition;
    }

    private long getFileSize() {
        return file.length();
    }
}
|
||||||
|
|
@ -72,6 +72,26 @@ public class Utils {
|
||||||
return join( separator, strings.toArray(new String[0]) );
|
return join( separator, strings.toArray(new String[0]) );
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static double average(List<Long> vals, int maxI) {
|
||||||
|
long sum = 0L;
|
||||||
|
|
||||||
|
int i = 0;
|
||||||
|
for ( long x : vals ) {
|
||||||
|
if ( i > maxI )
|
||||||
|
break;
|
||||||
|
sum += x;
|
||||||
|
i++;
|
||||||
|
}
|
||||||
|
|
||||||
|
//System.out.printf("Sum = %d, n = %d, avg = %f%n", sum, i, (1.0 * sum) / i);
|
||||||
|
|
||||||
|
return (1.0 * sum) / i;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static double average(List<Long> vals) {
|
||||||
|
return average(vals, vals.size());
|
||||||
|
}
|
||||||
|
|
||||||
public static void setupRefContigOrdering(final ReferenceSequenceFile refFile) {
|
public static void setupRefContigOrdering(final ReferenceSequenceFile refFile) {
|
||||||
List<SAMSequenceRecord> refContigs = refFile.getSequenceDictionary();
|
List<SAMSequenceRecord> refContigs = refFile.getSequenceDictionary();
|
||||||
HashMap<String, Integer> refContigOrdering = new HashMap<String, Integer>();
|
HashMap<String, Integer> refContigOrdering = new HashMap<String, Integer>();
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue