Enabled ReduceByInterval

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@533 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2009-04-24 21:39:44 +00:00
parent ee2f022c71
commit 05c6679321
9 changed files with 158 additions and 10 deletions

View File

@ -351,6 +351,9 @@ public class GenomeAnalysisTK extends CommandLineProgram {
* Initialize the output streams as specified by the user.
*/
private void initializeOutputStreams() {
out = System.out;
err = System.err;
if( outErrFileName != null && (outFileName != null || errFileName != null) )
throw new IllegalArgumentException("Can't set output/error output file with either out file name or err file name");

View File

@ -418,7 +418,7 @@ public abstract class TraversalEngine {
samReader.setValidationStringency(strictness);
final SAMFileHeader header = samReader.getFileHeader();
logger.info(String.format("Sort order is: " + header.getSortOrder()));
logger.debug(String.format("Sort order is: " + header.getSortOrder()));
return samReader;
}
@ -506,11 +506,29 @@ public abstract class TraversalEngine {
// --------------------------------------------------------------------------------------------------------------
/**
 * Top-level traversal entry point: runs the walker over the configured
 * locations, either as one continuous traversal or — when the walker opts in
 * via ReduceByInterval() — as an independent map/reduce per interval.
 *
 * Note: the original span merged pre- and post-diff lines, leaving duplicate
 * declarations of `l`, an unreachable trailing `return traverse(walker, l)`,
 * and a scareUser() check that could never fire (the enclosing branch already
 * requires hasLocations()); this version keeps only the coherent control flow.
 *
 * @param walker the walker to drive over the data
 * @return the final reduce result (for interval mode, the last interval's result)
 */
public <M, T> T traverse(Walker<M, T> walker) {
    T sum = null;

    if ( hasLocations() && walker.ReduceByInterval() ) {
        logger.info("Doing reduce by interval processing");
        if ( ! hasLocations() )
            Utils.scareUser("ReduceByInterval requested but no interval was provided");

        // One fresh traversal (and therefore one fresh reduce) per interval,
        // collecting (interval, result) pairs for the overloaded onTraversalDone.
        List<Pair<GenomeLoc, T>> map = new ArrayList<Pair<GenomeLoc, T>>(locs.size());
        for ( GenomeLoc loc : locs ) {
            ArrayList<GenomeLoc> single = new ArrayList<GenomeLoc>();
            single.add(loc);
            T intervalSum = traverse(walker, single);
            sum = intervalSum;  // keep the most recent interval's result as the overall return
            map.add(new Pair<GenomeLoc, T>(loc, intervalSum));
        }
        walker.onTraversalDone(map);
    } else {
        // Standard traversal: all configured locations, or everything if none were given.
        ArrayList<GenomeLoc> l = new ArrayList<GenomeLoc>();
        if ( hasLocations() )
            l = locs;
        sum = traverse(walker, l);
    }

    printOnTraversalDone("elements", sum);
    return sum;
}
public <M, T> T traverse(Walker<M, T> walker, ArrayList<GenomeLoc> locations) {

View File

@ -85,7 +85,7 @@ public class TraverseByIntervals extends TraversalEngine {
}
}
printOnTraversalDone("intervals", sum);
//printOnTraversalDone("intervals", sum);
walker.onTraversalDone(sum);
return sum;
}

View File

@ -93,7 +93,7 @@ public class TraverseByLoci extends TraversalEngine {
sum = carryWalkerOverInterval(walker, samReadIter, sum, null);
}
printOnTraversalDone("loci", sum);
//printOnTraversalDone("loci", sum);
walker.onTraversalDone(sum);
return sum;
}

View File

@ -115,7 +115,7 @@ public class TraverseByReads extends TraversalEngine {
//System.out.printf("Done? %b%n", done);
}
printOnTraversalDone("reads", sum);
//printOnTraversalDone("reads", sum);
walker.onTraversalDone(sum);
return sum;
}

View File

@ -64,7 +64,7 @@ public class TraverseByReference extends TraverseByLoci {
sum = carryWalkerOverReference(walker, sum, null);
}
printOnTraversalDone("reference", sum);
//printOnTraversalDone("reference", sum);
walker.onTraversalDone(sum);
return sum;
}

View File

@ -24,6 +24,10 @@ public class DepthOfCoverageWalker extends LocusWalker<Integer, Pair<Long, Long>
return context.getReads().size();
}
/**
 * Opt in to per-interval reduction: depth-of-coverage is accumulated and
 * reported separately for each input interval rather than genome-wide.
 */
public boolean ReduceByInterval() {
    return true;
}
/**
 * Fresh accumulator for a traversal (or, with ReduceByInterval, for one
 * interval): (total depth, number of sites), both starting at zero.
 * Fixes the raw-type {@code new Pair(...)} (unchecked warning) and the
 * lowercase {@code 0l} literals, which are easily misread as {@code 01}.
 */
public Pair<Long, Long> reduceInit() { return new Pair<Long, Long>(0L, 0L); }
public Pair<Long, Long> reduce(Integer value, Pair<Long, Long> sum) {
@ -33,6 +37,7 @@ public class DepthOfCoverageWalker extends LocusWalker<Integer, Pair<Long, Long>
}
public void onTraversalDone(Pair<Long, Long> result) {
out.printf("Average depth of coverage is: %.2f\n", ((double)result.getFirst() / (double)result.getSecond()));
out.printf("Average depth of coverage is: %.2f in %d total coverage over %d sites\n",
((double)result.getFirst() / (double)result.getSecond()), result.getFirst(), result.getSecond());
}
}

View File

@ -1,8 +1,13 @@
package org.broadinstitute.sting.gatk.walkers;
import java.io.PrintStream;
import java.util.HashMap;
import java.util.Map;
import java.util.List;
import org.broadinstitute.sting.gatk.GenomeAnalysisTK;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.Pair;
/**
* Created by IntelliJ IDEA.
@ -47,7 +52,34 @@ public abstract class Walker<MapType, ReduceType> {
}
public void initialize() { }
/**
 * Completion hook for a standard (non-interval) traversal; the default
 * implementation reports the final reduce result on the walker's output stream.
 *
 * @param result the final accumulated reduce value
 */
public void onTraversalDone(ReduceType result) {
    final String message = "[REDUCE RESULT] Traversal result is: " + result;
    out.println(message);
}
/**
 * Completion hook for interval-by-interval processing, invoked once after
 * every interval has been traversed when ReduceByInterval() is enabled.
 * The default behavior labels each interval's location and then delegates
 * to {@link #onTraversalDone(Object)} for its per-interval reduce result.
 *
 * @param results one (location, reduce result) pair per traversed interval, in traversal order
 */
public void onTraversalDone(List<Pair<GenomeLoc, ReduceType>> results) {
    for ( Pair<GenomeLoc, ReduceType> intervalResult : results ) {
        out.printf("[INTERVAL REDUCE RESULT] at %s ", intervalResult.getFirst());
        this.onTraversalDone(intervalResult.getSecond());
    }
}
/**
 * Should the engine reduce each interval independently? Defaults to false.
 *
 * When a walker overrides this to return true:
 * <ul>
 * <li>reduceInit() is invoked once per interval, starting a fresh reduce;</li>
 * <li>reduce accumulates normally within each interval;</li>
 * <li>after all intervals finish, onTraversalDone(List&lt;Pair&lt;GenomeLoc, ReduceType&gt;&gt;)
 *     is called with the per-interval results — its default implementation
 *     forwards each result to onTraversalDone(ReduceType).</li>
 * </ul>
 *
 * NOTE(review): the capitalized name breaks Java's lowerCamelCase method
 * convention, but it is kept as-is for compatibility with existing callers.
 */
public boolean ReduceByInterval() {
    return false;
}
}

View File

@ -0,0 +1,90 @@
package org.broadinstitute.sting.playground.gatk.walkers;
import org.broadinstitute.sting.gatk.LocusContext;
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.QualityUtils;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMFileWriter;
import net.sf.samtools.SAMFileWriterFactory;
import net.sf.samtools.SAMFileHeader;
import edu.mit.broad.picard.reference.ReferenceSequence;
import java.util.ArrayList;
import java.util.Random;
import java.io.File;
/**
 * IOCrusherWalker stress-tests I/O throughput by copying every input read to
 * nWaysOut BAM outputs, scaled by readScaling: values &gt;= 1 duplicate each
 * read ceil(readScaling) times per output, while values &lt; 1 emit each read
 * with probability readScaling. (The previous class comment was copy-pasted
 * from ReadErrorRateWalker and described the wrong tool.)
 *
 * @author Kiran Garimella
 */
public class IOCrusherWalker extends ReadWalker<SAMRecord, ArrayList<SAMFileWriter>> {
    /** Number of parallel BAM outputs to write. */
    @Argument(shortName="nWaysOut",required=false,defaultValue="1")
    public int nWaysOut;

    /** Read multiplication factor (>= 1) or sampling fraction (< 1). */
    @Argument(shortName="readScaling",required=false,defaultValue="1")
    public float readScaling;

    /** Base name for the outputs; files are named &lt;outputBase&gt;.&lt;i&gt;.bam. */
    @Argument(shortName="outputBase", required=true)
    public String outputBase;

    public long nReadsRead = 0;
    public long nReadsWritten = 0;

    /**
     * Pass each read through unchanged, counting it.
     */
    public SAMRecord map(LocusContext context, SAMRecord read) {
        nReadsRead++;
        return read;
    }

    /**
     * Open the nWaysOut BAM writers, each sharing the input file's header.
     */
    public ArrayList<SAMFileWriter> reduceInit() {
        SAMFileWriterFactory fact = new SAMFileWriterFactory();
        ArrayList<SAMFileWriter> outputs = new ArrayList<SAMFileWriter>(nWaysOut);
        for ( int i = 0; i < nWaysOut; i++ ) {
            SAMFileHeader header = this.getToolkit().getSamReader().getFileHeader();
            outputs.add(fact.makeBAMWriter(header, true, new File(outputBase + "." + i + ".bam")));
        }
        return outputs;
    }

    /**
     * Write the read to every output, up- or down-sampled per readScaling.
     */
    public ArrayList<SAMFileWriter> reduce(SAMRecord read, ArrayList<SAMFileWriter> outputs) {
        // Hoisted out of the loop: the copy count is identical for every writer.
        final int nCopies = (int)Math.ceil(readScaling);
        for ( SAMFileWriter out : outputs ) {
            if ( readScaling >= 1.0 ) {
                for ( int i = 0; i < nCopies; i++ ) {
                    out.addAlignment(read);
                    nReadsWritten++;
                }
            } else if ( Math.random() < readScaling ) {
                // NOTE(review): the sampling draw is independent per output writer,
                // so the outputs can diverge — presumably acceptable for a stress
                // tool, but confirm if identical outputs are expected.
                out.addAlignment(read);
                nReadsWritten++;
            }
        }
        return outputs;
    }

    /**
     * Close all writers and report how many reads were read and written.
     */
    public void onTraversalDone(ArrayList<SAMFileWriter> outputs) {
        for ( SAMFileWriter out : outputs ) {
            out.close();
        }
        System.out.printf("Reads: read %d written %d%n", nReadsRead, nReadsWritten);
    }
}