diff --git a/ivy.xml b/ivy.xml
index f5ff15c30..06296c6b4 100644
--- a/ivy.xml
+++ b/ivy.xml
@@ -41,7 +41,7 @@
-
+
@@ -66,13 +66,13 @@
-
+
-
+
diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java
index f1ffa121b..34ac17f49 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java
@@ -74,16 +74,16 @@ public class LocusIteratorByState extends LocusIterator {
static private class SAMRecordState {
SAMRecord read;
- int readOffset = -1; // how far are we offset from the start of the read bases?
- int genomeOffset = -1; // how far are we offset from the alignment start on the genome?
+ int readOffset = -1; // how far are we offset from the start of the read bases?
+ int genomeOffset = -1; // how far are we offset from the alignment start on the genome?
Cigar cigar = null;
int cigarOffset = -1;
CigarElement curElement = null;
int nCigarElements = 0;
- // how far are we into a single cigarElement
- int cigarElementCounter = -1;
+
+ int cigarElementCounter = -1; // how far are we into a single cigarElement
// The logical model for generating extended events is as follows: the "record state" implements the traversal
// along the reference; thus stepForwardOnGenome() returns on every and only on actual reference bases. This
@@ -93,19 +93,19 @@ public class LocusIteratorByState extends LocusIterator {
// stepForwardOnGenome(). The next call to stepForwardOnGenome() will clear that memory (as we remember only extended
// events immediately preceding the current reference base).
- boolean generateExtendedEvents = true; // should we generate an additional, special pile for indels between the ref bases?
- // the only purpose of this flag is to shield away a few additional lines of code
- // when extended piles are not needed, it may not be even worth it...
+ boolean generateExtendedEvents = true; // should we generate an additional, special pile for indels between the ref bases?
+ // the only purpose of this flag is to shield away a few additional lines of code
+ // when extended piles are not needed, it may not be even worth it...
- byte[] insertedBases = null; // remember full inserted sequence if we are generating piles of extended events (indels)
- int eventLength = -1; // will be set to the length of insertion/deletion if we are generating piles of extended events
- byte eventDelayedFlag = 0; // will be set to non-0 if there was an event (indel) right before the
- // current base on the ref. We use a counter-like variable here since clearing the indel event is
- // delayed by one base, so we need to remember how long ago we have seen the actual event
+ byte[] insertedBases = null; // remember full inserted sequence if we are generating piles of extended events (indels)
+ int eventLength = -1; // will be set to the length of insertion/deletion if we are generating piles of extended events
+ byte eventDelayedFlag = 0; // will be set to non-0 if there was an event (indel) right before the
+ // current base on the ref. We use a counter-like variable here since clearing the indel event is
+ // delayed by one base, so we need to remember how long ago we have seen the actual event
- int eventStart = -1; // where on the read the extended event starts (i.e. the last position on the read prior to the
- // event, or -1 if alignment starts with an insertion); this one is easy to recompute on the fly,
- // we cache it here mainly for convenience
+ int eventStart = -1; // where on the read the extended event starts (i.e. the last position on the read prior to the
+ // event, or -1 if alignment starts with an insertion); this one is easy to recompute on the fly,
+ // we cache it here mainly for convenience
public SAMRecordState(SAMRecord read, boolean extended) {
@@ -176,6 +176,10 @@ public class LocusIteratorByState extends LocusIterator {
return String.format("%s ro=%d go=%d co=%d cec=%d %s", read.getReadName(), readOffset, genomeOffset, cigarOffset, cigarElementCounter, curElement);
}
+ public CigarOperator peekForwardOnGenome() {
+ return ( cigarElementCounter + 1 > curElement.getLength() && cigarOffset + 1 < nCigarElements ? cigar.getCigarElement(cigarOffset + 1) : curElement ).getOperator();
+ }
+
public CigarOperator stepForwardOnGenome() {
// we enter this method with readOffset = index of the last processed base on the read
// (-1 if we did not process a single base yet); this can be last matching base, or last base of an insertion
@@ -237,6 +241,8 @@ public class LocusIteratorByState extends LocusIterator {
readOffset += curElement.getLength();
break;
case D: // deletion w.r.t. the reference
+ if (readOffset < 0) // we don't want reads starting with deletion, this is a malformed cigar string
+ throw new UserException.MalformedBAM(read, "Read starting with deletion. Cigar: " + read.getCigarString());
if (generateExtendedEvents) {
if (cigarElementCounter == 1) {
// generate an extended event only if we just stepped into the deletion (i.e. don't
@@ -399,9 +405,9 @@ public class LocusIteratorByState extends LocusIterator {
final int eventStartOffset = state.getReadEventStartOffset(); // this will be -1 if base is not a deletion, or if base is the first deletion in the event. Otherwise, it will give the last base before the deletion began.
final int eventLength = state.getEventLength();
-// if (op != CigarOperator.N) // N's are never added to any pileup
-// continue;
-//
+ if (op == CigarOperator.N) // N's are never added to any pileup
+ continue;
+
if (state.hadIndel()) { // this read has an indel associated with the previous position on the ref
size++;
ExtendedEventPileupElement pileupElement;
@@ -409,27 +415,26 @@ public class LocusIteratorByState extends LocusIterator {
nDeletions++;
maxDeletionLength = Math.max(maxDeletionLength, state.getEventLength());
pileupElement = new ExtendedEventPileupElement(read, eventStartOffset, eventLength);
- }
+ }
else { // Insertion event
nInsertions++;
pileupElement = new ExtendedEventPileupElement(read, eventStartOffset, eventLength, state.getEventBases());
}
+ if (read.getMappingQuality() == 0)
+ nMQ0Reads++;
indelPile.add(pileupElement);
}
- // this read has no indel associated with the previous position on the ref. Criteria to include in the pileup are:
- // we only add reads that are not N's
- // we only include deletions to the pileup if the walker requests it
- else if ( (op != CigarOperator.N) && (op != CigarOperator.D || readInfo.includeReadsWithDeletionAtLoci())) {
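+ // this read has no indel associated with the previous position, so add it to the pileup as a NOEVENT.
+ // This covers a deletion that didn't start here (therefore, not an extended event) as well as
+ // (mis)matches; deletions are only added when the walker asks for reads with deletions at loci.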
+ else if (op != CigarOperator.D || readInfo.includeReadsWithDeletionAtLoci()) {
size++;
indelPile.add(new ExtendedEventPileupElement((GATKSAMRecord) state.getRead(), readOffset));
+ if (read.getMappingQuality() == 0)
+ nMQ0Reads++;
}
-
-
- if (state.getRead().getMappingQuality() == 0)
- nMQ0Reads++;
-
}
if (indelPile.size() != 0)
@@ -455,26 +460,27 @@ public class LocusIteratorByState extends LocusIterator {
final SAMRecordState state = iterator.next(); // state object with the read/offset information
final GATKSAMRecord read = (GATKSAMRecord) state.getRead(); // the actual read
final CigarOperator op = state.getCurrentCigarOperator(); // current cigar operator
+ final CigarOperator nextOp = state.peekForwardOnGenome(); // next cigar operator
final int readOffset = state.getReadOffset(); // the base offset on this read
- final int eventStartOffset = state.getReadEventStartOffset(); // this will be -1 if base is not a deletion, or if base is the first deletion in the event. Otherwise, it will give the last base before the deletion began.
if (op == CigarOperator.N) // N's are never added to any pileup
continue;
- if (read.getMappingQuality() == 0)
- nMQ0Reads++;
-
if (op == CigarOperator.D) {
if (readInfo.includeReadsWithDeletionAtLoci()) { // only add deletions to the pileup if we are authorized to do so
- int leftAlignedStart = (eventStartOffset < 0) ? readOffset : eventStartOffset;
- pile.add(new PileupElement(read, leftAlignedStart, true));
+ pile.add(new PileupElement(read, readOffset, true, nextOp == CigarOperator.I, false));
size++;
nDeletions++;
+ if (read.getMappingQuality() == 0)
+ nMQ0Reads++;
}
- } else {
+ }
+ else {
if (!filterBaseInRead(read, location.getStart())) {
- pile.add(new PileupElement(read, readOffset, false));
+ pile.add(new PileupElement(read, readOffset, false, nextOp == CigarOperator.I, op == CigarOperator.S));
size++;
+ if (read.getMappingQuality() == 0)
+ nMQ0Reads++;
}
}
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java b/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java
index f09865537..e8627ef4c 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java
@@ -264,22 +264,8 @@ public class GATKRunReport {
}
}
- /**
- * Opens the destination file and writes a gzipped version of the XML report there.
- *
- * @param destination
- * @throws IOException
- */
- private void postReportToFile(File destination) throws IOException {
- BufferedOutputStream out =
- new BufferedOutputStream(
- new GZIPOutputStream(
- new FileOutputStream(destination)));
- try {
- postReportToStream(out);
- } finally {
- out.close();
- }
+ private final String getKey() {
+ return getID() + ".report.xml.gz";
}
/**
@@ -288,16 +274,21 @@ public class GATKRunReport {
* That is, postReport() is guarenteed not to fail for any reason.
*/
private File postReportToLocalDisk(File rootDir) {
- String filename = getID() + ".report.xml.gz";
- File file = new File(rootDir, filename);
+ final String filename = getKey();
+ final File destination = new File(rootDir, filename);
+
try {
- postReportToFile(file);
- logger.debug("Wrote report to " + file);
- return file;
+ final BufferedOutputStream out = new BufferedOutputStream(
+ new GZIPOutputStream(
+ new FileOutputStream(destination)));
+ postReportToStream(out);
+ out.close();
+ logger.debug("Wrote report to " + destination);
+ return destination;
} catch ( Exception e ) {
// we catch everything, and no matter what eat the error
exceptDuringRunReport("Couldn't read report file", e);
- file.delete();
+ destination.delete();
return null;
}
}
@@ -305,42 +296,46 @@ public class GATKRunReport {
private void postReportToAWSS3() {
// modifying example code from http://jets3t.s3.amazonaws.com/toolkit/code-samples.html
this.hostName = Utils.resolveHostname(); // we want to fill in the host name
- File localFile = postReportToLocalDisk(new File("./"));
- logger.debug("Generating GATK report to AWS S3 based on local file " + localFile);
- if ( localFile != null ) { // we succeeded in creating the local file
- localFile.deleteOnExit();
- try {
- // stop us from printing the annoying, and meaningless, mime types warning
- Logger mimeTypeLogger = Logger.getLogger(org.jets3t.service.utils.Mimetypes.class);
- mimeTypeLogger.setLevel(Level.FATAL);
+ final String key = getKey();
+ logger.debug("Generating GATK report to AWS S3 with key " + key);
+ try {
+ // create an in-memory byte stream so we can capture the gzipped report as a byte[]
+ final ByteArrayOutputStream byteStream = new ByteArrayOutputStream(8096);
+ final OutputStream outputStream = new GZIPOutputStream(byteStream);
+ postReportToStream(outputStream);
+ outputStream.close();
+ final byte[] report = byteStream.toByteArray();
- // Your Amazon Web Services (AWS) login credentials are required to manage S3 accounts. These credentials
- // are stored in an AWSCredentials object:
+ // stop us from printing the annoying, and meaningless, mime types warning
+ Logger mimeTypeLogger = Logger.getLogger(org.jets3t.service.utils.Mimetypes.class);
+ mimeTypeLogger.setLevel(Level.FATAL);
- // IAM GATK user credentials -- only right is to PutObject into GATK_Run_Report bucket
- String awsAccessKey = "AKIAJXU7VIHBPDW4TDSQ"; // GATK AWS user
- String awsSecretKey = "uQLTduhK6Gy8mbOycpoZIxr8ZoVj1SQaglTWjpYA"; // GATK AWS user
- AWSCredentials awsCredentials = new AWSCredentials(awsAccessKey, awsSecretKey);
+ // Your Amazon Web Services (AWS) login credentials are required to manage S3 accounts. These credentials
+ // are stored in an AWSCredentials object:
- // To communicate with S3, create a class that implements an S3Service. We will use the REST/HTTP
- // implementation based on HttpClient, as this is the most robust implementation provided with JetS3t.
- S3Service s3Service = new RestS3Service(awsCredentials);
+ // IAM GATK user credentials -- only right is to PutObject into GATK_Run_Report bucket
+ String awsAccessKey = "AKIAJXU7VIHBPDW4TDSQ"; // GATK AWS user
+ String awsSecretKey = "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"; // GATK AWS user
+ AWSCredentials awsCredentials = new AWSCredentials(awsAccessKey, awsSecretKey);
- // Create an S3Object based on a file, with Content-Length set automatically and
- // Content-Type set based on the file's extension (using the Mimetypes utility class)
- S3Object fileObject = new S3Object(localFile);
- //logger.info("Created S3Object" + fileObject);
- //logger.info("Uploading " + localFile + " to AWS bucket");
- S3Object s3Object = s3Service.putObject(REPORT_BUCKET_NAME, fileObject);
- logger.debug("Uploaded to AWS: " + s3Object);
- logger.info("Uploaded run statistics report to AWS S3");
- } catch ( S3ServiceException e ) {
- exceptDuringRunReport("S3 exception occurred", e);
- } catch ( NoSuchAlgorithmException e ) {
- exceptDuringRunReport("Couldn't calculate MD5", e);
- } catch ( IOException e ) {
- exceptDuringRunReport("Couldn't read report file", e);
- }
+ // To communicate with S3, create a class that implements an S3Service. We will use the REST/HTTP
+ // implementation based on HttpClient, as this is the most robust implementation provided with JetS3t.
+ S3Service s3Service = new RestS3Service(awsCredentials);
+
+ // Create an S3Object named by the report key whose content is the gzipped report
+ // bytes captured above, so nothing has to be written to local disk first
+ S3Object fileObject = new S3Object(key, report);
+ //logger.info("Created S3Object" + fileObject);
+ //logger.info("Uploading " + localFile + " to AWS bucket");
+ S3Object s3Object = s3Service.putObject(REPORT_BUCKET_NAME, fileObject);
+ logger.debug("Uploaded to AWS: " + s3Object);
+ logger.info("Uploaded run statistics report to AWS S3");
+ } catch ( S3ServiceException e ) {
+ exceptDuringRunReport("S3 exception occurred", e);
+ } catch ( NoSuchAlgorithmException e ) {
+ exceptDuringRunReport("Couldn't calculate MD5", e);
+ } catch ( IOException e ) {
+ exceptDuringRunReport("Couldn't read report file", e);
}
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java
index f5e936a09..562a6d1d0 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java
@@ -14,10 +14,7 @@ import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
-import java.util.ArrayList;
-import java.util.LinkedHashSet;
-import java.util.LinkedList;
-import java.util.Queue;
+import java.util.*;
/**
* Created by IntelliJ IDEA.
@@ -54,7 +51,8 @@ public class TraverseActiveRegions extends TraversalEngine isActiveList = new ArrayList();
+ final ArrayList isActiveList = new ArrayList();
+ GenomeLoc firstIsActiveStart = null;
//ReferenceOrderedView referenceOrderedDataView = new ReferenceOrderedView( dataProvider );
ReferenceOrderedView referenceOrderedDataView = null;
@@ -64,25 +62,26 @@ public class TraverseActiveRegions extends TraversalEngine extends TraversalEngine extends TraversalEngine extends TraversalEngine activeRegions = integrateActiveList( isActiveList );
+ final ArrayList activeRegions = integrateActiveList( isActiveList, firstIsActiveStart, activeRegionExtension );
logger.debug("Integrated " + isActiveList.size() + " isActive calls into " + activeRegions.size() + " regions." );
if( walker.activeRegionOutStream == null ) {
workQueue.addAll( activeRegions );
@@ -137,14 +135,11 @@ public class TraverseActiveRegions extends TraversalEngine extends TraversalEngine extends TraversalEngine integrateActiveList( final ArrayList activeList ) {
+ // apply a sliding-window max filter to the list of isActive probabilities and turn the result into active regions
+ private ArrayList integrateActiveList( final ArrayList activeList, final GenomeLoc firstIsActiveStart, final int activeRegionExtension ) {
+
+ final double ACTIVE_PROB_THRESHOLD = 0.2;
final ArrayList returnList = new ArrayList();
if( activeList.size() == 0 ) {
return returnList;
} else if( activeList.size() == 1 ) {
- returnList.add( new ActiveRegion( engine.getGenomeLocParser().createGenomeLoc(activeList.get(0).getLocation().getContig(), activeList.get(0).getLocation().getStart(), activeList.get(0).getLocation().getStart()),
- activeList.get(0).isActive, engine.getGenomeLocParser(), activeList.get(0).getExtension() ) );
+ returnList.add( new ActiveRegion( engine.getGenomeLocParser().createGenomeLoc(firstIsActiveStart.getContig(), firstIsActiveStart.getStart(), firstIsActiveStart.getStart()),
+ activeList.get(0) > ACTIVE_PROB_THRESHOLD, engine.getGenomeLocParser(), activeRegionExtension ) );
return returnList;
} else {
- ActiveRegion prevLocus = activeList.get(0);
- ActiveRegion startLocus = prevLocus;
- for( final ActiveRegion thisLocus : activeList ) {
- if( prevLocus.isActive != thisLocus.isActive || !prevLocus.getLocation().contiguousP( thisLocus.getLocation() ) ) {
- returnList.add( new ActiveRegion( engine.getGenomeLocParser().createGenomeLoc(startLocus.getLocation().getContig(), startLocus.getLocation().getStart(), prevLocus.getLocation().getStart()),
- prevLocus.isActive, engine.getGenomeLocParser(), startLocus.getExtension() ) );
- startLocus = thisLocus;
+ final Double[] activeProbArray = activeList.toArray(new Double[activeList.size()]);
+ final double[] filteredProbArray = new double[activeProbArray.length];
+ final int FILTER_SIZE = 10;
+ final int MAX_ACTIVE_REGION = 200;
+ for( int iii = 0; iii < activeProbArray.length; iii++ ) {
+ double maxVal = 0;
+ for( int jjj = Math.max(0, iii-FILTER_SIZE); jjj < Math.min(activeList.size(), iii+FILTER_SIZE); jjj++ ) {
+ if( activeProbArray[jjj] > maxVal ) { maxVal = activeProbArray[jjj]; }
}
- prevLocus = thisLocus;
+ filteredProbArray[iii] = maxVal;
}
- // output the last region if necessary
- if( startLocus != prevLocus ) {
- returnList.add( new ActiveRegion( engine.getGenomeLocParser().createGenomeLoc(startLocus.getLocation().getContig(), startLocus.getLocation().getStart(), prevLocus.getLocation().getStart()),
- prevLocus.isActive, engine.getGenomeLocParser(), startLocus.getExtension() ) );
+
+ boolean curStatus = filteredProbArray[0] > ACTIVE_PROB_THRESHOLD;
+ int curStart = 0;
+ for(int iii = 1; iii < filteredProbArray.length; iii++ ) {
+ final boolean thisStatus = filteredProbArray[iii] > ACTIVE_PROB_THRESHOLD;
+ if( curStatus != thisStatus || (iii-curStart) > MAX_ACTIVE_REGION ) {
+ returnList.add( new ActiveRegion(
+ engine.getGenomeLocParser().createGenomeLoc(firstIsActiveStart.getContig(), firstIsActiveStart.getStart() + curStart, firstIsActiveStart.getStart() + (iii-1)),
+ curStatus, engine.getGenomeLocParser(), activeRegionExtension ) );
+ curStatus = thisStatus;
+ curStart = iii;
+ }
+ }
+ if( curStart != filteredProbArray.length-1 ) {
+ returnList.add( new ActiveRegion(
+ engine.getGenomeLocParser().createGenomeLoc(firstIsActiveStart.getContig(), firstIsActiveStart.getStart() + curStart, firstIsActiveStart.getStart() + (filteredProbArray.length-1)),
+ curStatus, engine.getGenomeLocParser(), activeRegionExtension ) );
}
return returnList;
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java
index 98308ee11..244870c78 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java
@@ -73,8 +73,8 @@ public abstract class ActiveRegionWalker extends Walker