Date: Wed, 7 Mar 2012 18:47:15 -0500
Subject: [PATCH 15/26] docs updated
---
.../walkers/indels/SomaticIndelDetectorWalker.java | 12 +++++++++++-
1 file changed, 11 insertions(+), 1 deletion(-)
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetectorWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetectorWalker.java
index 733d32e3d..59a7bd01a 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetectorWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetectorWalker.java
@@ -75,7 +75,7 @@ import java.util.*;
*
* This is a simple, counts-and-cutoffs based tool for calling indels from aligned (preferrably MSA cleaned) sequencing
* data. Supported output formats are: BED format, extended verbose output (tab separated), and VCF. The latter two outputs
- * include additional statistics such as mismtaches and base qualitites around the calls, read strandness (how many
+ * include additional statistics such as mismatches and base qualities around the calls, read strandness (how many
* forward/reverse reads support ref and indel alleles) etc. It is highly recommended to use these additional
* statistics to perform post-filtering of the calls as the tool is tuned for sensitivity (in other words it will
* attempt to "call" anything remotely reasonable based only on read counts and will generate all the additional
@@ -92,6 +92,16 @@ import java.util.*;
* bam tagging is not required in this case, and tags are completely ignored if still used: all input bams will be merged
* on the fly and assumed to represent a single sample - this tool does not check for sample id in the read groups).
*
+ * Which (putative) calls will make it into the output file(s) is controlled by an expression/list of expressions passed with the -filter
+ * flag: if any of the expressions evaluates to TRUE, the site will be discarded. Otherwise the putative call and all the
+ * associated statistics will be printed into the output. Expressions recognize the following variables (in paired-sample
+ * somatic mode variables are prefixed with T_ and N_ for Tumor and Normal, e.g. N_COV and T_COV are defined instead of COV):
+ * COV for coverage at the site, INDEL_F for fraction of reads supporting consensus indel at the site (wrt total coverage),
+ * INDEL_CF for fraction of reads with consensus indel wrt all reads with an indel at the site, CONS_CNT for the count of
+ * reads supporting the consensus indel at the site. Conventional arithmetic and logical operations are supported. For instance,
+ * N_COV<4||T_COV<6||T_INDEL_F<0.3||T_INDEL_CF<0.7 instructs the tool to only output indel calls with at least 30% observed
+ * allelic fraction and with consensus indel making at least 70% of all indel observations at the site, and only at the sites
+ * where tumor coverage and normal coverage are at least 6 and 4, respectively.
*
Input
*
* Tumor and normal bam files (or single sample bam file(s) in --unpaired mode).
From 858acf86165fe0466529a16ddb4d0a25647a69e8 Mon Sep 17 00:00:00 2001
From: Guillermo del Angel
Date: Thu, 8 Mar 2012 12:29:44 -0500
Subject: [PATCH 16/26] Hidden mode in ValidationAmplicons to support ILMN
output format (same as Sequenom, with just shuffled columns)
---
.../validation/ValidationAmplicons.java | 22 ++++++++++++++++---
1 file changed, 19 insertions(+), 3 deletions(-)
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java
index e812fb53a..1d7f92242 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java
@@ -134,6 +134,10 @@ public class ValidationAmplicons extends RodWalker {
@Argument(doc="Use Sequenom output format instead of regular FASTA",fullName="sqnm",required=false)
boolean sequenomOutput = false;
+ @Hidden
+ @Argument(doc="Use ILMN output format instead of regular FASTA",fullName="ilmn",required=false)
+ boolean ilmnOutput = false;
+
GenomeLoc prevInterval;
GenomeLoc allelePos;
@@ -141,6 +145,7 @@ public class ValidationAmplicons extends RodWalker {
StringBuilder sequence;
StringBuilder rawSequence;
boolean sequenceInvalid;
+ boolean isSiteSNP;
List invReason;
int indelCounter;
@@ -169,6 +174,9 @@ public class ValidationAmplicons extends RodWalker {
header.setSequenceDictionary(referenceDictionary);
header.setSortOrder(SAMFileHeader.SortOrder.unsorted);
}
+
+ if (ilmnOutput)
+ out.println("Locus_Name,Target_Type,Sequence,Chromosome,Coordinate,Genome_Build_Version,Source,Source_Version,Sequence_Orientation,Plus_Minus,Force_Infinium_I");
}
public Integer reduceInit() {
@@ -234,6 +242,8 @@ public class ValidationAmplicons extends RodWalker {
}
rawSequence.append(Character.toUpperCase((char) ref.getBase()));
} else if ( validate != null ) {
+ // record variant type in case it's needed in output format
+ isSiteSNP = (validate.isSNP());
// doesn't matter if there's a mask here too -- this is what we want to validate
if ( validate.isFiltered() ) {
logger.warn("You are attempting to validate a filtered site. Why are you attempting to validate a filtered site? You should not be attempting to validate a filtered site.");
@@ -496,13 +506,19 @@ public class ValidationAmplicons extends RodWalker {
if (!onlyOutputValidAmplicons || !sequenceInvalid) {
String seqIdentity = sequence.toString().replace('n', 'N').replace('i', 'I').replace('d', 'D');
- if (!sequenomOutput)
- out.printf(">%s %s %s%n%s%n", allelePos != null ? allelePos.toString() : "multiple", valid, probeName, seqIdentity);
- else {
+ if (sequenomOutput) {
seqIdentity = seqIdentity.replace("*",""); // identifier < 20 letters long, no * in ref allele, one line per record
probeName = probeName.replace("amplicon_","a");
out.printf("%s_%s %s%n", allelePos != null ? allelePos.toString() : "multiple", probeName, seqIdentity);
}
+ else if (ilmnOutput) {
+ String type = isSiteSNP?"SNP":"INDEL";
+ seqIdentity = seqIdentity.replace("*",""); // no * in ref allele
+ out.printf("%s,%s,%s,%s,%d,37,1000G,ExomePhase1,Forward,Plus,FALSE%n",probeName,type,seqIdentity,allelePos.getContig(),allelePos.getStart());
+ }
+ else{
+ out.printf(">%s %s %s%n%s%n", allelePos != null ? allelePos.toString() : "multiple", valid, probeName, seqIdentity);
+ }
}
}
}
From 32dee7ed9bd0c64bdd6dae5de26b751dea05750c Mon Sep 17 00:00:00 2001
From: David Roazen
Date: Thu, 8 Mar 2012 12:52:11 -0500
Subject: [PATCH 17/26] Avoid buffer underflow in GATKBAMIndex by detecting
premature EOF in BAM indices
GATKBAMIndex would allow an extremely confusing BufferUnderflowException to be
thrown when a BAM index file was truncated or corrupt. Now, a UserException is
thrown in this situation instructing the user to re-index the BAM.
Added a unit test for this case as well.
---
.../sting/gatk/datasources/reads/GATKBAMIndex.java | 14 +++++++++++++-
.../datasources/reads/GATKBAMIndexUnitTest.java | 13 +++++++++++++
2 files changed, 26 insertions(+), 1 deletion(-)
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndex.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndex.java
index 244438a59..2bf75b035 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndex.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndex.java
@@ -25,6 +25,7 @@ package org.broadinstitute.sting.gatk.datasources.reads;
import net.sf.samtools.*;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
+import org.broadinstitute.sting.utils.exceptions.UserException;
import java.io.File;
import java.io.FileInputStream;
@@ -349,7 +350,18 @@ public class GATKBAMIndex {
private void read(final ByteBuffer buffer) {
try {
- fileChannel.read(buffer);
+ int bytesExpected = buffer.limit();
+ int bytesRead = fileChannel.read(buffer);
+
+ // We have a rigid expectation here to read in exactly the number of bytes we've limited
+ // our buffer to -- if we read in fewer bytes than this, or encounter EOF (-1), the index
+ // must be truncated or otherwise corrupt:
+ if ( bytesRead < bytesExpected ) {
+ throw new UserException.MalformedFile(mFile, String.format("Premature end-of-file while reading BAM index file %s. " +
+ "It's likely that this file is truncated or corrupt -- " +
+ "Please try re-indexing the corresponding BAM file.",
+ mFile));
+ }
}
catch(IOException ex) {
throw new ReviewedStingException("Index: unable to read bytes from index file " + mFile);
diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndexUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndexUnitTest.java
index fde0ce62f..8cf9f7ce0 100644
--- a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndexUnitTest.java
+++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndexUnitTest.java
@@ -27,6 +27,7 @@ package org.broadinstitute.sting.gatk.datasources.reads;
import net.sf.samtools.SAMFileReader;
import net.sf.samtools.SAMSequenceDictionary;
import org.broadinstitute.sting.BaseTest;
+import org.broadinstitute.sting.utils.exceptions.UserException;
import org.testng.Assert;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.Test;
@@ -91,4 +92,16 @@ public class GATKBAMIndexUnitTest extends BaseTest {
Assert.assertEquals(bamIndex.getLevelSize(5),37448-4681+1);
}
+ @Test( expectedExceptions = UserException.MalformedFile.class )
+ public void testDetectTruncatedBamIndexWordBoundary() {
+ GATKBAMIndex index = new GATKBAMIndex(new File(validationDataLocation + "truncated_at_word_boundary.bai"));
+ index.readReferenceSequence(0);
+ }
+
+ @Test( expectedExceptions = UserException.MalformedFile.class )
+ public void testDetectTruncatedBamIndexNonWordBoundary() {
+ GATKBAMIndex index = new GATKBAMIndex(new File(validationDataLocation + "truncated_at_non_word_boundary.bai"));
+ index.readReferenceSequence(0);
+ }
+
}
From bc65f6326f79d09757461291c017378e6eaa6ffd Mon Sep 17 00:00:00 2001
From: David Roazen
Date: Fri, 9 Mar 2012 12:13:53 -0500
Subject: [PATCH 18/26] Detect incomplete reads from BAM schedule file in
BAMSchedule before they become buffer underflows
This fix is similar to, but distinct from, the earlier fix to GATKBAMIndex. If we fail to read in
a complete 3-integer bin header from the BAM schedule file that the engine has written, throw a
ReviewedStingException (since this is our problem, not the user's) rather than allowing a
cryptic buffer underflow error to occur.
Note that this change does not fix the underlying problem in the engine, if there is one
(there may be an as-yet-undetected bug in the code that writes the bam schedule). It will
just make it easier for us to identify what's going wrong in the future.
---
.../sting/gatk/datasources/reads/BAMSchedule.java | 9 ++++++++-
1 file changed, 8 insertions(+), 1 deletion(-)
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMSchedule.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMSchedule.java
index 657c70aaa..1d8879d51 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMSchedule.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMSchedule.java
@@ -407,7 +407,14 @@ public class BAMSchedule implements CloseableIterator {
position(currentPosition);
// Read data.
- read(binHeader);
+ int binHeaderBytesRead = read(binHeader);
+
+ // Make sure we read in a complete bin header:
+ if ( binHeaderBytesRead < INT_SIZE_IN_BYTES * 3 ) {
+ throw new ReviewedStingException(String.format("Unable to read a complete bin header from BAM schedule file %s for BAM file %s. " +
+ "The BAM schedule file is likely incomplete/corrupt.",
+ scheduleFile.getAbsolutePath(), reader.getSamFilePath()));
+ }
// Decode contents.
binHeader.flip();
From 91d10431d395389d137a9571b3fd75d579f241a3 Mon Sep 17 00:00:00 2001
From: David Roazen
Date: Fri, 9 Mar 2012 15:11:59 -0500
Subject: [PATCH 19/26] BAMScheduler: detect contigs from the interval list
that are not in the merged BAM header's sequence dictionary
This is a quick-and-dirty patch for the null pointer error Mauricio reported earlier.
Later on we might want to address in a more general way the fact that we validate user intervals
against the reference but not against the merged BAM header produced by the engine at runtime.
---
.../sting/gatk/datasources/reads/BAMScheduler.java | 11 ++++++++++-
.../sting/utils/exceptions/UserException.java | 13 ++++---------
.../broadinstitute/sting/utils/sam/ReadUtils.java | 8 ++++++++
3 files changed, 22 insertions(+), 10 deletions(-)
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java
index bcb726607..fdc3d2aa7 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java
@@ -34,6 +34,8 @@ import net.sf.samtools.SAMSequenceRecord;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
+import org.broadinstitute.sting.utils.exceptions.UserException;
+import org.broadinstitute.sting.utils.sam.ReadUtils;
import java.util.*;
@@ -245,7 +247,14 @@ public class BAMScheduler implements Iterator {
// This will ensure that if the two sets of contigs don't quite match (b36 male vs female ref, hg19 Epstein-Barr), then
// we'll be using the correct contig index for the BAMs.
// TODO: Warning: assumes all BAMs use the same sequence dictionary! Get around this with contig aliasing.
- final int currentContigIndex = dataSource.getHeader().getSequence(currentLocus.getContig()).getSequenceIndex();
+ SAMSequenceRecord currentContigSequenceRecord = dataSource.getHeader().getSequence(currentLocus.getContig());
+ if ( currentContigSequenceRecord == null ) {
+ throw new UserException(String.format("Contig %s not present in sequence dictionary for merged BAM header: %s",
+ currentLocus.getContig(),
+ ReadUtils.prettyPrintSequenceRecords(dataSource.getHeader().getSequenceDictionary())));
+ }
+
+ final int currentContigIndex = currentContigSequenceRecord.getSequenceIndex();
// Stale reference sequence or first invocation. (Re)create the binTreeIterator.
if(lastReferenceSequenceLoaded == null || lastReferenceSequenceLoaded != currentContigIndex) {
diff --git a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java
index 6cc8008d2..d625cec20 100755
--- a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java
+++ b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java
@@ -30,6 +30,7 @@ import net.sf.samtools.SAMSequenceDictionary;
import net.sf.samtools.SAMSequenceRecord;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
+import org.broadinstitute.sting.utils.sam.ReadUtils;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.io.File;
@@ -273,7 +274,7 @@ public class UserException extends ReviewedStingException {
public static class IncompatibleSequenceDictionaries extends UserException {
public IncompatibleSequenceDictionaries(String message, String name1, SAMSequenceDictionary dict1, String name2, SAMSequenceDictionary dict2) {
super(String.format("Input files %s and %s have incompatible contigs: %s.\n %s contigs = %s\n %s contigs = %s",
- name1, name2, message, name1, prettyPrintSequenceRecords(dict1), name2, prettyPrintSequenceRecords(dict2)));
+ name1, name2, message, name1, ReadUtils.prettyPrintSequenceRecords(dict1), name2, ReadUtils.prettyPrintSequenceRecords(dict2)));
}
}
@@ -284,17 +285,11 @@ public class UserException extends ReviewedStingException {
+ "\nThis is because all distributed GATK resources are sorted in karyotypic order, and your processing will fail when you need to use these files."
+ "\nYou can use the ReorderSam utility to fix this problem: http://www.broadinstitute.org/gsa/wiki/index.php/ReorderSam"
+ "\n %s contigs = %s",
- name, name, prettyPrintSequenceRecords(dict)));
+ name, name, ReadUtils.prettyPrintSequenceRecords(dict)));
}
}
- private static String prettyPrintSequenceRecords(SAMSequenceDictionary sequenceDictionary) {
- String[] sequenceRecordNames = new String[sequenceDictionary.size()];
- int sequenceRecordIndex = 0;
- for (SAMSequenceRecord sequenceRecord : sequenceDictionary.getSequences())
- sequenceRecordNames[sequenceRecordIndex++] = sequenceRecord.getSequenceName();
- return Arrays.deepToString(sequenceRecordNames);
- }
+
public static class MissingWalker extends UserException {
public MissingWalker(String walkerName, String message) {
diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java
index d1e3ce26b..91389f0bf 100755
--- a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java
+++ b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java
@@ -648,4 +648,12 @@ public class ReadUtils {
}
return new Pair>, HashMap>(locusToReadMap, readToLocusMap);
}
+
+ public static String prettyPrintSequenceRecords ( SAMSequenceDictionary sequenceDictionary ) {
+ String[] sequenceRecordNames = new String[sequenceDictionary.size()];
+ int sequenceRecordIndex = 0;
+ for (SAMSequenceRecord sequenceRecord : sequenceDictionary.getSequences())
+ sequenceRecordNames[sequenceRecordIndex++] = sequenceRecord.getSequenceName();
+ return Arrays.deepToString(sequenceRecordNames);
+ }
}
From 1011f3862ba7ac3d7b4cebb4f28e1603c84be6f1 Mon Sep 17 00:00:00 2001
From: Mark DePristo
Date: Thu, 8 Mar 2012 08:57:29 -0500
Subject: [PATCH 21/26] CalibrateGenotypeLikelihoods now emits the position of
the variant for debugging
-- Refactored some duplicated code (FYI, code duplication = root of all evil) into shared functions
-- Added long-missing integration tests
-- CHRIS/RYAN -- it would be very good to add an integration test covering external VCF files as I believe we rely on this functionality and it's not tested at all
---
public/java/test/org/broadinstitute/sting/BaseTest.java | 2 ++
1 file changed, 2 insertions(+)
diff --git a/public/java/test/org/broadinstitute/sting/BaseTest.java b/public/java/test/org/broadinstitute/sting/BaseTest.java
index bc4ce098b..e33f6717a 100755
--- a/public/java/test/org/broadinstitute/sting/BaseTest.java
+++ b/public/java/test/org/broadinstitute/sting/BaseTest.java
@@ -61,6 +61,8 @@ public abstract class BaseTest {
public static final String annotationDataLocation = GATKDataLocation + "Annotations/";
public static final String b37GoodBAM = validationDataLocation + "/CEUTrio.HiSeq.b37.chr20.10_11mb.bam";
+ public static final String b37GoodNA12878BAM = validationDataLocation + "/NA12878.HiSeq.WGS.bwa.cleaned.recal.hg19.20.bam";
+ public static final String b37_NA12878_OMNI = validationDataLocation + "/NA12878.omni.vcf";
public static final String refseqAnnotationLocation = annotationDataLocation + "refseq/";
public static final String hg18Refseq = refseqAnnotationLocation + "refGene-big-table-hg18.txt";
From 3ba2e5667c30190096504ebe6d505c78d9eee3d9 Mon Sep 17 00:00:00 2001
From: Mark DePristo
Date: Thu, 8 Mar 2012 09:43:24 -0500
Subject: [PATCH 22/26] CalibrateGenotypeLikelihoods includes pOfDGivenD now
---
.../java/src/org/broadinstitute/sting/utils/QualityUtils.java | 2 ++
1 file changed, 2 insertions(+)
diff --git a/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java b/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java
index 9722f901b..7756ac71b 100755
--- a/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java
+++ b/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java
@@ -10,6 +10,8 @@ import net.sf.samtools.SAMUtils;
*/
public class QualityUtils {
public final static byte MAX_QUAL_SCORE = SAMUtils.MAX_PHRED_SCORE;
+ public final static double ERROR_RATE_OF_MAX_QUAL_SCORE = qualToErrorProbRaw(MAX_QUAL_SCORE);
+
public final static double MIN_REASONABLE_ERROR = 0.0001;
public final static byte MAX_REASONABLE_Q_SCORE = 40;
public final static byte MIN_USABLE_Q_SCORE = 6;