From 65c594afff5825e9f9386f21daecb61955632d3b Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Thu, 16 Aug 2012 21:27:07 -0400 Subject: [PATCH 01/16] Better error message for reads that begin/end with a deletion in LIBS --- .../sting/gatk/iterators/LocusIteratorByState.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java index 1606c227d..f97069189 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java +++ b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java @@ -159,7 +159,7 @@ public class LocusIteratorByState extends LocusIterator { return stepForwardOnGenome(); } else { if (curElement != null && curElement.getOperator() == CigarOperator.D) - throw new UserException.MalformedBAM(read, "read ends with deletion. Cigar: " + read.getCigarString() + ". This is an indication of a malformed file, but the SAM spec allows reads ending in deletion. If you are sure you want to use this read, re-run your analysis with the extra option: -rf BadCigar"); + throw new UserException.MalformedBAM(read, "read ends with deletion. Cigar: " + read.getCigarString() + ". Although the SAM spec technically permits such reads, this is often indicative of malformed files. If you are sure you want to use this file, re-run your analysis with the extra option: -rf BadCigar"); // Reads that contain indels model the genomeOffset as the following base in the reference. Because // we fall into this else block only when indels end the read, increment genomeOffset such that the @@ -185,7 +185,7 @@ public class LocusIteratorByState extends LocusIterator { break; case D: // deletion w.r.t. the reference if (readOffset < 0) // we don't want reads starting with deletion, this is a malformed cigar string - throw new UserException.MalformedBAM(read, "Read starting with deletion. Cigar: " + read.getCigarString() + ". This is an indication of a malformed file, but the SAM spec allows reads starting in deletion. If you are sure you want to use this read, re-run your analysis with the extra option: -rf BadCigar"); + throw new UserException.MalformedBAM(read, "read starts with deletion. Cigar: " + read.getCigarString() + ". Although the SAM spec technically permits such reads, this is often indicative of malformed files. If you are sure you want to use this file, re-run your analysis with the extra option: -rf BadCigar"); // should be the same as N case genomeOffset++; done = true; From 53383e82ecd9e3de2d4506daee6dc4fae8e81773 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Thu, 16 Aug 2012 21:41:18 -0400 Subject: [PATCH 02/16] Hmm, not good. Fixing the math in PBT resulted in changed MD5s for integration tests that look like significant changes. I am reverting and will report this to Laurent. --- .../sting/gatk/walkers/phasing/PhaseByTransmission.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java index 0dcafb30a..bbd4bf92f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java @@ -645,7 +645,7 @@ public class PhaseByTransmission extends RodWalker, HashMa bestChildGenotype.clear(); bestChildGenotype.add(childGenotype.getKey()); } - else if(MathUtils.compareDoubles(configurationLikelihood, bestConfigurationLikelihood) == 0) { + else if(configurationLikelihood == bestConfigurationLikelihood) { bestFirstParentGenotype.add(firstParentGenotype.getKey()); bestSecondParentGenotype.add(secondParentGenotype.getKey()); bestChildGenotype.add(childGenotype.getKey()); From 67ebd65512d9dfc63bb85378c2833cc501ac4182 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 17 Aug 2012 10:13:40 -0400 Subject: [PATCH 04/16] Bugfix for potential SEGFAULT with JNA getting execution hosts for LSF with multiple hosts --- .../sting/queue/engine/lsf/Lsf706JobRunner.scala | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/lsf/Lsf706JobRunner.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/lsf/Lsf706JobRunner.scala index de996d187..2fbea1497 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/engine/lsf/Lsf706JobRunner.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/engine/lsf/Lsf706JobRunner.scala @@ -35,7 +35,7 @@ import org.broadinstitute.sting.queue.engine.{RunnerStatus, CommandLineJobRunner import java.util.regex.Pattern import java.lang.StringBuffer import java.util.Date -import com.sun.jna.{Structure, StringArray, NativeLong} +import com.sun.jna.{Pointer, Structure, StringArray, NativeLong} import com.sun.jna.ptr.IntByReference /** @@ -295,9 +295,17 @@ object Lsf706JobRunner extends Logging { // the platform LSF startTimes are in seconds, not milliseconds, so convert to the java convention runner.getRunInfo.startTime = new Date(jobInfo.startTime.longValue * 1000) runner.getRunInfo.doneTime = new Date(jobInfo.endTime.longValue * 1000) - val exHostsRaw = jobInfo.exHosts.getStringArray(0) - //logger.warn("exHostsRaw = " + exHostsRaw) - val exHostsList = exHostsRaw.toSeq + + val exHostsList = + if (jobInfo.numExHosts != 1) { + // this is necessary because + val exHostsString = "multipleHosts_" + jobInfo.numExHosts + logger.debug("numExHosts = " + jobInfo.numExHosts + " != 1 for job " + runner.jobId + ", cannot safely get exhosts, setting to " + exHostsString) + List(exHostsString) + } else { + jobInfo.exHosts.getStringArray(0).toSeq + } + //logger.warn("exHostsList = " + exHostsList) val exHosts = exHostsList.reduceLeft(_ + "," + _) //logger.warn("exHosts = " + exHosts) From de3be4580652624d28f85ddd53719052ed9b07d0 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 17 Aug 2012 10:14:19 -0400 Subject: [PATCH 05/16] Proper function call in BCF2Decoder to validateReadBytes --- .../sting/utils/codecs/bcf2/BCF2Decoder.java | 48 ++++++++++++++----- 1 file changed, 37 insertions(+), 11 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Decoder.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Decoder.java index d7f59632c..05ba2aa1f 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Decoder.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Decoder.java @@ -82,7 +82,7 @@ public final class BCF2Decoder { public void skipNextBlock(final int blockSizeInBytes, final InputStream stream) { try { final int bytesRead = (int)stream.skip(blockSizeInBytes); - validateReadBytes(bytesRead, blockSizeInBytes); + validateReadBytes(bytesRead, 1, blockSizeInBytes); } catch ( IOException e ) { throw new UserException.CouldNotReadInputFile("I/O error while reading BCF2 file", e); } @@ -316,17 +316,37 @@ public final class BCF2Decoder { } /** + * Read all bytes for a BCF record block into a byte[], and return it * - * @param inputStream - * @return + * Is smart about reading from the stream multiple times to fill the buffer, if necessary + * + * @param blockSizeInBytes number of bytes to read + * @param inputStream the stream to read from + * @return a non-null byte[] containing exactly blockSizeInBytes bytes from the inputStream */ - private final static byte[] readRecordBytes(final int blockSizeInBytes, final InputStream inputStream) { + @Requires({"blockSizeInBytes >= 0", "inputStream != null"}) + @Ensures("result != null") + private static byte[] readRecordBytes(final int blockSizeInBytes, final InputStream inputStream) { assert blockSizeInBytes >= 0; final byte[] record = new byte[blockSizeInBytes]; try { - final int bytesRead = inputStream.read(record); - validateReadBytes(bytesRead, blockSizeInBytes); + int bytesRead = 0; + int nReadAttempts = 0; // keep track of how many times we've read + + // because we might not read enough bytes from the file in a single go, do it in a loop until we get EOF + while ( bytesRead < blockSizeInBytes ) { + final int read1 = inputStream.read(record, bytesRead, blockSizeInBytes - bytesRead); + if ( read1 == -1 ) + validateReadBytes(bytesRead, nReadAttempts, blockSizeInBytes); + else + bytesRead += read1; + } + + if ( nReadAttempts > 1 ) // TODO -- remove me + logger.warn("Required multiple read attempts to actually get the entire BCF2 block, unexpected behavior"); + + validateReadBytes(bytesRead, nReadAttempts, blockSizeInBytes); } catch ( IOException e ) { throw new UserException.CouldNotReadInputFile("I/O error while reading BCF2 file", e); } @@ -334,14 +354,20 @@ public final class BCF2Decoder { return record; } - private final static void validateReadBytes(final int actuallyRead, final int expected) { + /** + * Make sure we read the right number of bytes, or throw an error + * + * @param actuallyRead + * @param nReadAttempts + * @param expected + */ + private static void validateReadBytes(final int actuallyRead, final int nReadAttempts, final int expected) { assert expected >= 0; if ( actuallyRead < expected ) { - throw new UserException.MalformedBCF2(String.format("Failed to read next complete record: %s", - actuallyRead == -1 ? - "premature end of input stream" : - String.format("expected %d bytes but read only %d", expected, actuallyRead))); + throw new UserException.MalformedBCF2( + String.format("Failed to read next complete record: expected %d bytes but read only %d after %d iterations", + expected, actuallyRead, nReadAttempts)); } } From 4c0f198d485a62544fe7b9115b080a9a5318ab11 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 17 Aug 2012 10:17:12 -0400 Subject: [PATCH 06/16] Potential fix for GSA-484: Incomplete writing of temp BCF when running CombineVariants in parallel -- Keep reading from BCF2 input stream when read(byte[]) returns < number of needed bytes -- It's possible (I think) that the failure in GSA-484 is due to multi-threading writing/reading of BCF2 records where the underlying stream is not yet flushed so read(byte[]) returns a partial result. No loops until we get all of the needed bytes or EOF is encounted --- .../org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java | 1 + 1 file changed, 1 insertion(+) diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java index fc0b3c4a9..60fcb6585 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java @@ -118,6 +118,7 @@ public final class BCF2Codec implements FeatureCodec { final int sitesBlockSize = decoder.readBlockSize(inputStream); final int genotypeBlockSize = decoder.readBlockSize(inputStream); + decoder.readNextBlock(sitesBlockSize, inputStream); decodeSiteLoc(builder); final SitesInfoForDecoding info = decodeSitesExtendedInfo(builder); From a3d2764d11ce75cbc5258054eb7c547e2c0981a2 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 17 Aug 2012 10:35:25 -0400 Subject: [PATCH 07/16] Fixed: GSA-392 @arguments with just a short name get the wrong argument bindings -- Now blows up if an argument begins with -. Implementation isn't pretty, as it actually blows up during Queue extension creation with a somewhat obscure error message but at least its something. --- .../sting/commandline/ArgumentDefinition.java | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/public/java/src/org/broadinstitute/sting/commandline/ArgumentDefinition.java b/public/java/src/org/broadinstitute/sting/commandline/ArgumentDefinition.java index a5647ec0f..618120217 100644 --- a/public/java/src/org/broadinstitute/sting/commandline/ArgumentDefinition.java +++ b/public/java/src/org/broadinstitute/sting/commandline/ArgumentDefinition.java @@ -26,6 +26,7 @@ package org.broadinstitute.sting.commandline; import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import java.lang.annotation.Annotation; import java.util.List; @@ -147,6 +148,9 @@ public class ArgumentDefinition { this.exclusiveOf = exclusiveOf; this.validation = validation; this.validOptions = validOptions; + + validateName(shortName); + validateName(fullName); } /** @@ -192,6 +196,9 @@ public class ArgumentDefinition { else shortName = null; + validateName(shortName); + validateName(fullName); + this.ioType = ioType; this.argumentType = argumentType; this.fullName = fullName; @@ -277,4 +284,14 @@ public class ArgumentDefinition { String validation = (String)CommandLineUtils.getValue(annotation, "validation"); return validation.trim().length() > 0 ? validation.trim() : null; } + + /** + * Make sure the argument's name is valid + * + * @param name + */ + private void validateName(final String name) { + if ( name != null && name.startsWith("-") ) + throw new ReviewedStingException("Invalid argument definition: " + name + " begins with a -"); + } } From be0f8beebbb2cc5acd265c65a03f06a09794c396 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 17 Aug 2012 11:21:49 -0400 Subject: [PATCH 08/16] Fixed GSA-434: GATK should generate error when gzipped FASTA is passed in. -- The GATK sort of handles this now, but only if you have the exactly correct sequence dictionary and FAI files associated with the reference. If you do, the file can be .gz. If not, the GATK will fail on creating the FAI and DICT files. Added an error message that handles this case and clearly says what to do. --- .../reference/ReferenceDataSource.java | 31 ++++++++++--------- .../sting/utils/exceptions/UserException.java | 11 +++++++ 2 files changed, 28 insertions(+), 14 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reference/ReferenceDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reference/ReferenceDataSource.java index b131e36c1..c02ae7d99 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reference/ReferenceDataSource.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reference/ReferenceDataSource.java @@ -62,23 +62,24 @@ public class ReferenceDataSource { * @param fastaFile Fasta file to be used as reference */ public ReferenceDataSource(File fastaFile) { - // does the fasta file exist? check that first... if (!fastaFile.exists()) throw new UserException("The fasta file you specified (" + fastaFile.getAbsolutePath() + ") does not exist."); - File indexFile = new File(fastaFile.getAbsolutePath() + ".fai"); - File dictFile; - if (fastaFile.getAbsolutePath().endsWith("fa")) { - dictFile = new File(fastaFile.getAbsolutePath().replace(".fa", ".dict")); - } - else - dictFile = new File(fastaFile.getAbsolutePath().replace(".fasta", ".dict")); + final boolean isGzipped = fastaFile.getAbsolutePath().endsWith(".gz"); + + final File indexFile = new File(fastaFile.getAbsolutePath() + ".fai"); + + // determine the name for the dict file + final String fastaExt = (fastaFile.getAbsolutePath().endsWith("fa") ? ".fa" : ".fasta" ) + (isGzipped ? ".gz" : ""); + final File dictFile = new File(fastaFile.getAbsolutePath().replace(fastaExt, ".dict")); /* - if index file does not exist, create it manually - */ + * if index file does not exist, create it manually + */ if (!indexFile.exists()) { + if ( isGzipped ) throw new UserException.CouldNotCreateReferenceFAIorDictForGzippedRef(fastaFile); + logger.info(String.format("Index file %s does not exist. Trying to create it now.", indexFile.getAbsolutePath())); FSLockWithShared indexLock = new FSLockWithShared(indexFile,true); try { @@ -95,7 +96,7 @@ public class ReferenceDataSource { } catch(UserException e) { // Rethrow all user exceptions as-is; there should be more details in the UserException itself. - throw e; + throw e; } catch (Exception e) { // If lock creation succeeded, the failure must have been generating the index. @@ -114,6 +115,8 @@ public class ReferenceDataSource { * This has been filed in trac as (PIC-370) Want programmatic interface to CreateSequenceDictionary */ if (!dictFile.exists()) { + if ( isGzipped ) throw new UserException.CouldNotCreateReferenceFAIorDictForGzippedRef(fastaFile); + logger.info(String.format("Dict file %s does not exist. Trying to create it now.", dictFile.getAbsolutePath())); /* @@ -218,9 +221,9 @@ public class ReferenceDataSource { for(int shardStart = 1; shardStart <= refSequenceRecord.getSequenceLength(); shardStart += maxShardSize) { final int shardStop = Math.min(shardStart+maxShardSize-1, refSequenceRecord.getSequenceLength()); shards.add(new LocusShard(parser, - readsDataSource, - Collections.singletonList(parser.createGenomeLoc(refSequenceRecord.getSequenceName(),shardStart,shardStop)), - null)); + readsDataSource, + Collections.singletonList(parser.createGenomeLoc(refSequenceRecord.getSequenceName(),shardStart,shardStop)), + null)); } } return shards; diff --git a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java index bda03f675..3130469e5 100755 --- a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java +++ b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java @@ -340,6 +340,17 @@ public class UserException extends ReviewedStingException { } } + public static class CouldNotCreateReferenceFAIorDictForGzippedRef extends UserException { + public CouldNotCreateReferenceFAIorDictForGzippedRef(final File f) { + super("Although the GATK can process .gz reference sequences, it currently cannot create FAI " + + "or DICT files for them. In order to use the GATK with reference.fasta.gz you will need to " + + "create .dict and .fai files for reference.fasta.gz and name them reference.fasta.gz.fai and " + + "reference.dict. Potentially the easiest way to do this is to uncompress reference.fasta, " + + "run the GATK to create the .dict and .fai files, and copy them to the appropriate location. " + + "Sorry for the inconvenience."); + } + } + public static class CouldNotCreateReferenceIndexFileBecauseOfLock extends UserException.CouldNotCreateReferenceIndexFile { public CouldNotCreateReferenceIndexFileBecauseOfLock(File f) { super(f, "could not be written because an exclusive file lock could not be obtained. " + From daa26cc64e1ae26bc2a38fded3b65f4bf589411b Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 17 Aug 2012 11:41:07 -0400 Subject: [PATCH 09/16] Print to logger not to System.out in CachingIndexFastaSequenceFile when profiling cache performance --- .../sting/utils/fasta/CachingIndexedFastaSequenceFile.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java b/public/java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java index 44b586bcd..48706543a 100644 --- a/public/java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java +++ b/public/java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java @@ -41,6 +41,8 @@ import java.util.Arrays; * Thread-safe! Uses a lock object to protect write and access to the cache. */ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { + protected static final org.apache.log4j.Logger logger = org.apache.log4j.Logger.getLogger(CachingIndexedFastaSequenceFile.class); + /** global enable flag */ private static final boolean USE_CACHE = true; @@ -125,7 +127,7 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { public void printEfficiency() { // comment out to disable tracking if ( (cacheHits + cacheMisses) % PRINT_FREQUENCY == 0 ) { - System.out.printf("### CachingIndexedFastaReader: hits=%d misses=%d efficiency %.6f%%%n", cacheHits, cacheMisses, calcEfficiency()); + logger.info(String.format("### CachingIndexedFastaReader: hits=%d misses=%d efficiency %.6f%%%n", cacheHits, cacheMisses, calcEfficiency())); } } From bf6c0aaa57552f920e8c76f353a5a610f94104c3 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 17 Aug 2012 11:48:05 -0400 Subject: [PATCH 10/16] Fix for missing formatter in R 2.15 -- VariantCallQC now works on newest ESP call set --- .../broadinstitute/sting/utils/R/gsalib/R/gsa.variantqc.utils.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.variantqc.utils.R b/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.variantqc.utils.R index 19567e7e6..45dacd835 100644 --- a/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.variantqc.utils.R +++ b/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.variantqc.utils.R @@ -207,7 +207,7 @@ plotVariantQC <- function(metrics, measures, requestedStrat = "Sample", if ( requestedStrat == "Sample" ) { perSampleGraph <- perSampleGraph + geom_text(aes(label=strat), size=1.5) + geom_blank() # don't display a scale - perSampleGraph <- perSampleGraph + scale_x_discrete("Sample (ordered by nSNPs)", formatter=function(x) "") + perSampleGraph <- perSampleGraph + scale_x_discrete("Sample (ordered by nSNPs)") } else { # by AlleleCount perSampleGraph <- perSampleGraph + geom_point(aes(size=log10(nobs))) #+ geom_smooth(aes(weight=log10(nobs))) perSampleGraph <- perSampleGraph + scale_x_log10("AlleleCount") From 2676b7fc2e7d8ced029830bc49f2d6bcdedbe6da Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Fri, 17 Aug 2012 11:49:53 -0400 Subject: [PATCH 12/16] Put in a sanity check that MLEAC <= AN --- .../walkers/genotyper/UnifiedGenotyperEngine.java | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index 05e12b43d..67ade390f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -38,6 +38,7 @@ import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.classloader.PluginManager; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; @@ -442,10 +443,17 @@ public class UnifiedGenotyperEngine { // add the MLE AC and AF annotations if ( alleleCountsofMLE.size() > 0 ) { attributes.put(VCFConstants.MLE_ALLELE_COUNT_KEY, alleleCountsofMLE); - final double AN = (double)builder.make().getCalledChrCount(); + final int AN = builder.make().getCalledChrCount(); + + // let's sanity check that we don't have an invalid MLE value in there + for ( int MLEAC : alleleCountsofMLE ) { + if ( MLEAC > AN ) + throw new ReviewedStingException(String.format("MLEAC value (%d) is larger than AN (%d) at position %s:%d", MLEAC, AN, loc.getContig(), loc.getStart())); + } + final ArrayList MLEfrequencies = new ArrayList(alleleCountsofMLE.size()); for ( int AC : alleleCountsofMLE ) - MLEfrequencies.add((double)AC / AN); + MLEfrequencies.add((double)AC / (double)AN); attributes.put(VCFConstants.MLE_ALLELE_FREQUENCY_KEY, MLEfrequencies); } From 980685af16160fee6677f86021e090d71b162774 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 17 Aug 2012 14:55:17 -0400 Subject: [PATCH 13/16] Fix GSA-137: Having both DataSource.REFERENCE and DataSource.REFERENCE_BASES is confusing to end users. -- Removed REFERENCE_BASES option. You only have REFERENCE now. There's no efficiency savings for the REFERENCE_BASES option any longer, since the reference bases are loaded lazy so if you don't use them there's effectively no cost to making the RefContext that could load them. --- .../sting/gatk/traversals/TraverseReads.java | 4 +--- .../sting/gatk/walkers/ActiveRegionWalker.java | 2 +- .../sting/gatk/walkers/DataSource.java | 12 +++++++++++- .../sting/gatk/walkers/LocusWalker.java | 2 +- .../sting/gatk/walkers/ReadWalker.java | 2 +- .../broadinstitute/sting/gatk/walkers/RefWalker.java | 2 +- .../sting/gatk/walkers/bqsr/BaseRecalibrator.java | 2 +- 7 files changed, 17 insertions(+), 9 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReads.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReads.java index cb094d29b..d29e9a5f2 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReads.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReads.java @@ -75,8 +75,6 @@ public class TraverseReads extends TraversalEngine,Read if( !dataProvider.hasReads() ) throw new IllegalArgumentException("Unable to traverse reads; no read data is available."); - boolean needsReferenceBasesP = WalkerManager.isRequired(walker, DataSource.REFERENCE_BASES); - ReadView reads = new ReadView(dataProvider); ReadReferenceView reference = new ReadReferenceView(dataProvider); @@ -91,7 +89,7 @@ public class TraverseReads extends TraversalEngine,Read ReferenceContext refContext = null; // get the array of characters for the reference sequence, since we're a mapped read - if (needsReferenceBasesP && !read.getReadUnmappedFlag() && dataProvider.hasReference()) + if (!read.getReadUnmappedFlag() && dataProvider.hasReference()) refContext = reference.getReferenceContext(read); // update the number of reads we've seen diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java index aba508b3e..cbe791353 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java @@ -28,7 +28,7 @@ import java.util.List; */ @By(DataSource.READS) -@Requires({DataSource.READS, DataSource.REFERENCE_BASES}) +@Requires({DataSource.READS, DataSource.REFERENCE}) @PartitionBy(PartitionType.READ) @ActiveRegionExtension(extension=50,maxRegion=1500) @ReadFilters({UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class, MappingQualityUnavailableFilter.class}) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/DataSource.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/DataSource.java index a152ab137..1f93c67a6 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/DataSource.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/DataSource.java @@ -16,8 +16,18 @@ package org.broadinstitute.sting.gatk.walkers; * Allow user to choose between a number of different data sources. */ public enum DataSource { + /** + * Does this walker require read (BAM) data to work? + */ READS, + + /** + * Does this walker require reference data to work? + */ REFERENCE, - REFERENCE_BASES, // Do I actually need the reference bases passed to the walker? + + /** + * Does this walker require reference order data (VCF) to work? + */ REFERENCE_ORDERED_DATA } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/LocusWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/LocusWalker.java index 2a92d8831..3b18dda44 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/LocusWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/LocusWalker.java @@ -16,7 +16,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; * To change this template use File | Settings | File Templates. */ @By(DataSource.READS) -@Requires({DataSource.READS,DataSource.REFERENCE, DataSource.REFERENCE_BASES}) +@Requires({DataSource.READS,DataSource.REFERENCE}) @PartitionBy(PartitionType.LOCUS) @ReadFilters({UnmappedReadFilter.class,NotPrimaryAlignmentFilter.class,DuplicateReadFilter.class,FailsVendorQualityCheckFilter.class}) @RemoveProgramRecords diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/ReadWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/ReadWalker.java index 8933bd73e..77e3af93f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/ReadWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/ReadWalker.java @@ -12,7 +12,7 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord; * Time: 2:52:28 PM * To change this template use File | Settings | File Templates. */ -@Requires({DataSource.READS, DataSource.REFERENCE_BASES}) +@Requires({DataSource.READS, DataSource.REFERENCE}) @PartitionBy(PartitionType.READ) public abstract class ReadWalker extends Walker { public boolean requiresOrderedReads() { return false; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/RefWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/RefWalker.java index 1d3debb48..45bd14d4e 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/RefWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/RefWalker.java @@ -8,7 +8,7 @@ package org.broadinstitute.sting.gatk.walkers; * To change this template use File | Settings | File Templates. */ @By(DataSource.REFERENCE) -@Requires({DataSource.REFERENCE, DataSource.REFERENCE_BASES}) +@Requires({DataSource.REFERENCE}) @Allows(DataSource.REFERENCE) public abstract class RefWalker extends LocusWalker { } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java index 3f35cf8e8..e45cad971 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java @@ -107,7 +107,7 @@ import java.util.ArrayList; @BAQMode(ApplicationTime = BAQ.ApplicationTime.FORBIDDEN) @By(DataSource.READS) @ReadFilters({MappingQualityZeroFilter.class, MappingQualityUnavailableFilter.class}) // only look at covered loci, not every loci of the reference file -@Requires({DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES}) // filter out all reads with zero or unavailable mapping quality +@Requires({DataSource.READS, DataSource.REFERENCE}) // filter out all reads with zero or unavailable mapping quality @PartitionBy(PartitionType.LOCUS) // this walker requires both -I input.bam and -R reference.fasta public class BaseRecalibrator extends LocusWalker implements TreeReducible { @ArgumentCollection From d16cb68539af7011dfafe55f3eec277563b5ac9d Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Fri, 17 Aug 2012 14:20:15 -0400 Subject: [PATCH 14/16] Updated and more thorough version of the BadCigar read filter * No reads with Hard/Soft clips in the middle of the cigar * No reads starting with deletions (with or without preceding clips) * No reads ending in deletions (with or without follow-up clips) * No reads that are fully hard or soft clipped * No reads that have consecutive indels in the cigar (II, DD, ID or DI) Also added systematic test for good cigars and iterative test for bad cigars. --- .../sting/gatk/filters/BadCigarFilter.java | 94 +++++++++++---- .../gatk/filters/BadCigarFilterUnitTest.java | 74 ++++++------ .../utils/clipping/ReadClipperTestUtils.java | 109 +++++++++++++++--- 3 files changed, 210 insertions(+), 67 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/BadCigarFilter.java b/public/java/src/org/broadinstitute/sting/gatk/filters/BadCigarFilter.java index 9a1455859..cda7392ae 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/filters/BadCigarFilter.java +++ b/public/java/src/org/broadinstitute/sting/gatk/filters/BadCigarFilter.java @@ -29,9 +29,19 @@ import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import net.sf.samtools.SAMRecord; +import java.util.Iterator; + /** * Filter out reads with wonky cigar strings. * + * - No reads with Hard/Soft clips in the middle of the cigar + * - No reads starting with deletions (with or without preceding clips) + * - No reads ending in deletions (with or without follow-up clips) + * - No reads that are fully hard or soft clipped + * - No reads that have consecutive indels in the cigar (II, DD, ID or DI) + * + * ps: apparently an empty cigar is okay... + * * @author ebanks * @version 0.1 */ @@ -40,28 +50,72 @@ public class BadCigarFilter extends ReadFilter { public boolean filterOut(final SAMRecord rec) { final Cigar c = rec.getCigar(); - if( c.isEmpty() ) { return false; } // if there is no Cigar then it can't be bad - boolean previousElementWasIndel = false; - CigarOperator lastOp = c.getCigarElement(0).getOperator(); - - if (lastOp == CigarOperator.D) // filter out reads starting with deletion - return true; - - for (CigarElement ce : c.getCigarElements()) { - CigarOperator op = ce.getOperator(); - if (op == CigarOperator.D || op == CigarOperator.I) { - if (previousElementWasIndel) - return true; // filter out reads with adjacent I/D - - previousElementWasIndel = true; - } - else // this is a regular base (match/mismatch/hard or soft clip) - previousElementWasIndel = false; // reset the previous element - - lastOp = op; + // if there is no Cigar then it can't be bad + if( c.isEmpty() ) { + return false; } - return lastOp == CigarOperator.D; + Iterator elementIterator = c.getCigarElements().iterator(); + + CigarOperator firstOp = CigarOperator.H; + while (elementIterator.hasNext() && (firstOp == CigarOperator.H || firstOp == CigarOperator.S)) { + CigarOperator op = elementIterator.next().getOperator(); + + // No reads with Hard/Soft clips in the middle of the cigar + if (firstOp != CigarOperator.H && op == CigarOperator.H) { + return true; + } + firstOp = op; + } + + // No reads starting with deletions (with or without preceding clips) + if (firstOp == CigarOperator.D) { + return true; + } + + boolean hasMeaningfulElements = (firstOp != CigarOperator.H && firstOp != CigarOperator.S); + boolean previousElementWasIndel = firstOp == CigarOperator.I; + CigarOperator lastOp = firstOp; + CigarOperator previousOp = firstOp; + + while (elementIterator.hasNext()) { + CigarOperator op = elementIterator.next().getOperator(); + + if (op != CigarOperator.S && op != CigarOperator.H) { + + // No reads with Hard/Soft clips in the middle of the cigar + if (previousOp == CigarOperator.S || previousOp == CigarOperator.H) + return true; + + lastOp = op; + + if (!hasMeaningfulElements && op.consumesReadBases()) { + hasMeaningfulElements = true; + } + + if (op == CigarOperator.I || op == CigarOperator.D) { + + // No reads that have consecutive indels in the cigar (II, DD, ID or DI) + if (previousElementWasIndel) { + return true; + } + previousElementWasIndel = true; + } + else { + previousElementWasIndel = false; + } + } + // No reads with Hard/Soft clips in the middle of the cigar + else if (op == CigarOperator.S && previousOp == CigarOperator.H) { + return true; + } + + previousOp = op; + } + + // No reads ending in deletions (with or without follow-up clips) + // No reads that are fully hard or soft clipped + return lastOp == CigarOperator.D || !hasMeaningfulElements; } } \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/gatk/filters/BadCigarFilterUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/filters/BadCigarFilterUnitTest.java index 333d35641..ff918db68 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/filters/BadCigarFilterUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/filters/BadCigarFilterUnitTest.java @@ -1,11 +1,14 @@ package org.broadinstitute.sting.gatk.filters; -import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import net.sf.samtools.Cigar; +import org.broadinstitute.sting.utils.clipping.ReadClipperTestUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.testng.Assert; import org.testng.annotations.BeforeClass; import org.testng.annotations.Test; +import java.util.List; + /** * Checks that the Bad Cigar filter works for all kinds of wonky cigars * @@ -14,6 +17,29 @@ import org.testng.annotations.Test; */ public class BadCigarFilterUnitTest { + public static final String[] BAD_CIGAR_LIST = { + "2D4M", // starting with multiple deletions + "4M2D", // ending with multiple deletions + "3M1I1D", // adjacent indels AND ends in deletion + "1M1I1D2M", // adjacent indels I->D + "1M1D2I1M", // adjacent indels D->I + "1M1I2M1D", // ends in single deletion with insertion in the middle + "4M1D", // ends in single deletion + "1D4M", // starts with single deletion + "2M1D1D2M", // adjacent D's + "1M1I1I1M", // adjacent I's + "1H1D4M", // starting with deletion after H + "1S1D3M", // starting with deletion after S + "1H1S1D3M", // starting with deletion after HS + "4M1D1H", // ending with deletion before H + "3M1D1S", // ending with deletion before S + "3M1D1S1H", // ending with deletion before HS + "10M2H10M", // H in the middle + "10M2S10M", // S in the middle + "1H1S10M2S10M1S1H", // deceiving S in the middle + "1H1S10M2H10M1S1H" // deceiving H in the middle + }; + BadCigarFilter filter; @BeforeClass @@ -21,40 +47,20 @@ public class BadCigarFilterUnitTest { filter = new BadCigarFilter(); } - @Test + @Test(enabled = true) public void testWonkyCigars () { - byte[] bases = {'A', 'A', 'A', 'A'}; - byte[] quals = {30, 30, 30, 30}; - GATKSAMRecord read; - // starting with multiple deletions - read = ArtificialSAMUtils.createArtificialRead(bases, quals, "2D4M"); - Assert.assertTrue(filter.filterOut(read), read.getCigarString()); + for (String cigarString : BAD_CIGAR_LIST) { + GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigarString); + Assert.assertTrue(filter.filterOut(read), read.getCigarString()); + } + } - read = ArtificialSAMUtils.createArtificialRead(bases, quals, "4M2D"); // ending with multiple deletions - Assert.assertTrue(filter.filterOut(read), read.getCigarString()); - - read = ArtificialSAMUtils.createArtificialRead(bases, quals, "3M1I1D"); // adjacent indels AND ends in deletion - Assert.assertTrue(filter.filterOut(read), read.getCigarString()); - - read = ArtificialSAMUtils.createArtificialRead(bases, quals, "1M1I1D2M"); // adjacent indels I->D - Assert.assertTrue(filter.filterOut(read), read.getCigarString()); - - read = ArtificialSAMUtils.createArtificialRead(bases, quals, "1M1D2I1M"); // adjacent indels D->I - Assert.assertTrue(filter.filterOut(read), read.getCigarString()); - - read = ArtificialSAMUtils.createArtificialRead(bases, quals, "1M1I2M1D"); // ends in single deletion with insertion in the middle - Assert.assertTrue(filter.filterOut(read), read.getCigarString()); - - read = ArtificialSAMUtils.createArtificialRead(bases, quals, "4M1D"); // ends in single deletion - Assert.assertTrue(filter.filterOut(read), read.getCigarString()); - - read = ArtificialSAMUtils.createArtificialRead(bases, quals, "1D4M"); // starts with single deletion - Assert.assertTrue(filter.filterOut(read), read.getCigarString()); - - read = ArtificialSAMUtils.createArtificialRead(bases, quals, "2M1D1D2M"); // adjacent D's - Assert.assertTrue(filter.filterOut(read), read.getCigarString()); - - read = ArtificialSAMUtils.createArtificialRead(bases, quals, "1M1I1I1M"); // adjacent I's - Assert.assertTrue(filter.filterOut(read), read.getCigarString()); + @Test(enabled = true) + public void testGoodCigars() { + List cigarList = ReadClipperTestUtils.generateCigarList(10); + for (Cigar cigar : cigarList) { + GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar); + Assert.assertFalse(filter.filterOut(read), read.getCigarString()); + } } } diff --git a/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperTestUtils.java b/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperTestUtils.java index baa2f6218..208c14fbd 100644 --- a/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperTestUtils.java +++ b/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperTestUtils.java @@ -4,6 +4,7 @@ import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.testng.Assert; @@ -37,17 +38,22 @@ public class ReadClipperTestUtils { return ArtificialSAMUtils.createArtificialRead(Utils.arrayFromArrayWithLength(BASES, cigar.getReadLength()), Utils.arrayFromArrayWithLength(QUALS, cigar.getReadLength()), cigar.toString()); } - /** - * This function generates every valid permutation of cigar strings with a given length. - * - * A valid cigar object obeys the following rules: - * - No Hard/Soft clips in the middle of the read - * - No deletions in the beginning / end of the read - * - No repeated adjacent element (e.g. 1M2M -> this should be 3M) - * - * @param maximumLength the maximum number of elements in the cigar - * @return a list with all valid Cigar objects - */ + public static GATKSAMRecord makeReadFromCigar(String cigarString) { + return makeReadFromCigar(cigarFromString(cigarString)); + } + + /** + * This function generates every valid permutation of cigar strings with a given length. + * + * A valid cigar object obeys the following rules: + * - No Hard/Soft clips in the middle of the read + * - No deletions in the beginning / end of the read + * - No repeated adjacent element (e.g. 1M2M -> this should be 3M) + * - No consecutive I/D elements + * + * @param maximumLength the maximum number of elements in the cigar + * @return a list with all valid Cigar objects + */ public static List generateCigarList(int maximumLength) { int numCigarElements = cigarElements.length; LinkedList cigarList = new LinkedList(); @@ -137,7 +143,10 @@ public class ReadClipperTestUtils { CigarElement lastElement = null; int lastElementLength = 0; for (CigarElement cigarElement : rawCigar.getCigarElements()) { - if (lastElement != null && lastElement.getOperator() == cigarElement.getOperator()) + if (lastElement != null && + ((lastElement.getOperator() == cigarElement.getOperator()) || + (lastElement.getOperator() == CigarOperator.I && cigarElement.getOperator() == CigarOperator.D) || + (lastElement.getOperator() == CigarOperator.D && cigarElement.getOperator() == CigarOperator.I))) lastElementLength += cigarElement.getLength(); else { @@ -191,7 +200,7 @@ public class ReadClipperTestUtils { /** * Checks whether or not the read has any cigar element that is not H or S * - * @param read + * @param read the read * @return true if it has any M, I or D, false otherwise */ public static boolean readHasNonClippedBases(GATKSAMRecord read) { @@ -201,5 +210,79 @@ public class ReadClipperTestUtils { return false; } + public static Cigar cigarFromString(String cigarString) { + Cigar cigar = new Cigar(); + + boolean isNumber = false; + int number = 0; + for (int i = 0; i < cigarString.length(); i++) { + char x = cigarString.charAt(i); + + if (x >= '0' && x <='9') { + if (isNumber) { + number *= 10; + } + else { + isNumber = true; + } + number += x - '0'; + } + + else { + CigarElement e; + switch (x) { + case 'M': + case 'm': + e = new CigarElement(number, CigarOperator.M); + break; + + case 'I': + case 'i': + e = new CigarElement(number, CigarOperator.I); + break; + + case 'D': + case 'd': + e = new CigarElement(number, CigarOperator.D); + break; + + case 'S': + case 's': + e = new CigarElement(number, CigarOperator.S); + break; + + case 'N': + case 'n': + e = new CigarElement(number, CigarOperator.N); + break; + + case 'H': + case 'h': + e = new CigarElement(number, CigarOperator.H); + break; + + case 'P': + case 'p': + e = new CigarElement(number, CigarOperator.P); + break; + + case '=': + e = new CigarElement(number, CigarOperator.EQ); + break; + + case 'X': + case 'x': + e = new CigarElement(number, CigarOperator.X); + break; + + default: + throw new ReviewedStingException("Unrecognized cigar operator: " + x + " (number: " + number + ")"); + } + cigar.add(e); + } + } + return cigar; + } + }